/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>

#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"

#if CONFIG_CFL
#include "av1/common/cfl.h"
#endif
#include "av1/common/common.h"
#include "av1/common/common_data.h"
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
#include "av1/common/idct.h"
#include "av1/common/mvref_common.h"
#include "av1/common/obmc.h"
#include "av1/common/pred_common.h"
#include "av1/common/quant_common.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
#include "av1/common/scan.h"
#include "av1/common/seg_common.h"
#if CONFIG_LV_MAP
#include "av1/common/txb_common.h"
#endif
#if CONFIG_WARPED_MOTION
#include "av1/common/warped_motion.h"
#endif  // CONFIG_WARPED_MOTION

#include "av1/encoder/aq_variance.h"
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/cost.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/encoder.h"
#if CONFIG_LV_MAP
#include "av1/encoder/encodetxb.h"
#endif
#include "av1/encoder/hybrid_fwd_txfm.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/palette.h"
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
#include "av1/encoder/tokenize.h"
#if CONFIG_PVQ
#include "av1/encoder/pvq_encoder.h"
#include "av1/common/pvq.h"
#endif  // CONFIG_PVQ
#if CONFIG_DUAL_FILTER
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
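// Each entry pairs two switchable interpolation filter indices (one per
// filtering direction), enumerating every dual-filter combination.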
#if USE_EXTRA_FILTER
static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
  { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 },
  { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
  { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 },
};
#else   // USE_EXTRA_FILTER
static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
  { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 },
  { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 },
};
#endif  // USE_EXTRA_FILTER
#endif  // CONFIG_DUAL_FILTER

#if CONFIG_EXT_REFS

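// Each *_MODE_MASK covers every reference frame (plus intra) except the one
// it is named after, so the mode search can mask out all other references in
// a single operation.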
#define LAST_FRAME_MODE_MASK                                          \
  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |     \
   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
   (1 << ALTREF_FRAME))
#define LAST2_FRAME_MODE_MASK                                         \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) |      \
   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
   (1 << ALTREF_FRAME))
#define LAST3_FRAME_MODE_MASK                                         \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
   (1 << ALTREF_FRAME))
#define GOLDEN_FRAME_MODE_MASK                                       \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
   (1 << ALTREF_FRAME))
#define BWDREF_FRAME_MODE_MASK                                       \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) | \
   (1 << ALTREF_FRAME))
#define ALTREF2_FRAME_MODE_MASK                                     \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
   (1 << ALTREF_FRAME))
#define ALTREF_FRAME_MODE_MASK                                      \
  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
   (1 << ALTREF2_FRAME))

#else  // !CONFIG_EXT_REFS

#define LAST_FRAME_MODE_MASK \
  ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
#define GOLDEN_FRAME_MODE_MASK \
  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
#define ALTREF_FRAME_MODE_MASK \
  ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))

#endif  // CONFIG_EXT_REFS

#if CONFIG_EXT_REFS
#if CONFIG_EXT_COMP_REFS
#define SECOND_REF_FRAME_MASK                                         \
  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
   (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
#else  // !CONFIG_EXT_COMP_REFS
#define SECOND_REF_FRAME_MASK \
  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01)
#endif  // CONFIG_EXT_COMP_REFS
#else   // !CONFIG_EXT_REFS
#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
#endif  // CONFIG_EXT_REFS

#define MIN_EARLY_TERM_INDEX 3
#define NEW_MV_DISCOUNT_FACTOR 8

#if CONFIG_EXT_INTRA
#define ANGLE_SKIP_THRESH 10
#define FILTER_FAST_SEARCH 1
#endif  // CONFIG_EXT_INTRA

// Setting this to 1 will disable trellis optimization within the
// transform search. Trellis optimization will still be applied
// in the final encode.
#ifndef DISABLE_TRELLISQ_SEARCH
#define DISABLE_TRELLISQ_SEARCH 0
#endif

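// Linear SVM used by adst_vs_flipadst() below: three feature weights plus a
// bias for the vertical energy distribution, then the same four values for
// the horizontal one.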
static const double ADST_FLIP_SVM[8] = {
  /* vertical */
  -6.6623, -2.8062, -3.2531, 3.1671,
  /* horizontal */
  -7.7051, -3.2234, -3.6193, 3.4533
};

typedef struct {
  PREDICTION_MODE mode;
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;

typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;

struct rdcost_block_args {
  const AV1_COMP *cpi;
  MACROBLOCK *x;
  ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
  ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
  RD_STATS rd_stats;
  int64_t this_rd;
  int64_t best_rd;
  int exit_early;
  int use_fast_coef_costing;
};

#define LAST_NEW_MV_INDEX 6
static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },

  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },

  { NEWMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },

  { NEARMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },

  { ZEROMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
  { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
  { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
  { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
  { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },

// TODO(zoeliu): May need to reconsider the order in which the modes are
// checked.

#if CONFIG_COMPOUND_SINGLEREF
  // Single ref comp mode
  { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } },

  /*
  { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
  { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/

  { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
  { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } },

  { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
  { SR_ZERO_NEWMV, { ALTREF_FRAME, NONE_FRAME } },

  { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } },
#if CONFIG_EXT_REFS
  { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_EXT_REFS
  { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
  { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } },
#endif  // CONFIG_COMPOUND_SINGLEREF

  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
#if CONFIG_EXT_REFS
  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
#endif  // CONFIG_EXT_REFS
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
#if CONFIG_EXT_REFS
  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },

#if CONFIG_EXT_COMP_REFS
  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
#endif  // CONFIG_EXT_COMP_REFS
#endif  // CONFIG_EXT_REFS

  { TM_PRED, { INTRA_FRAME, NONE_FRAME } },

  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
#if CONFIG_SMOOTH_HV
  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
#endif  // CONFIG_SMOOTH_HV

  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },

#if CONFIG_EXT_REFS
  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
#endif  // CONFIG_EXT_REFS

  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },

#if CONFIG_EXT_REFS
  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } },

#if CONFIG_EXT_COMP_REFS
  { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } },

  { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } },
#endif  // CONFIG_EXT_COMP_REFS
#endif  // CONFIG_EXT_REFS

  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D207_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D153_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D63_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },

  { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
  { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
  { NEARMV, { LAST_FRAME, INTRA_FRAME } },
  { NEWMV, { LAST_FRAME, INTRA_FRAME } },

#if CONFIG_EXT_REFS
  { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
  { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
  { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
  { NEWMV, { LAST2_FRAME, INTRA_FRAME } },

  { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
  { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
  { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
  { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
#endif  // CONFIG_EXT_REFS

  { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
  { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
  { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
  { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },

#if CONFIG_EXT_REFS
  { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
  { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
  { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
  { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },

  { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } },
  { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } },
  { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } },
  { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } },
#endif  // CONFIG_EXT_REFS

  { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
  { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
  { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
  { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
};

static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, TM_PRED,
#if CONFIG_SMOOTH_HV
  SMOOTH_V_PRED, SMOOTH_H_PRED,
#endif  // CONFIG_SMOOTH_HV
  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED,    D117_PRED, D45_PRED,
};

#if CONFIG_CFL
static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
  UV_DC_PRED,       UV_CFL_PRED,      UV_H_PRED,
  UV_V_PRED,        UV_SMOOTH_PRED,   UV_TM_PRED,
#if CONFIG_SMOOTH_HV
  UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
#endif  // CONFIG_SMOOTH_HV
  UV_D135_PRED,     UV_D207_PRED,     UV_D153_PRED,
  UV_D63_PRED,      UV_D117_PRED,     UV_D45_PRED,
};
#else
#define uv_rd_search_mode_order intra_rd_search_mode_order
#endif  // CONFIG_CFL

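// Cost of a truncated binary (uniform) code over n symbols, in av1 cost
// units (av1_cost_bit(128, 0) per bit). For example, with n = 10 we get
// l = 4 and m = 6, so values 0..5 cost 3 bits and values 6..9 cost 4 bits.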
static INLINE int write_uniform_cost(int n, int v) {
  const int l = get_unsigned_bits(n);
  const int m = (1 << l) - n;
  if (l == 0) return 0;
  if (v < m)
    return (l - 1) * av1_cost_bit(128, 0);
  else
    return l * av1_cost_bit(128, 0);
}

// constants for prune 1 and prune 2 decision boundaries
#define FAST_EXT_TX_CORR_MID 0.0
#define FAST_EXT_TX_EDST_MID 0.1
#define FAST_EXT_TX_CORR_MARGIN 0.5
#define FAST_EXT_TX_EDST_MARGIN 0.3

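// Compute the SSE between src and dst over only the visible (inside-frame)
// part of the transform block, falling back to the odd-size SSE helpers
// when the visible region is smaller than the block.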
static unsigned pixel_dist_visible_only(
    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
    const int src_stride, const uint8_t *dst, const int dst_stride,
    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
    int visible_cols) {
  unsigned sse;

  if (txb_rows == visible_rows && txb_cols == visible_cols
#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
      && tx_bsize < BLOCK_SIZES
#endif
      ) {
    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
    return sse;
  }
#if CONFIG_HIGHBITDEPTH
  const MACROBLOCKD *xd = &x->e_mbd;

  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
                                             visible_cols, visible_rows);
    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
  }
#else
  (void)x;
#endif  // CONFIG_HIGHBITDEPTH
  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
                         visible_rows);
  return sse;
}

#if CONFIG_DIST_8X8
static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int coeff_shift) {
  uint64_t svar = 0;
  uint64_t dvar = 0;
  uint64_t sum_s = 0;
  uint64_t sum_d = 0;
  uint64_t sum_s2 = 0;
  uint64_t sum_d2 = 0;
  uint64_t sum_sd = 0;
  uint64_t dist = 0;

  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      sum_s += src[i * sstride + j];
      sum_d += dst[i * dstride + j];
      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
    }
  }
  /* Compute the variance -- the calculation cannot go negative. */
  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
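  /* The block has 64 pixels, so (sum * sum + 32) >> 6 is the rounded squared
     mean times the pixel count; svar and dvar are therefore 64 times the
     per-pixel variance. */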

  // Tuning of jm's original dering distortion metric used in CDEF tool,
  // suggested by jm
  const uint64_t a = 4;
  const uint64_t b = 2;
  const uint64_t c1 = (400 * a << 2 * coeff_shift);
  const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);

  dist =
      (uint64_t)floor(.5 +
                      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) /
                          (sqrt(svar * (double)dvar + c2)));

  // Calibrate dist to have similar rate for the same QP with MSE only
  // distortion (as in master branch)
  dist = (uint64_t)((float)dist * 0.75);

  return dist;
}

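// Returns the per-pixel variance of a 4x4 block: the sum of squares minus
// the squared sum / 16, divided by the 16 pixels.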
static int od_compute_var_4x4(uint16_t *x, int stride) {
  int sum;
  int s2;
  int i;
  sum = 0;
  s2 = 0;
  for (i = 0; i < 4; i++) {
    int j;
    for (j = 0; j < 4; j++) {
      int t;

      t = x[i * stride + j];
      sum += t;
      s2 += t * t;
    }
  }

  return (s2 - (sum * sum >> 4)) >> 4;
}

/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
   the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
   is applied both horizontally and vertically. For X=5, the filter is
   a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
#define OD_DIST_LP_MID (5)
#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)

static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
                                  uint16_t *y, od_coeff *e_lp, int stride) {
  double sum;
  int min_var;
  double mean_var;
  double var_stat;
  double activity;
  double calibration;
  int i;
  int j;
  double vardist;

  vardist = 0;

#if 1
  min_var = INT_MAX;
  mean_var = 0;
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++) {
      int varx;
      int vary;
      varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
      vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
      min_var = OD_MINI(min_var, varx);
      mean_var += 1. / (1 + varx);
      /* The cast to (double) is to avoid an overflow before the sqrt.*/
      vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
    }
  }
  /* We use a different variance statistic depending on whether activity
     masking is used, since the harmonic mean appeared slightly worse with
     masking off. The calibration constant just ensures that we preserve the
     rate compared to activity=1. */
  if (use_activity_masking) {
    calibration = 1.95;
    var_stat = 9. / mean_var;
  } else {
    calibration = 1.62;
    var_stat = min_var;
  }
  /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
     activity masking constant. */
  activity = calibration * pow(.25 + var_stat, -1. / 6);
#else
  activity = 1;
#endif  // 1
  sum = 0;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
  }
  /* Normalize the filter to unit DC response. */
  sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
               OD_DIST_LP_NORM);
  return activity * activity * (sum + vardist);
}

// Note: inputs x and y are in the pixel domain.
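// The tmp buffer holds the horizontally low-pass filtered error; this routine
// applies the vertical pass of the [1 X 1]/(X + 2) filter into e_lp and then
// accumulates the 8x8 distortions.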
static double od_compute_dist_common(int activity_masking, uint16_t *x,
                                     uint16_t *y, int bsize_w, int bsize_h,
                                     int qindex, od_coeff *tmp,
                                     od_coeff *e_lp) {
  int i, j;
  double sum = 0;
  const int mid = OD_DIST_LP_MID;

  for (j = 0; j < bsize_w; j++) {
    e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
    e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] +
                                        2 * tmp[(bsize_h - 2) * bsize_w + j];
  }
  for (i = 1; i < bsize_h - 1; i++) {
    for (j = 0; j < bsize_w; j++) {
      e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
                              tmp[(i - 1) * bsize_w + j] +
                              tmp[(i + 1) * bsize_w + j];
    }
  }
  for (i = 0; i < bsize_h; i += 8) {
    for (j = 0; j < bsize_w; j += 8) {
      sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j],
                                 &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
                                 bsize_w);
    }
  }
  /* Scale according to linear regression against SSE, for 8x8 blocks. */
  if (activity_masking) {
    sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
           (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
  } else {
    sum *= qindex >= 128
               ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
               : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
                              : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
  }

  return sum;
}

static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
                              int bsize_h, int qindex) {
  assert(bsize_w >= 8 && bsize_h >= 8);
#if CONFIG_PVQ
  int activity_masking = 1;
#else
  int activity_masking = 0;
#endif
  int i, j;
  DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
  for (i = 0; i < bsize_h; i++) {
    for (j = 0; j < bsize_w; j++) {
      e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
    }
  }
  int mid = OD_DIST_LP_MID;
  for (i = 0; i < bsize_h; i++) {
    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
    tmp[i * bsize_w + bsize_w - 1] =
        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
    for (j = 1; j < bsize_w - 1; j++) {
      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
                             e[i * bsize_w + j + 1];
    }
  }
  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
                                qindex, tmp, e_lp);
}

static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
                                   int bsize_h, int qindex) {
  assert(bsize_w >= 8 && bsize_h >= 8);
#if CONFIG_PVQ
  int activity_masking = 1;
#else
  int activity_masking = 0;
#endif
  DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
  int i, j;
  for (i = 0; i < bsize_h; i++) {
    for (j = 0; j < bsize_w; j++) {
      y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
    }
  }
  int mid = OD_DIST_LP_MID;
  for (i = 0; i < bsize_h; i++) {
    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
    tmp[i * bsize_w + bsize_w - 1] =
        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
    for (j = 1; j < bsize_w - 1; j++) {
      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
                             e[i * bsize_w + j + 1];
    }
  }
  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
                                qindex, tmp, e_lp);
}

int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                     const uint8_t *src, int src_stride, const uint8_t *dst,
                     int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                     int bsh, int visible_w, int visible_h, int qindex) {
  int64_t d = 0;
  int i, j;
  const MACROBLOCKD *xd = &x->e_mbd;

  DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]);

  assert(bsw >= 8);
  assert(bsh >= 8);
  assert((bsw & 0x07) == 0);
  assert((bsh & 0x07) == 0);

  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
      x->tune_metric == AOM_TUNE_DAALA_DIST) {
#if CONFIG_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];

      if ((bsw == visible_w) && (bsh == visible_h)) {
        for (j = 0; j < bsh; j++)
          for (i = 0; i < bsw; i++)
            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
      } else {
        for (j = 0; j < visible_h; j++)
          for (i = 0; i < visible_w; i++)
            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];

        if (visible_w < bsw) {
          for (j = 0; j < bsh; j++)
            for (i = visible_w; i < bsw; i++)
              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
        }

        if (visible_h < bsh) {
          for (j = visible_h; j < bsh; j++)
            for (i = 0; i < bsw; i++)
              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
        }
      }
    } else {
#endif
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];

      if ((bsw == visible_w) && (bsh == visible_h)) {
        for (j = 0; j < bsh; j++)
          for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
      } else {
        for (j = 0; j < visible_h; j++)
          for (i = 0; i < visible_w; i++)
            rec[j * bsw + i] = dst[j * dst_stride + i];

        if (visible_w < bsw) {
          for (j = 0; j < bsh; j++)
            for (i = visible_w; i < bsw; i++)
              rec[j * bsw + i] = src[j * src_stride + i];
        }

        if (visible_h < bsh) {
          for (j = visible_h; j < bsh; j++)
            for (i = 0; i < bsw; i++)
              rec[j * bsw + i] = src[j * src_stride + i];
        }
      }
#if CONFIG_HIGHBITDEPTH
    }
#endif  // CONFIG_HIGHBITDEPTH
  }

  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
    d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
    int coeff_shift = AOMMAX(xd->bd - 8, 0);

    for (i = 0; i < bsh; i += 8) {
      for (j = 0; j < bsw; j += 8) {
        d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
                                 bsw, coeff_shift);
      }
    }
#if CONFIG_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      d = ((uint64_t)d) >> 2 * coeff_shift;
#endif
  } else {
    // Otherwise, MSE by default
    d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
                                tx_bsize, bsh, bsw, visible_h, visible_w);
  }

  return d;
}

static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
                                 int src_stride, const int16_t *diff,
                                 int diff_stride, int bsw, int bsh,
                                 int visible_w, int visible_h, int qindex) {
  int64_t d = 0;
  int i, j;
  const MACROBLOCKD *xd = &x->e_mbd;

  DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
  DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]);

  assert(bsw >= 8);
  assert(bsh >= 8);
  assert((bsw & 0x07) == 0);
  assert((bsh & 0x07) == 0);

  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
      x->tune_metric == AOM_TUNE_DAALA_DIST) {
#if CONFIG_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
    } else {
#endif
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
#if CONFIG_HIGHBITDEPTH
    }
#endif  // CONFIG_HIGHBITDEPTH

    if ((bsw == visible_w) && (bsh == visible_h)) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          diff16[j * bsw + i] = diff[j * diff_stride + i];
    } else {
      for (j = 0; j < visible_h; j++)
        for (i = 0; i < visible_w; i++)
          diff16[j * bsw + i] = diff[j * diff_stride + i];

      if (visible_w < bsw) {
        for (j = 0; j < bsh; j++)
          for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
      }

      if (visible_h < bsh) {
        for (j = visible_h; j < bsh; j++)
          for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
      }
    }
  }

  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
    d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
    int coeff_shift = AOMMAX(xd->bd - 8, 0);
    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]);

    for (i = 0; i < bsh; i++) {
      for (j = 0; j < bsw; j++) {
        dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
      }
    }

    for (i = 0; i < bsh; i += 8) {
      for (j = 0; j < bsw; j += 8) {
        d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
                                 bsw, coeff_shift);
      }
    }
    // Don't scale 'd' for HBD here; the caller performs that scaling when
    // the input is a diff.
  } else {
    // Otherwise, MSE by default
    d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
  }

  return d;
}
#endif  // CONFIG_DIST_8X8

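// Split the block into a 4x4 grid of sub-blocks and measure the SSE of each;
// hordist[] and verdist[] then give the fraction of total error energy in
// the first three column and row bands respectively (the fourth is implied).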
static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                                         const uint8_t *src, int src_stride,
                                         const uint8_t *dst, int dst_stride,
                                         double *hordist, double *verdist) {
  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  const int f_index = bsize - BLOCK_16X16;
  if (f_index < 0) {
    const int w_shift = bw == 8 ? 1 : 2;
    const int h_shift = bh == 8 ? 1 : 2;
#if CONFIG_HIGHBITDEPTH
    if (cpi->common.use_highbitdepth) {
      const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
      const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
      for (int i = 0; i < bh; ++i)
        for (int j = 0; j < bw; ++j) {
          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
          esq[index] +=
              (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
              (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
        }
    } else {
#endif  // CONFIG_HIGHBITDEPTH

      for (int i = 0; i < bh; ++i)
        for (int j = 0; j < bw; ++j) {
          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
        }
#if CONFIG_HIGHBITDEPTH
    }
#endif  // CONFIG_HIGHBITDEPTH
  } else {
    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                            &esq[1]);
    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                            &esq[2]);
    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                            dst_stride, &esq[3]);
    src += bh / 4 * src_stride;
    dst += bh / 4 * dst_stride;

    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                            &esq[5]);
    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                            &esq[6]);
    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                            dst_stride, &esq[7]);
    src += bh / 4 * src_stride;
    dst += bh / 4 * dst_stride;

    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                            &esq[9]);
    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                            &esq[10]);
    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                            dst_stride, &esq[11]);
    src += bh / 4 * src_stride;
    dst += bh / 4 * dst_stride;

    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                            &esq[13]);
    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                            &esq[14]);
    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                            dst_stride, &esq[15]);
  }

  double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
                 esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
                 esq[12] + esq[13] + esq[14] + esq[15];
  if (total > 0) {
    const double e_recip = 1.0 / total;
    hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
    hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
    hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
    verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
    verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
    verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
  } else {
    hordist[0] = verdist[0] = 0.25;
    hordist[1] = verdist[1] = 0.25;
    hordist[2] = verdist[2] = 0.25;
  }
}

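// Project the energy-distribution features through ADST_FLIP_SVM to decide,
// per direction, whether ADST or FLIPADST can be pruned. Bits 0-7 of the
// returned mask refer to vertical 1D transforms, bits 8-15 to horizontal.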
static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                            const uint8_t *src, int src_stride,
                            const uint8_t *dst, int dst_stride) {
  int prune_bitmask = 0;
  double svm_proj_h = 0, svm_proj_v = 0;
  double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
  get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
                               hdist, vdist);

  svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
               vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
  svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
               hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
  if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
    prune_bitmask |= 1 << FLIPADST_1D;
  else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
    prune_bitmask |= 1 << ADST_1D;

  if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
    prune_bitmask |= 1 << (FLIPADST_1D + 8);
  else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
    prune_bitmask |= 1 << (ADST_1D + 8);

  return prune_bitmask;
}

#if CONFIG_EXT_TX
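// Estimate the Pearson correlation between each residual pixel and its left
// (hcorr) and top (vcorr) neighbour over the (h - 1) x (w - 1) interior.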
static void get_horver_correlation(const int16_t *diff, int stride, int w,
                                   int h, double *hcorr, double *vcorr) {
  // Returns hor/ver correlation coefficient
  const int num = (h - 1) * (w - 1);
  double num_r;
  int i, j;
  int64_t xy_sum = 0, xz_sum = 0;
  int64_t x_sum = 0, y_sum = 0, z_sum = 0;
  int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
  double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
  *hcorr = *vcorr = 1;

  assert(num > 0);
  num_r = 1.0 / num;
  for (i = 1; i < h; ++i) {
    for (j = 1; j < w; ++j) {
      const int16_t x = diff[i * stride + j];
      const int16_t y = diff[i * stride + j - 1];
      const int16_t z = diff[(i - 1) * stride + j];
      xy_sum += x * y;
      xz_sum += x * z;
      x_sum += x;
      y_sum += y;
      z_sum += z;
      x2_sum += x * x;
      y2_sum += y * y;
      z2_sum += z * z;
    }
  }
  x_var_n = x2_sum - (x_sum * x_sum) * num_r;
  y_var_n = y2_sum - (y_sum * y_sum) * num_r;
  z_var_n = z2_sum - (z_sum * z_sum) * num_r;
  xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
  xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
  if (x_var_n > 0 && y_var_n > 0) {
    *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
    *hcorr = *hcorr < 0 ? 0 : *hcorr;
  }
  if (x_var_n > 0 && z_var_n > 0) {
    *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
    *vcorr = *vcorr < 0 ? 0 : *vcorr;
  }
}

int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
  double hcorr, vcorr;
  int prune_bitmask = 0;
  get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);

  if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
    prune_bitmask |= 1 << IDTX_1D;
  else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
    prune_bitmask |= 1 << DCT_1D;

  if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
    prune_bitmask |= 1 << (IDTX_1D + 8);
  else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
    prune_bitmask |= 1 << (DCT_1D + 8);
  return prune_bitmask;
}

// Performance drop: 0.5%, Speed improvement: 24%
static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, const MACROBLOCKD *xd,
                             int adst_flipadst, int dct_idtx) {
  int prune = 0;

  if (adst_flipadst) {
    const struct macroblock_plane *const p = &x->plane[0];
    const struct macroblockd_plane *const pd = &xd->plane[0];
    prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
                              pd->dst.buf, pd->dst.stride);
  }
  if (dct_idtx) {
    av1_subtract_plane(x, bsize, 0);
    const struct macroblock_plane *const p = &x->plane[0];
    const int bw = 4 << (b_width_log2_lookup[bsize]);
    const int bh = 4 << (b_height_log2_lookup[bsize]);
    prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
  }

  return prune;
}
#endif  // CONFIG_EXT_TX

// Performance drop: 0.3%, Speed improvement: 5%
static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                             const MACROBLOCK *x, const MACROBLOCKD *xd) {
  const struct macroblock_plane *const p = &x->plane[0];
  const struct macroblockd_plane *const pd = &xd->plane[0];
  return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
                          pd->dst.stride);
}

#if CONFIG_EXT_TX
// 1D transforms used in the inter set; this table needs to change whenever
// ext_tx_used_inter changes.
static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
  { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
#if CONFIG_MRC_TX
  { 1, 0, 0, 1 },
#endif  // CONFIG_MRC_TX
};
#endif  // CONFIG_EXT_TX

static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                          const MACROBLOCKD *const xd, int tx_set) {
#if CONFIG_EXT_TX
  const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL;
#else
  const int tx_set_1D[TX_TYPES_1D] = { 0 };
#endif  // CONFIG_EXT_TX

  switch (cpi->sf.tx_type_search.prune_mode) {
    case NO_PRUNE: return 0; break;
    case PRUNE_ONE:
      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
        return 0;
      return prune_one_for_sby(cpi, bsize, x, xd);
      break;
#if CONFIG_EXT_TX
    case PRUNE_TWO:
      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
        if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
        return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
      }
      if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
        return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
      return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
      break;
#endif  // CONFIG_EXT_TX
  }
  assert(0);
  return 0;
}

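// Returns 1 if tx_type survived pruning, i.e. neither its vertical 1D
// component (bits 0-7 of the prune mask) nor its horizontal one (bits 8-15)
// is set.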
static int do_tx_type_search(TX_TYPE tx_type, int prune) {
// TODO(sarahparker) implement for non ext tx
#if CONFIG_EXT_TX
  return !(((prune >> vtx_tab[tx_type]) & 1) |
           ((prune >> (htx_tab[tx_type] + 8)) & 1));
#else
  // temporary to avoid compiler warnings
  (void)vtx_tab;
  (void)htx_tab;
  (void)tx_type;
  (void)prune;
  return 1;
#endif  // CONFIG_EXT_TX
}

static void model_rd_from_sse(const AV1_COMP *const cpi,
                              const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
                              int plane, int64_t sse, int *rate,
                              int64_t *dist) {
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const int dequant_shift =
#if CONFIG_HIGHBITDEPTH
      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
#endif  // CONFIG_HIGHBITDEPTH
                                                    3;

  // Fast approximation of the modelling function.
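  // The simple model below is roughly rate = sse * (280 - q) >>
  // (16 - AV1_PROB_COST_SHIFT) for quantizers under 120 (zero otherwise) and
  // dist = sse * q / 256; dist is then scaled by 16 at the end to match the
  // pipeline's distortion units.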
  if (cpi->sf.simple_model_rd_from_var) {
    const int64_t square_error = sse;
    int quantizer = (pd->dequant[1] >> dequant_shift);

    if (quantizer < 120)
      *rate = (int)((square_error * (280 - quantizer)) >>
                    (16 - AV1_PROB_COST_SHIFT));
    else
      *rate = 0;
    *dist = (square_error * quantizer) >> 8;
  } else {
    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
                                 pd->dequant[1] >> dequant_shift, rate, dist);
  }

  *dist <<= 4;
}

static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
                            int plane_to, int *out_rate_sum,
                            int64_t *out_dist_sum, int *skip_txfm_sb,
                            int64_t *skip_sse_sb) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  int plane;
  const int ref = xd->mi[0]->mbmi.ref_frame[0];

  int64_t rate_sum = 0;
  int64_t dist_sum = 0;
  int64_t total_sse = 0;

  x->pred_sse[ref] = 0;

  for (plane = plane_from; plane <= plane_to; ++plane) {
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
#if CONFIG_CHROMA_SUB8X8
    const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
#else
    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
#endif  // CONFIG_CHROMA_SUB8X8

    unsigned int sse;
    int rate;
    int64_t dist;

#if CONFIG_CB4X4
    if (x->skip_chroma_rd && plane) continue;
#endif  // CONFIG_CB4X4

    // TODO(geza): Write direct sse functions that do not compute
    // variance as well.
    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                       &sse);

    if (plane == 0) x->pred_sse[ref] = sse;

    total_sse += sse;

    model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);

    rate_sum += rate;
    dist_sum += dist;
  }

  *skip_txfm_sb = total_sse == 0;
  *skip_sse_sb = total_sse << 4;
  *out_rate_sum = (int)rate_sum;
  *out_dist_sum = dist_sum;
}

int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz) {
  int i;
  int64_t error = 0, sqcoeff = 0;

  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += coeff[i] * coeff[i];
  }

  *ssz = sqcoeff;
  return error;
}

int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                             int block_size) {
  int i;
  int64_t error = 0;

  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }

  return error;
}

#if CONFIG_HIGHBITDEPTH
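// High bit depth variant: the error and squared-coefficient sums are scaled
// back to the 8-bit domain by >> (2 * (bd - 8)), with rounding, so costs
// stay comparable across bit depths.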
int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
                                 const tran_low_t *dqcoeff, intptr_t block_size,
                                 int64_t *ssz, int bd) {
  int i;
  int64_t error = 0, sqcoeff = 0;
  int shift = 2 * (bd - 8);
  int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i++) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
#endif  // CONFIG_HIGHBITDEPTH

#if CONFIG_PVQ
1379 // Without PVQ, av1_block_error_c() returns two kinds of errors:
1380 // 1) the reconstruction (i.e. decoded) error, and
1381 // 2) the squared sum of the transformed residue (i.e. 'coeff').
1382 // However, if PVQ is enabled, 'coeff' does not hold the transformed residue;
1383 // it holds the transformed original signal instead.
1384 // Hence a new parameter, the ref vector (i.e. the transformed predicted
1385 // signal), is required to derive the residue signal:
1386 // coeff - ref = residue (all in the transform domain).
1387 
1388 #if CONFIG_HIGHBITDEPTH
1389 static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff,
1390                                          const tran_low_t *dqcoeff,
1391                                          const tran_low_t *ref,
1392                                          intptr_t block_size, int64_t *ssz,
1393                                          int bd) {
1394   int64_t error;
1395   int64_t sqcoeff;
1396   int shift = 2 * (bd - 8);
1397   int rounding = shift > 0 ? 1 << (shift - 1) : 0;
1398   // Use the existing sse codes for calculating distortion of decoded signal:
1399   // i.e. (orig - decoded)^2
1400   // For high bit depth, throw away ssz until a 32-bit version of
1401   // av1_block_error_fp is written.
1402   int64_t ssz_trash;
1403   error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
1404   // prediction residue^2 = (orig - ref)^2
1405   sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash);
1406   error = (error + rounding) >> shift;
1407   sqcoeff = (sqcoeff + rounding) >> shift;
1408   *ssz = sqcoeff;
1409   return error;
1410 }
1411 #else
1412 // TODO(yushin): Since the 4x4 case does not need ssz, it would be better to
1413 // refactor this into a separate function that skips the extra ssz work.
1414 static int64_t av1_block_error2_c(const tran_low_t *coeff,
1415                                   const tran_low_t *dqcoeff,
1416                                   const tran_low_t *ref, intptr_t block_size,
1417                                   int64_t *ssz) {
1418   int64_t error;
1419   int64_t ssz_trash;
1420   // Use the existing sse codes for calculating distortion of decoded signal:
1421   // i.e. (orig - decoded)^2
1422   error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
1423   // prediction residue^2 = (orig - ref)^2
1424   *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash);
1425   return error;
1426 }
1427 #endif  // CONFIG_HIGHBITDEPTH
1428 #endif  // CONFIG_PVQ
1429 
1430 #if !CONFIG_PVQ || CONFIG_VAR_TX
1431 #if !CONFIG_LV_MAP
1432 static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
1433                        int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
1434                        const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
1435                        int use_fast_coef_costing) {
1436   MACROBLOCKD *const xd = &x->e_mbd;
1437   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1438   const struct macroblock_plane *p = &x->plane[plane];
1439   const struct macroblockd_plane *pd = &xd->plane[plane];
1440   const PLANE_TYPE type = pd->plane_type;
1441   const uint16_t *band_count = &band_count_table[tx_size][1];
1442   const int eob = p->eobs[block];
1443   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
1444   const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
1445   uint8_t token_cache[MAX_TX_SQUARE];
1446   int pt = combine_entropy_contexts(*a, *l);
1447   int c, cost;
1448   const int16_t *scan = scan_order->scan;
1449   const int16_t *nb = scan_order->neighbors;
1450   const int ref = is_inter_block(mbmi);
1451   int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
1452       x->token_head_costs[tx_size_ctx][type][ref];
1453   int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
1454       x->token_tail_costs[tx_size_ctx][type][ref];
1455   const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size);
1456   int eob_val;
1457 
1458 #if CONFIG_HIGHBITDEPTH
1459   const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
1460 #else
1461   const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
1462 #endif  // CONFIG_HIGHBITDEPTH
1463 
1464 #if !CONFIG_VAR_TX && !CONFIG_SUPERTX
1465   // Check for consistency of tx_size with mode info
1466   assert(tx_size == av1_get_tx_size(plane, xd));
1467 #endif  // !CONFIG_VAR_TX && !CONFIG_SUPERTX
1468   (void)cm;
1469 
1470   if (eob == 0) {
1471     // eob == 0: the block is all zero; cost just the block-zero head token.
1472     cost = (*head_token_costs)[pt][0];
1473   } else {
1474     if (use_fast_coef_costing) {
1475       int band_left = *band_count++;
1476 
1477       // dc token
1478       int v = qcoeff[0];
1479       int16_t prev_t;
1480       cost = av1_get_token_cost(v, &prev_t, cat6_bits);
1481       eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
1482       cost += av1_get_coeff_token_cost(
1483           prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
1484 
1485       token_cache[0] = av1_pt_energy_class[prev_t];
1486       ++head_token_costs;
1487       ++tail_token_costs;
1488 
1489       // ac tokens
1490       for (c = 1; c < eob; c++) {
1491         const int rc = scan[c];
1492         int16_t t;
1493 
1494         v = qcoeff[rc];
1495         cost += av1_get_token_cost(v, &t, cat6_bits);
1496         eob_val =
1497             (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
1498         cost += av1_get_coeff_token_cost(t, eob_val, 0,
1499                                          (*head_token_costs)[!prev_t],
1500                                          (*tail_token_costs)[!prev_t]);
1501         prev_t = t;
1502         if (!--band_left) {
1503           band_left = *band_count++;
1504           ++head_token_costs;
1505           ++tail_token_costs;
1506         }
1507       }
1508     } else {  // !use_fast_coef_costing
1509       int band_left = *band_count++;
1510 
1511       // dc token
1512       int v = qcoeff[0];
1513       int16_t tok;
1514       cost = av1_get_token_cost(v, &tok, cat6_bits);
1515       eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
1516       cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt],
1517                                        (*tail_token_costs)[pt]);
1518 
1519       token_cache[0] = av1_pt_energy_class[tok];
1520       ++head_token_costs;
1521       ++tail_token_costs;
1522 
1523       // ac tokens
1524       for (c = 1; c < eob; c++) {
1525         const int rc = scan[c];
1526 
1527         v = qcoeff[rc];
1528         cost += av1_get_token_cost(v, &tok, cat6_bits);
1529         pt = get_coef_context(nb, token_cache, c);
1530         eob_val =
1531             (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
1532         cost += av1_get_coeff_token_cost(
1533             tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
1534         token_cache[rc] = av1_pt_energy_class[tok];
1535         if (!--band_left) {
1536           band_left = *band_count++;
1537           ++head_token_costs;
1538           ++tail_token_costs;
1539         }
1540       }
1541     }
1542   }
1543 
1544   return cost;
1545 }
1546 #endif  // !CONFIG_LV_MAP
1547 
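// Entry point for coefficient rate estimation. With CONFIG_LV_MAP this
// dispatches to av1_cost_coeffs_txb(); otherwise it uses the token-based
// cost_coeffs() above, adding the MRC mask cost when an MRC_DCT block
// signals its mask.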
1548 int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
1549                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
1550                     const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a,
1551                     const ENTROPY_CONTEXT *l, int use_fast_coef_costing) {
1552   const AV1_COMMON *const cm = &cpi->common;
1553 #if !CONFIG_LV_MAP
1554   (void)blk_row;
1555   (void)blk_col;
1556 #if CONFIG_MRC_TX
1557   const MACROBLOCKD *xd = &x->e_mbd;
1558   const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1559   const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd,
1560                                           blk_row, blk_col, block, tx_size);
1561   const int is_inter = is_inter_block(mbmi);
1562   if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
1563                              (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
1564     const int mrc_mask_cost =
1565         av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP);
1566     return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
1567                        use_fast_coef_costing) +
1568            mrc_mask_cost;
1569   }
1570 #endif
1571   return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
1572                      use_fast_coef_costing);
1573 #else  // !CONFIG_LV_MAP
1574   (void)scan_order;
1575   (void)use_fast_coef_costing;
1576   const MACROBLOCKD *xd = &x->e_mbd;
1577   const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1578   const struct macroblockd_plane *pd = &xd->plane[plane];
1579   const BLOCK_SIZE bsize = mbmi->sb_type;
1580 #if CONFIG_CHROMA_SUB8X8
1581   const BLOCK_SIZE plane_bsize =
1582       AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
1583 #elif CONFIG_CB4X4
1584   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
1585 #else   // CONFIG_CB4X4
1586   const BLOCK_SIZE plane_bsize =
1587       get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
1588 #endif  // CONFIG_CB4X4
1589 
1590   TXB_CTX txb_ctx;
1591   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
1592   return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size,
1593                              &txb_ctx);
1594 #endif  // !CONFIG_LV_MAP
1595 }
1596 #endif  // !CONFIG_PVQ || CONFIG_VAR_TX
1597 
1598 // Get the visible dimensions of a transform block, cropped to the MI units.
1599 static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
1600                                BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
1601                                BLOCK_SIZE tx_bsize, int *width, int *height,
1602                                int *visible_width, int *visible_height) {
1603 #if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX))
1604   assert(tx_bsize <= plane_bsize);
1605 #endif
1606   int txb_height = block_size_high[tx_bsize];
1607   int txb_width = block_size_wide[tx_bsize];
1608   const int block_height = block_size_high[plane_bsize];
1609   const int block_width = block_size_wide[plane_bsize];
1610   const struct macroblockd_plane *const pd = &xd->plane[plane];
1611   // TODO(aconverse@google.com): Investigate using crop_width/height here rather
1612   // than the MI size
1613   const int block_rows =
1614       (xd->mb_to_bottom_edge >= 0)
1615           ? block_height
1616           : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
1617   const int block_cols =
1618       (xd->mb_to_right_edge >= 0)
1619           ? block_width
1620           : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
1621   const int tx_unit_size = tx_size_wide_log2[0];
1622   if (width) *width = txb_width;
1623   if (height) *height = txb_height;
1624   *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
1625   *visible_height =
1626       clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
1627 }
1628 
1629 // Compute the pixel domain distortion from src and dst on all visible 4x4s
1630 // in the transform block.
1632 static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
1633                            int plane, const uint8_t *src, const int src_stride,
1634                            const uint8_t *dst, const int dst_stride,
1635                            int blk_row, int blk_col,
1636                            const BLOCK_SIZE plane_bsize,
1637                            const BLOCK_SIZE tx_bsize) {
1638   int txb_rows, txb_cols, visible_rows, visible_cols;
1639   const MACROBLOCKD *xd = &x->e_mbd;
1640 
1641   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
1642                      &txb_cols, &txb_rows, &visible_cols, &visible_rows);
1643   assert(visible_rows > 0);
1644   assert(visible_cols > 0);
1645 
1646 #if CONFIG_DIST_8X8
1647   if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
1648     return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
1649                                   tx_bsize, txb_cols, txb_rows, visible_cols,
1650                                   visible_rows, x->qindex);
1651 #endif  // CONFIG_DIST_8X8
1652 
1653   unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
1654                                          dst_stride, tx_bsize, txb_rows,
1655                                          txb_cols, visible_rows, visible_cols);
1656 
1657   return sse;
1658 }
1659 
1660 // Compute the pixel domain distortion from diff on all visible 4x4s in the
1661 // transform block.
1662 static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
1663                                const int16_t *diff, const int diff_stride,
1664                                int blk_row, int blk_col,
1665                                const BLOCK_SIZE plane_bsize,
1666                                const BLOCK_SIZE tx_bsize) {
1667   int visible_rows, visible_cols;
1668   const MACROBLOCKD *xd = &x->e_mbd;
1669 #if CONFIG_DIST_8X8
1670   int txb_height = block_size_high[tx_bsize];
1671   int txb_width = block_size_wide[tx_bsize];
1672   const int src_stride = x->plane[plane].src.stride;
1673   const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
1674   const uint8_t *src = &x->plane[plane].src.buf[src_idx];
1675 #endif
1676 
1677   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
1678                      NULL, &visible_cols, &visible_rows);
1679 
1680 #if CONFIG_DIST_8X8
1681   if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
1682     return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
1683                              txb_height, visible_cols, visible_rows, x->qindex);
1684   else
1685 #endif
1686     return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols,
1687                                   visible_rows);
1688 }
1689 
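// Counts the number of distinct sample values in a block (e.g. to decide
// whether a palette is worth searching).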
1690 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
1691   int val_count[256];
1692   memset(val_count, 0, sizeof(val_count));
1693   for (int r = 0; r < rows; ++r) {
1694     for (int c = 0; c < cols; ++c) {
1695       ++val_count[src[r * stride + c]];
1696     }
1697   }
1698   int n = 0;
1699   for (int i = 0; i < 256; ++i) {
1700     if (val_count[i]) ++n;
1701   }
1702   return n;
1703 }
1704 
1705 #if CONFIG_HIGHBITDEPTH
1706 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
1707                             int bit_depth) {
1708   assert(bit_depth <= 12);
1709   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1710   int val_count[1 << 12];
1711   memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
1712   for (int r = 0; r < rows; ++r) {
1713     for (int c = 0; c < cols; ++c) {
1714       ++val_count[src[r * stride + c]];
1715     }
1716   }
1717   int n = 0;
1718   for (int i = 0; i < (1 << bit_depth); ++i) {
1719     if (val_count[i]) ++n;
1720   }
1721   return n;
1722 }
1723 #endif  // CONFIG_HIGHBITDEPTH
1724 
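// Computes the distortion (and SSE) of one transform block, either in the
// transform domain (cheaper: no inverse transform) or in the pixel domain
// (more accurate), depending on sf.use_transform_domain_distortion. Both
// outputs are scaled by 16 to match the distortion scale used by the RD code.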
1725 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
1726                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
1727                     TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
1728                     OUTPUT_STATUS output_status) {
1729   MACROBLOCKD *const xd = &x->e_mbd;
1730   const struct macroblock_plane *const p = &x->plane[plane];
1731 #if CONFIG_DIST_8X8
1732   struct macroblockd_plane *const pd = &xd->plane[plane];
1733 #else   // CONFIG_DIST_8X8
1734   const struct macroblockd_plane *const pd = &xd->plane[plane];
1735 #endif  // CONFIG_DIST_8X8
1736 
1737   if (cpi->sf.use_transform_domain_distortion
1738 #if CONFIG_DIST_8X8
1739       && !x->using_dist_8x8
1740 #endif
1741       ) {
1742     // Transform domain distortion computation is more efficient as it does
1743     // not involve an inverse transform, but it is less accurate.
1744     const int buffer_length = tx_size_2d[tx_size];
1745     int64_t this_sse;
1746     int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
1747     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
1748     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
1749 #if CONFIG_PVQ
1750     tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
1751 
1752 #if CONFIG_HIGHBITDEPTH
1753     const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
1754     *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
1755                                           buffer_length, &this_sse, bd);
1756 #else
1757     *out_dist =
1758         av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse);
1759 #endif  // CONFIG_HIGHBITDEPTH
1760 #else   // !CONFIG_PVQ
1761 #if CONFIG_HIGHBITDEPTH
1762     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1763       *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
1764                                          &this_sse, xd->bd);
1765     else
1766 #endif
1767       *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
1768 #endif  // CONFIG_PVQ
1769     *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
1770     *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
1771   } else {
1772     const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
1773 #if !CONFIG_PVQ || CONFIG_DIST_8X8
1774     const int bsw = block_size_wide[tx_bsize];
1775     const int bsh = block_size_high[tx_bsize];
1776 #endif
1777     const int src_stride = x->plane[plane].src.stride;
1778     const int dst_stride = xd->plane[plane].dst.stride;
1779     // Scale the transform block index to pixel units.
1780     const int src_idx = (blk_row * src_stride + blk_col)
1781                         << tx_size_wide_log2[0];
1782     const int dst_idx = (blk_row * dst_stride + blk_col)
1783                         << tx_size_wide_log2[0];
1784     const uint8_t *src = &x->plane[plane].src.buf[src_idx];
1785     const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
1786     const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
1787     const uint16_t eob = p->eobs[block];
1788 
1789     assert(cpi != NULL);
1790     assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
1791 
1792     {
1793       const int diff_stride = block_size_wide[plane_bsize];
1794       const int diff_idx = (blk_row * diff_stride + blk_col)
1795                            << tx_size_wide_log2[0];
1796       const int16_t *diff = &p->src_diff[diff_idx];
1797       *out_sse = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
1798                                  plane_bsize, tx_bsize);
1799 #if CONFIG_HIGHBITDEPTH
1800       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1801         *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
1802 #endif  // CONFIG_HIGHBITDEPTH
1803     }
1804     *out_sse *= 16;
1805 
1806     if (eob) {
1807       if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
1808         *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride,
1809                                blk_row, blk_col, plane_bsize, tx_bsize);
1810       } else {
1811 #if CONFIG_HIGHBITDEPTH
1812         uint8_t *recon;
1813         DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
1814 
1815         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1816           recon = CONVERT_TO_BYTEPTR(recon16);
1817         else
1818           recon = (uint8_t *)recon16;
1819 #else
1820         DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
1821 #endif  // CONFIG_HIGHBITDEPTH
1822 
1823 #if !CONFIG_PVQ
1824 #if CONFIG_HIGHBITDEPTH
1825         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1826           aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
1827                                    NULL, 0, bsw, bsh, xd->bd);
1828         } else {
1829 #endif  // CONFIG_HIGHBITDEPTH
1830           aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL,
1831                             0, bsw, bsh);
1832 #if CONFIG_HIGHBITDEPTH
1833         }
1834 #endif  // CONFIG_HIGHBITDEPTH
1835 #else
1836         (void)dst;
1837 #endif  // !CONFIG_PVQ
1838 
1839 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
1840         uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
1841 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
1842         const PLANE_TYPE plane_type = get_plane_type(plane);
1843         TX_TYPE tx_type =
1844             av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
1845         av1_inverse_transform_block(xd, dqcoeff,
1846 #if CONFIG_LGT_FROM_PRED
1847                                     xd->mi[0]->mbmi.mode,
1848 #endif
1849 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
1850                                     mrc_mask,
1851 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
1852                                     tx_type, tx_size, recon, MAX_TX_SIZE, eob);
1853 
1854 #if CONFIG_DIST_8X8
1855         if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
1856           // Save the decoded pixels of an inter block in pd->pred so that
1857           // block_8x8_rd_txfm_daala_dist() does not need to reproduce them
1858           // by calling av1_inverse_transform_block() again.
1859           const int pred_stride = block_size_wide[plane_bsize];
1860           const int pred_idx = (blk_row * pred_stride + blk_col)
1861                                << tx_size_wide_log2[0];
1862           int16_t *pred = &pd->pred[pred_idx];
1863           int i, j;
1864 
1865 #if CONFIG_HIGHBITDEPTH
1866           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1867             for (j = 0; j < bsh; j++)
1868               for (i = 0; i < bsw; i++)
1869                 pred[j * pred_stride + i] =
1870                     CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
1871           } else {
1872 #endif
1873             for (j = 0; j < bsh; j++)
1874               for (i = 0; i < bsw; i++)
1875                 pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
1876 #if CONFIG_HIGHBITDEPTH
1877           }
1878 #endif  // CONFIG_HIGHBITDEPTH
1879         }
1880 #endif  // CONFIG_DIST_8X8
1881         *out_dist =
1882             pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
1883                        blk_row, blk_col, plane_bsize, tx_bsize);
1884       }
1885       *out_dist *= 16;
1886     } else {
1887       *out_dist = *out_sse;
1888     }
1889   }
1890 }
1891 
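// Per-transform-block RD callback, invoked through
// av1_foreach_transformed_block_in_plane(). Intra blocks are predicted and
// subtracted first; the block is then forward transformed and quantized
// (optionally trellis-optimized), its distortion and coefficient rate are
// measured, and the result is merged into args->rd_stats. args->exit_early
// is set as soon as the accumulated cost exceeds args->best_rd.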
1892 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
1893                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
1894   struct rdcost_block_args *args = arg;
1895   MACROBLOCK *const x = args->x;
1896   MACROBLOCKD *const xd = &x->e_mbd;
1897   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1898   const AV1_COMP *cpi = args->cpi;
1899   ENTROPY_CONTEXT *a = args->t_above + blk_col;
1900   ENTROPY_CONTEXT *l = args->t_left + blk_row;
1901   const AV1_COMMON *cm = &cpi->common;
1902   int64_t rd1, rd2, rd;
1903   RD_STATS this_rd_stats;
1904 
1905 #if CONFIG_DIST_8X8
1906   // For a sub-8x8 tx in an 8x8-or-larger partition on the luma channel,
1907   // dist-8x8 disables early skip, because the distortion metrics for the
1908   // sub-8x8 tx (MSE) and for the reference distortion of the 8x8-or-larger
1909   // partition (the new distortion metric) are different.
1910   // The exception is when dist-8x8 is enabled but MSE is still in use,
1911   // i.e. the "--tune=" encoder option is not given.
1912   int disable_early_skip =
1913       x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
1914       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
1915       x->tune_metric != AOM_TUNE_PSNR;
1916 #endif  // CONFIG_DIST_8X8
1917 
1918 #if !CONFIG_SUPERTX && !CONFIG_VAR_TX
1919   assert(tx_size == av1_get_tx_size(plane, xd));
1920 #endif  // !CONFIG_SUPERTX && !CONFIG_VAR_TX
1921 
1922   av1_init_rd_stats(&this_rd_stats);
1923 
1924   if (args->exit_early) return;
1925 
1926   if (!is_inter_block(mbmi)) {
1927     av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
1928                                    tx_size);
1929     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
1930   }
1931 
1932 #if !CONFIG_TXK_SEL
1933   // full forward transform and quantization
1934   const int coeff_ctx = combine_entropy_contexts(*a, *l);
1935 #if DISABLE_TRELLISQ_SEARCH
1936   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
1937                   coeff_ctx, AV1_XFORM_QUANT_B);
1938 #else
1939   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
1940                   coeff_ctx, AV1_XFORM_QUANT_FP);
1941 
1942   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
1943   tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
1944   tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
1945   const int buffer_length = tx_size_2d[tx_size];
1946   int64_t tmp_dist;
1947   int64_t tmp;
1948 #if CONFIG_HIGHBITDEPTH
1949   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1950     tmp_dist =
1951         av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
1952   else
1953 #endif
1954     tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
1955   tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
1956 
1957   if (
1958 #if CONFIG_DIST_8X8
1959       disable_early_skip ||
1960 #endif
1961       RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
1962     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
1963                    a, l, 1);
1964   } else {
1965     args->exit_early = 1;
1966     return;
1967   }
1968 #endif  // DISABLE_TRELLISQ_SEARCH
1969 
1970 #if CONFIG_MRC_TX
1971   if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) {
1972     args->exit_early = 1;
1973     return;
1974   }
1975 #endif  // CONFIG_MRC_TX
1976 
1977   if (!is_inter_block(mbmi)) {
1978     struct macroblock_plane *const p = &x->plane[plane];
1979     av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
1980                                        p->eobs[block]);
1981     av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
1982                    tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
1983                    OUTPUT_HAS_DECODED_PIXELS);
1984   } else {
1985     av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
1986                    tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
1987                    OUTPUT_HAS_PREDICTED_PIXELS);
1988   }
1989 #if CONFIG_CFL
1990   if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
1991 #if CONFIG_CHROMA_SUB8X8
1992     assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
1993 #else
1994     assert(!is_inter_block(mbmi));
1995 #endif  // CONFIG_CHROMA_SUB8X8
1996     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
1997   }
1998 #endif  // CONFIG_CFL
1999   rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
2000   if (args->this_rd + rd > args->best_rd) {
2001     args->exit_early = 1;
2002     return;
2003   }
2004 #if !CONFIG_PVQ
2005   const PLANE_TYPE plane_type = get_plane_type(plane);
2006   const TX_TYPE tx_type =
2007       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
2008   const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
2009   this_rd_stats.rate =
2010       av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
2011                       scan_order, a, l, args->use_fast_coef_costing);
2012 #else   // !CONFIG_PVQ
2013   this_rd_stats.rate = x->rate;
2014 #endif  // !CONFIG_PVQ
2015 #else   // !CONFIG_TXK_SEL
2016   av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
2017                       tx_size, a, l, args->use_fast_coef_costing,
2018                       &this_rd_stats);
2019 #endif  // !CONFIG_TXK_SEL
2020 
2021 #if !CONFIG_PVQ
2022 #if CONFIG_RD_DEBUG
2023   av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
2024                             this_rd_stats.rate);
2025 #endif  // CONFIG_RD_DEBUG
2026   av1_set_txb_context(x, plane, block, tx_size, a, l);
2027 #endif  // !CONFIG_PVQ
2028 
2029   rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
2030   rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
2031 
2032   // TODO(jingning): temporarily enabled only for luma component
2033   rd = AOMMIN(rd1, rd2);
2034 
2035 #if !CONFIG_PVQ
2036   this_rd_stats.skip &= !x->plane[plane].eobs[block];
2037 #else
2038   this_rd_stats.skip &= x->pvq_skip[plane];
2039 #endif  // !CONFIG_PVQ
2040   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
2041 
2042   args->this_rd += rd;
2043 
2044 #if CONFIG_DIST_8X8
2045   if (!disable_early_skip)
2046 #endif
2047     if (args->this_rd > args->best_rd) {
2048       args->exit_early = 1;
2049       return;
2050     }
2051 }
2052 
2053 #if CONFIG_DIST_8X8
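// Once all sub-8x8 transform blocks of a luma block are coded, recompute the
// whole-block SSE and distortion with the dist-8x8 metric and overwrite the
// per-block totals accumulated in args->rd_stats.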
2054 static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
2055                                     BLOCK_SIZE bsize,
2056                                     struct rdcost_block_args *args) {
2057   MACROBLOCKD *const xd = &x->e_mbd;
2058   const struct macroblockd_plane *const pd = &xd->plane[0];
2059   const struct macroblock_plane *const p = &x->plane[0];
2060   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2061   const int src_stride = p->src.stride;
2062   const int dst_stride = pd->dst.stride;
2063   const uint8_t *src = &p->src.buf[0];
2064   const uint8_t *dst = &pd->dst.buf[0];
2065   const int16_t *pred = &pd->pred[0];
2066   int bw = block_size_wide[bsize];
2067   int bh = block_size_high[bsize];
2068   int visible_w = bw;
2069   int visible_h = bh;
2070 
2071   int i, j;
2072   int64_t rd, rd1, rd2;
2073   unsigned int tmp1, tmp2;
2074   int qindex = x->qindex;
2075 
2076   assert((bw & 0x07) == 0);
2077   assert((bh & 0x07) == 0);
2078 
2079   get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
2080                      &visible_h);
2081 
2082 #if CONFIG_HIGHBITDEPTH
2083   uint8_t *pred8;
2084   DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]);
2085 
2086   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
2087     pred8 = CONVERT_TO_BYTEPTR(pred16);
2088   else
2089     pred8 = (uint8_t *)pred16;
2090 #else
2091   DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
2092 #endif  // CONFIG_HIGHBITDEPTH
2093 
2094 #if CONFIG_HIGHBITDEPTH
2095   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2096     for (j = 0; j < bh; j++)
2097       for (i = 0; i < bw; i++)
2098         CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
2099   } else {
2100 #endif
2101     for (j = 0; j < bh; j++)
2102       for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
2103 #if CONFIG_HIGHBITDEPTH
2104   }
2105 #endif  // CONFIG_HIGHBITDEPTH
2106 
2107   tmp1 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw,
2108                                 bh, visible_w, visible_h, qindex);
2109   tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize,
2110                                 bw, bh, visible_w, visible_h, qindex);
2111 
2112   if (!is_inter_block(mbmi)) {
2113     if (x->tune_metric == AOM_TUNE_PSNR) {
2114       assert(args->rd_stats.sse == tmp1 * 16);
2115       assert(args->rd_stats.dist == tmp2 * 16);
2116     }
2117     args->rd_stats.sse = (int64_t)tmp1 * 16;
2118     args->rd_stats.dist = (int64_t)tmp2 * 16;
2119   } else {
2120     // For inter mode, the decoded pixels are provided in pd->pred,
2121     // while the predicted pixels are in dst.
2122     if (x->tune_metric == AOM_TUNE_PSNR) {
2123       assert(args->rd_stats.sse == tmp2 * 16);
2124       assert(args->rd_stats.dist == tmp1 * 16);
2125     }
2126     args->rd_stats.sse = (int64_t)tmp2 * 16;
2127     args->rd_stats.dist = (int64_t)tmp1 * 16;
2128   }
2129 
2130   rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
2131   rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
2132   rd = AOMMIN(rd1, rd2);
2133 
2134   args->rd_stats.rdcost = rd;
2135   args->this_rd = rd;
2136 
2137   if (args->this_rd > args->best_rd) args->exit_early = 1;
2138 }
2139 #endif  // CONFIG_DIST_8X8
2140 
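// Measures the RD cost of coding one plane with a fixed tx_size by running
// block_rd_txfm() over every transform block in the plane. rd_stats is set
// to invalid if the search bailed out early against ref_best_rd.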
2141 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
2142                              RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
2143                              BLOCK_SIZE bsize, TX_SIZE tx_size,
2144                              int use_fast_coef_costing) {
2145   MACROBLOCKD *const xd = &x->e_mbd;
2146   const struct macroblockd_plane *const pd = &xd->plane[plane];
2147   struct rdcost_block_args args;
2148   av1_zero(args);
2149   args.x = x;
2150   args.cpi = cpi;
2151   args.best_rd = ref_best_rd;
2152   args.use_fast_coef_costing = use_fast_coef_costing;
2153   av1_init_rd_stats(&args.rd_stats);
2154 
2155   if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
2156 
2157   av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
2158 
2159   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
2160                                          &args);
2161 #if CONFIG_DIST_8X8
2162   if (x->using_dist_8x8 && !args.exit_early && plane == 0 &&
2163       bsize >= BLOCK_8X8 &&
2164       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
2165     dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
2166 #endif
2167 
2168   if (args.exit_early) {
2169     av1_invalid_rd_stats(rd_stats);
2170   } else {
2171     *rd_stats = args.rd_stats;
2172   }
2173 }
2174 
2175 #if CONFIG_SUPERTX
2176 void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
2177                                   int64_t *distortion, int *skippable,
2178                                   int64_t *sse, int64_t ref_best_rd, int plane,
2179                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
2180                                   int use_fast_coef_costing) {
2181   MACROBLOCKD *const xd = &x->e_mbd;
2182   const struct macroblockd_plane *const pd = &xd->plane[plane];
2183   struct rdcost_block_args args;
2184   av1_zero(args);
2185   args.cpi = cpi;
2186   args.x = x;
2187   args.best_rd = ref_best_rd;
2188   args.use_fast_coef_costing = use_fast_coef_costing;
2189 
2190 #if CONFIG_EXT_TX
2191   assert(tx_size < TX_SIZES);
2192 #endif  // CONFIG_EXT_TX
2193 
2194   if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
2195 
2196   av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
2197 
2198   block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
2199                 &args);
2200 
2201   if (args.exit_early) {
2202     *rate = INT_MAX;
2203     *distortion = INT64_MAX;
2204     *sse = INT64_MAX;
2205     *skippable = 0;
2206   } else {
2207     *distortion = args.rd_stats.dist;
2208     *rate = args.rd_stats.rate;
2209     *sse = args.rd_stats.sse;
2210     *skippable = !x->plane[plane].eobs[0];
2211   }
2212 }
2213 #endif  // CONFIG_SUPERTX
2214 
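// Returns the bit cost of signaling tx_size; nonzero only when the frame
// uses TX_MODE_SELECT and the block size can signal its own transform size.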
2215 static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
2216                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
2217   const AV1_COMMON *const cm = &cpi->common;
2218   const MACROBLOCKD *const xd = &x->e_mbd;
2219   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2220 
2221   if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
2222     const int is_inter = is_inter_block(mbmi);
2223     const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
2224                                          : intra_tx_size_cat_lookup[bsize];
2225     const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
2226     const int depth = tx_size_to_depth(coded_tx_size);
2227     const int tx_size_ctx = get_tx_size_context(xd);
2228     int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
2229 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
2230     if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
2231       r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob,
2232                                 tx_size == quarter_txsize_lookup[bsize]);
2233 #endif
2234     return r_tx_size;
2235   } else {
2236     return 0;
2237   }
2238 }
2239 
2240 #if CONFIG_LGT_FROM_PRED
2241 int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
2242                  const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
2243                  TX_SIZE tx_size, int use_lgt) {
2244   if (plane > 0) return 0;
2245   const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2246   const int is_inter = is_inter_block(mbmi);
2247 
2248   assert(is_lgt_allowed(mbmi->mode, tx_size));
2249   if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
2250       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
2251     const int ext_tx_set =
2252         get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
2253     if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
2254         ALLOW_INTRA_EXT_TX)
2255       return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt];
2256     if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0)
2257       return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt];
2258   }
2259   return 0;
2260 }
2261 #endif  // CONFIG_LGT_FROM_PRED
2262 
2263 // TODO(angiebird): use this function whenever it's possible
2264 int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
2265                      const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
2266                      TX_SIZE tx_size, TX_TYPE tx_type) {
2267   if (plane > 0) return 0;
2268 
2269 #if CONFIG_LGT_FROM_PRED
2270   assert(!xd->mi[0]->mbmi.use_lgt);
2271 #endif
2272 #if CONFIG_VAR_TX
2273   tx_size = get_min_tx_size(tx_size);
2274 #endif
2275 
2276   const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2277   const int is_inter = is_inter_block(mbmi);
2278 #if CONFIG_EXT_TX
2279   if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
2280       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
2281     const int ext_tx_set =
2282         get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
2283     if (is_inter) {
2284       if (ext_tx_set > 0)
2285         return x
2286             ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
2287     } else {
2288       if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
2289         return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
2290                                      [mbmi->mode][tx_type];
2291     }
2292   }
2293 #else
2294   (void)bsize;
2295   (void)cm;
2296   if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
2297       !FIXED_TX_TYPE) {
2298     if (is_inter) {
2299       return x->inter_tx_type_costs[tx_size][tx_type];
2300     } else {
2301       return x->intra_tx_type_costs[tx_size]
2302                                    [intra_mode_to_tx_type_context[mbmi->mode]]
2303                                    [tx_type];
2304     }
2305   }
2306 #endif  // CONFIG_EXT_TX
2307   return 0;
2308 }
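// Computes the luma RD cost of a given (tx_type, tx_size) pair, folding in
// the tx_size and tx_type signaling costs and the skip flag, and taking the
// cheaper of coding the residual or skipping it for inter blocks.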
2309 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
2310                         RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
2311                         TX_TYPE tx_type, TX_SIZE tx_size) {
2312   const AV1_COMMON *const cm = &cpi->common;
2313   MACROBLOCKD *const xd = &x->e_mbd;
2314   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2315   int64_t rd = INT64_MAX;
2316   aom_prob skip_prob = av1_get_skip_prob(cm, xd);
2317   int s0, s1;
2318   const int is_inter = is_inter_block(mbmi);
2319   const int tx_select =
2320       cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
2321 
2322   const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size);
2323 
2324 #if CONFIG_PVQ
2325   assert(tx_size >= TX_4X4);
2326 #endif  // CONFIG_PVQ
2327   assert(skip_prob > 0);
2328 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2329   assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
2330 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
2331 
2332   s0 = av1_cost_bit(skip_prob, 0);
2333   s1 = av1_cost_bit(skip_prob, 1);
2334 
2335   mbmi->tx_type = tx_type;
2336   mbmi->tx_size = tx_size;
2337   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size,
2338                    cpi->sf.use_fast_coef_costing);
2339   if (rd_stats->rate == INT_MAX) return INT64_MAX;
2340 #if !CONFIG_TXK_SEL
2341   int plane = 0;
2342 #if CONFIG_LGT_FROM_PRED
2343   if (is_lgt_allowed(mbmi->mode, tx_size))
2344     rd_stats->rate +=
2345         av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt);
2346   if (!mbmi->use_lgt)
2347     rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
2348 #else
2349   rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
2350 #endif  // CONFIG_LGT_FROM_PRED
2351 #endif
2352 
2353   if (rd_stats->skip) {
2354     if (is_inter) {
2355       rd = RDCOST(x->rdmult, s1, rd_stats->sse);
2356     } else {
2357       rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse);
2358     }
2359   } else {
2360     rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select,
2361                 rd_stats->dist);
2362   }
2363 
2364   if (tx_select) rd_stats->rate += r_tx_size;
2365 
2366   if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
2367       !(rd_stats->skip))
2368     rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
2369 
2370   return rd;
2371 }
2372 
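// Returns 1 if this (tx_type, tx_size) combination can be excluded from the
// search, based on speed-feature pruning and per-config restrictions on the
// available transforms.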
2373 static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
2374                             TX_TYPE tx_type, TX_SIZE tx_size) {
2375   const MACROBLOCKD *const xd = &x->e_mbd;
2376   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2377   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
2378   const int is_inter = is_inter_block(mbmi);
2379   int prune = 0;
2380   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
2381     // Passing -1 for tx_type indicates that all 1D transforms should be
2382     // considered for pruning.
2383     prune = prune_tx_types(cpi, bs, x, xd, -1);
2384 
2385 #if CONFIG_MRC_TX
2386   // MRC_DCT is only implemented for TX_32X32, so include this tx type in
2387   // the search only for TX_32X32.
2388   if (tx_type == MRC_DCT &&
2389       ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) ||
2390        tx_size != TX_32X32))
2391     return 1;
2392 #endif  // CONFIG_MRC_TX
2393 #if CONFIG_LGT_FROM_PRED
2394   if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1;
2395 #endif  // CONFIG_LGT_FROM_PRED
2396   if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
2397   if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
2398     return 1;
2399   if (!is_inter && x->use_default_intra_tx_type &&
2400       tx_type != get_default_tx_type(0, xd, 0, tx_size))
2401     return 1;
2402   if (is_inter && x->use_default_inter_tx_type &&
2403       tx_type != get_default_tx_type(0, xd, 0, tx_size))
2404     return 1;
2405   if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
2406 #if CONFIG_EXT_TX
2407   const AV1_COMMON *const cm = &cpi->common;
2408   const TxSetType tx_set_type =
2409       get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
2410   if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1;
2411   if (is_inter) {
2412     if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
2413       if (!do_tx_type_search(tx_type, prune)) return 1;
2414     }
2415   } else {
2416     if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
2417       if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
2418     }
2419   }
2420 #else   // CONFIG_EXT_TX
2421   if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
2422   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
2423       !do_tx_type_search(tx_type, prune))
2424     return 1;
2425 #endif  // CONFIG_EXT_TX
2426   return 0;
2427 }
2428 
2429 #if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
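// Cheap luma RD estimate that uses DCT_DCT at the largest transform size,
// allowing the wedge / compound-segment / interintra searches gated above to
// compare candidates without a full transform search.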
2430 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
2431                                    MACROBLOCK *x, int *r, int64_t *d, int *s,
2432                                    int64_t *sse, int64_t ref_best_rd) {
2433   RD_STATS rd_stats;
2434   int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT,
2435                         max_txsize_lookup[bs]);
2436   *r = rd_stats.rate;
2437   *d = rd_stats.dist;
2438   *s = rd_stats.skip;
2439   *sse = rd_stats.sse;
2440   return rd;
2441 }
2442 #endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
2443 
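// When the transform size is not signaled per block, pick the largest tx
// size the tx_mode allows and search only the tx type for it.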
2444 static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
2445                                    RD_STATS *rd_stats, int64_t ref_best_rd,
2446                                    BLOCK_SIZE bs) {
2447   const AV1_COMMON *const cm = &cpi->common;
2448   MACROBLOCKD *const xd = &x->e_mbd;
2449   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2450   TX_TYPE tx_type, best_tx_type = DCT_DCT;
2451   int64_t this_rd, best_rd = INT64_MAX;
2452   aom_prob skip_prob = av1_get_skip_prob(cm, xd);
2453   int s0 = av1_cost_bit(skip_prob, 0);
2454   int s1 = av1_cost_bit(skip_prob, 1);
2455   const int is_inter = is_inter_block(mbmi);
2456   int prune = 0;
2457   const int plane = 0;
2458 #if CONFIG_LGT_FROM_PRED
2459   int is_lgt_best = 0;
2460   int search_lgt = is_inter
2461                        ? LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type &&
2462                              !(cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
2463                        : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type &&
2464                              ALLOW_INTRA_EXT_TX;
2465 #endif  // CONFIG_LGT_FROM_PRED
2466   av1_invalid_rd_stats(rd_stats);
2467 
2468   mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2469 #if CONFIG_VAR_TX
2470   mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
2471 #endif  // CONFIG_VAR_TX
2472 #if CONFIG_EXT_TX
2473   int ext_tx_set =
2474       get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
2475   const TxSetType tx_set_type =
2476       get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
2477 #endif  // CONFIG_EXT_TX
2478 
2479   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
2480 #if CONFIG_EXT_TX
2481     prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
2482 #else
2483     prune = prune_tx_types(cpi, bs, x, xd, 0);
2484 #endif  // CONFIG_EXT_TX
2485 #if CONFIG_EXT_TX
2486   if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) >
2487           1 &&
2488       !xd->lossless[mbmi->segment_id]) {
2489 #if CONFIG_PVQ
2490     od_rollback_buffer pre_buf, post_buf;
2491 
2492     od_encode_checkpoint(&x->daala_enc, &pre_buf);
2493     od_encode_checkpoint(&x->daala_enc, &post_buf);
2494 #endif  // CONFIG_PVQ
2495 
2496     for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
2497       if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
2498       RD_STATS this_rd_stats;
2499       if (is_inter) {
2500         if (x->use_default_inter_tx_type &&
2501             tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2502           continue;
2503         if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
2504           if (!do_tx_type_search(tx_type, prune)) continue;
2505         }
2506       } else {
2507         if (x->use_default_intra_tx_type &&
2508             tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2509           continue;
2510         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
2511           if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
2512         }
2513       }
2514 
2515       mbmi->tx_type = tx_type;
2516 
2517       txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
2518                        mbmi->tx_size, cpi->sf.use_fast_coef_costing);
2519 #if CONFIG_PVQ
2520       od_encode_rollback(&x->daala_enc, &pre_buf);
2521 #endif  // CONFIG_PVQ
2522       if (this_rd_stats.rate == INT_MAX) continue;
2523       av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
2524 
2525       if (this_rd_stats.skip)
2526         this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
2527       else
2528         this_rd =
2529             RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
2530       if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
2531           !this_rd_stats.skip)
2532         this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
2533 
2534       if (this_rd < best_rd) {
2535         best_rd = this_rd;
2536         best_tx_type = mbmi->tx_type;
2537         *rd_stats = this_rd_stats;
2538 #if CONFIG_PVQ
2539         od_encode_checkpoint(&x->daala_enc, &post_buf);
2540 #endif  // CONFIG_PVQ
2541       }
2542     }
2543 #if CONFIG_PVQ
2544     od_encode_rollback(&x->daala_enc, &post_buf);
2545 #endif  // CONFIG_PVQ
2546 #if CONFIG_LGT_FROM_PRED
2547     // search LGT
2548     if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) &&
2549         !cm->reduced_tx_set_used) {
2550       RD_STATS this_rd_stats;
2551       mbmi->use_lgt = 1;
2552       txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
2553                        mbmi->tx_size, cpi->sf.use_fast_coef_costing);
2554       if (this_rd_stats.rate != INT_MAX) {
2555         av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1);
2556         if (this_rd_stats.skip)
2557           this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
2558         else
2559           this_rd =
2560               RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
2561         if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
2562             !this_rd_stats.skip)
2563           this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
2564         if (this_rd < best_rd) {
2565           best_rd = this_rd;
2566           is_lgt_best = 1;
2567           *rd_stats = this_rd_stats;
2568         }
2569       }
2570       mbmi->use_lgt = 0;
2571     }
2572 #endif  // CONFIG_LGT_FROM_PRED
2573   } else {
2574     mbmi->tx_type = DCT_DCT;
2575     txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2576                      cpi->sf.use_fast_coef_costing);
2577   }
2578 #else   // CONFIG_EXT_TX
2579   if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
2580     for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
2581       RD_STATS this_rd_stats;
2582       if (!is_inter && x->use_default_intra_tx_type &&
2583           tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2584         continue;
2585       if (is_inter && x->use_default_inter_tx_type &&
2586           tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2587         continue;
2588       mbmi->tx_type = tx_type;
2589       txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
2590                        mbmi->tx_size, cpi->sf.use_fast_coef_costing);
2591       if (this_rd_stats.rate == INT_MAX) continue;
2592 
2593       av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
2594       if (is_inter) {
2595         if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
2596             !do_tx_type_search(tx_type, prune))
2597           continue;
2598       }
2599       if (this_rd_stats.skip)
2600         this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
2601       else
2602         this_rd =
2603             RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
2604       if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
2605         this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
2606 
2607       if (this_rd < best_rd) {
2608         best_rd = this_rd;
2609         best_tx_type = mbmi->tx_type;
2610         *rd_stats = this_rd_stats;
2611       }
2612     }
2613   } else {
2614     mbmi->tx_type = DCT_DCT;
2615     txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2616                      cpi->sf.use_fast_coef_costing);
2617   }
2618 #endif  // CONFIG_EXT_TX
2619   mbmi->tx_type = best_tx_type;
2620 #if CONFIG_LGT_FROM_PRED
2621   mbmi->use_lgt = is_lgt_best;
2622 #endif  // CONFIG_LGT_FROM_PRED
2623 }
2624 
2625 static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
2626                                     RD_STATS *rd_stats, int64_t ref_best_rd,
2627                                     BLOCK_SIZE bs) {
2628   MACROBLOCKD *const xd = &x->e_mbd;
2629   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2630 
2631   mbmi->tx_size = TX_4X4;
2632   mbmi->tx_type = DCT_DCT;
2633 #if CONFIG_VAR_TX
2634   mbmi->min_tx_size = get_min_tx_size(TX_4X4);
2635 #endif  // CONFIG_VAR_TX
2636 
2637   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2638                    cpi->sf.use_fast_coef_costing);
2639 }
2640 
2641 #if CONFIG_TXK_SEL || CONFIG_VAR_TX
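// Number of 4x4 transform units covered by 'bsize'. As a worked example
// (assuming the usual lookup-table values), BLOCK_16X16 has
// num_pels_log2_lookup = 8 and tx_size_wide_log2[0] = 2, giving
// 1 << (8 - 2 * 2) = 16 blocks.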
2642 static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
2643   int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
2644   return num_blk;
2645 }
2646 #endif  // CONFIG_TXK_SEL || CONFIG_VAR_TX
2647 
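// Full rate-distortion search over transform sizes and transform types for a
// luma block: rectangular (and, when enabled, quarter-size) transforms are
// tried first, then square sizes from the largest allowed downwards, each
// paired with every permitted tx_type. The winning (tx_size, tx_type) pair is
// written back into the mbmi.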
2648 static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
2649                                         MACROBLOCK *x, RD_STATS *rd_stats,
2650                                         int64_t ref_best_rd, BLOCK_SIZE bs) {
2651   const AV1_COMMON *const cm = &cpi->common;
2652   MACROBLOCKD *const xd = &x->e_mbd;
2653   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2654   int64_t rd = INT64_MAX;
2655   int n;
2656   int start_tx, end_tx;
2657   int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
2658   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
2659   TX_SIZE best_tx_size = max_tx_size;
2660   TX_TYPE best_tx_type = DCT_DCT;
2661 #if CONFIG_LGT_FROM_PRED
2662   int breakout = 0;
2663   int is_lgt_best = 0;
2664   mbmi->use_lgt = 0;
2665 #endif  // CONFIG_LGT_FROM_PRED
2666 #if CONFIG_TXK_SEL
2667   TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
  const int num_blk = bsize_to_num_blk(bs);
2668 #endif  // CONFIG_TXK_SEL
2669   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
2670   const int is_inter = is_inter_block(mbmi);
2671 #if CONFIG_PVQ
2672   od_rollback_buffer buf;
2673   od_encode_checkpoint(&x->daala_enc, &buf);
2674 #endif  // CONFIG_PVQ
2675 
2676   av1_invalid_rd_stats(rd_stats);
2677 
2678 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2679   int evaluate_rect_tx = 0;
2680   if (tx_select) {
2681     evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
2682   } else {
2683     const TX_SIZE chosen_tx_size =
2684         tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2685     evaluate_rect_tx = is_rect_tx(chosen_tx_size);
2686     assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
2687   }
2688   if (evaluate_rect_tx) {
2689     TX_TYPE tx_start = DCT_DCT;
2690     TX_TYPE tx_end = TX_TYPES;
2691 #if CONFIG_TXK_SEL
2692     // The tx_type here becomes a dummy when lv_map is on; the tx_type
2693     // search is performed in av1_search_txk_type() instead.
2694     tx_end = DCT_DCT + 1;
2695 #endif
2696     TX_TYPE tx_type;
2697     for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2698       if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
2699       const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
2700       RD_STATS this_rd_stats;
2701       const TxSetType tx_set_type = get_ext_tx_set_type(
2702           rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
2703       if (av1_ext_tx_used[tx_set_type][tx_type]) {
2704         rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
2705                       rect_tx_size);
2706         ref_best_rd = AOMMIN(rd, ref_best_rd);
2707         if (rd < best_rd) {
2708 #if CONFIG_TXK_SEL
2709           memcpy(best_txk_type, mbmi->txk_type,
                 sizeof(best_txk_type[0]) * num_blk);
2710 #endif
2711           best_tx_type = tx_type;
2712           best_tx_size = rect_tx_size;
2713           best_rd = rd;
2714           *rd_stats = this_rd_stats;
2715         }
2716       }
2717 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2719       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2720 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2721     }
2722 #if CONFIG_LGT_FROM_PRED
2723     const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
2724     if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) {
2725       RD_STATS this_rd_stats;
2726       mbmi->use_lgt = 1;
2727       rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size);
2728       if (rd < best_rd) {
2729         is_lgt_best = 1;
2730         best_tx_size = rect_tx_size;
2731         best_rd = rd;
2732         *rd_stats = this_rd_stats;
2733       }
2734       mbmi->use_lgt = 0;
2735     }
2736 #endif  // CONFIG_LGT_FROM_PRED
2737   }
2738 
2739 #if CONFIG_RECT_TX_EXT
2740   // Test 1:4 / 4:1 transform sizes.
2741   int evaluate_quarter_tx = 0;
2742   if (is_quarter_tx_allowed(xd, mbmi, is_inter)) {
2743     if (tx_select) {
2744       evaluate_quarter_tx = 1;
2745     } else {
2746       const TX_SIZE chosen_tx_size =
2747           tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2748       evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs];
2749     }
2750   }
2751   if (evaluate_quarter_tx) {
2752     TX_TYPE tx_start = DCT_DCT;
2753     TX_TYPE tx_end = TX_TYPES;
2754 #if CONFIG_TXK_SEL
2755     // The tx_type here becomes a dummy when lv_map is on; the tx_type
2756     // search is performed in av1_search_txk_type() instead.
2757     tx_end = DCT_DCT + 1;
2758 #endif
2759     TX_TYPE tx_type;
2760     for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2761       if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
2762       const TX_SIZE tx_size = quarter_txsize_lookup[bs];
2763       RD_STATS this_rd_stats;
2764       const TxSetType tx_set_type =
2765           get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
2766       if (av1_ext_tx_used[tx_set_type][tx_type]) {
2767         rd =
2768             txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size);
2769         if (rd < best_rd) {
2770 #if CONFIG_TXK_SEL
2771           memcpy(best_txk_type, mbmi->txk_type,
2772                  sizeof(best_txk_type[0]) * num_blk);
2773 #endif
2774           best_tx_type = tx_type;
2775 #if CONFIG_LGT_FROM_PRED
2776           is_lgt_best = 0;
2777 #endif
2778           best_tx_size = tx_size;
2779           best_rd = rd;
2780           *rd_stats = this_rd_stats;
2781         }
2782       }
2783 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2785       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2786 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2787     }
2788 #if CONFIG_LGT_FROM_PRED
2789     const TX_SIZE tx_size = quarter_txsize_lookup[bs];
2790     if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) {
2791       RD_STATS this_rd_stats;
2792       mbmi->use_lgt = 1;
2793       rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size);
2794       if (rd < best_rd) {
2795         is_lgt_best = 1;
2796         best_tx_size = tx_size;
2797         best_rd = rd;
2798         *rd_stats = this_rd_stats;
2799       }
2800       mbmi->use_lgt = 0;
2801     }
2802 #endif  // CONFIG_LGT_FROM_PRED
2803   }
2804 #endif  // CONFIG_RECT_TX_EXT
2805 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
2806 
2807   if (tx_select) {
2808     start_tx = max_tx_size;
2809     end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4;
2810   } else {
2811     const TX_SIZE chosen_tx_size =
2812         tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2813     start_tx = chosen_tx_size;
2814     end_tx = chosen_tx_size;
2815   }
2816 
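  // When per-block size selection is allowed, square transform sizes are
  // searched from the largest allowed down to TX_8X8 (or down to TX_4X4 when
  // the largest transform is below 32x32); otherwise only the size implied by
  // the frame-level tx_mode is evaluated.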
2817   last_rd = INT64_MAX;
2818   for (n = start_tx; n >= end_tx; --n) {
2819 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2820     if (is_rect_tx(n)) break;
2821 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
2822     TX_TYPE tx_start = DCT_DCT;
2823     TX_TYPE tx_end = TX_TYPES;
2824 #if CONFIG_TXK_SEL
2825     // The tx_type here becomes a dummy when lv_map is on; the tx_type
2826     // search is performed in av1_search_txk_type() instead.
2827     tx_end = DCT_DCT + 1;
2828 #endif
2829     TX_TYPE tx_type;
2830     for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2831       RD_STATS this_rd_stats;
2832       if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue;
2833       rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n);
2834 #if CONFIG_PVQ
2835       od_encode_rollback(&x->daala_enc, &buf);
2836 #endif  // CONFIG_PVQ
2837       // Early termination in transform size search.
2838       if (cpi->sf.tx_size_search_breakout &&
2839           (rd == INT64_MAX ||
2840            (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
2841            (n < (int)max_tx_size && rd > last_rd))) {
2842 #if CONFIG_LGT_FROM_PRED
2843         breakout = 1;
2844 #endif
2845         break;
2846       }
2847 
2848       last_rd = rd;
2849       ref_best_rd = AOMMIN(rd, ref_best_rd);
2850       if (rd < best_rd) {
2851 #if CONFIG_TXK_SEL
2852         memcpy(best_txk_type, mbmi->txk_type,
               sizeof(best_txk_type[0]) * num_blk);
2853 #endif
2854         best_tx_type = tx_type;
2855 #if CONFIG_LGT_FROM_PRED
2856         is_lgt_best = 0;
2857 #endif
2858         best_tx_size = n;
2859         best_rd = rd;
2860         *rd_stats = this_rd_stats;
2861       }
2862 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2864       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2865 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2866     }
2867 #if CONFIG_LGT_FROM_PRED
2868     mbmi->use_lgt = 1;
2869     if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) &&
2870         !breakout) {
2871       RD_STATS this_rd_stats;
2872       rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n);
2873       if (rd < best_rd) {
2874         is_lgt_best = 1;
2875         best_tx_size = n;
2876         best_rd = rd;
2877         *rd_stats = this_rd_stats;
2878       }
2879     }
2880     mbmi->use_lgt = 0;
2881 #endif  // CONFIG_LGT_FROM_PRED
2882   }
2883   mbmi->tx_size = best_tx_size;
2884   mbmi->tx_type = best_tx_type;
2885 #if CONFIG_LGT_FROM_PRED
2886   mbmi->use_lgt = is_lgt_best;
2887   assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, mbmi->tx_size));
2888 #endif  // CONFIG_LGT_FROM_PRED
2889 #if CONFIG_TXK_SEL
2890   memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk);
2891 #endif
2892 
2893 #if CONFIG_VAR_TX
2894   mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
2895 #endif  // CONFIG_VAR_TX
2896 
2897 #if !CONFIG_EXT_TX
2898   if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
2899 #endif  // !CONFIG_EXT_TX
2900 #if CONFIG_PVQ
2901   if (best_rd != INT64_MAX) {
2902     txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size);
2903   }
2904 #endif  // CONFIG_PVQ
2905 }
2906 
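// Top-level luma transform search: lossless segments are pinned to TX_4X4,
// the USE_LARGESTALL speed feature goes straight to the largest transform
// size, and otherwise the full size/type RD search above is run.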
2907 static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
2908                             RD_STATS *rd_stats, BLOCK_SIZE bs,
2909                             int64_t ref_best_rd) {
2910   MACROBLOCKD *xd = &x->e_mbd;
2911   av1_init_rd_stats(rd_stats);
2912 
2913   assert(bs == xd->mi[0]->mbmi.sb_type);
2914 
2915   if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
2916     choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
2917   } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
2918     choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
2919   } else {
2920     choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
2921   }
2922 }
2923 
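// Prune oblique intra modes that are unlikely to win: a diagonal mode (e.g.
// D117_PRED, which lies between V_PRED and D135_PRED) is only worth evaluating
// when the best mode so far is one of its two neighboring directions.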
2924 static int conditional_skipintra(PREDICTION_MODE mode,
2925                                  PREDICTION_MODE best_intra_mode) {
2926   if (mode == D117_PRED && best_intra_mode != V_PRED &&
2927       best_intra_mode != D135_PRED)
2928     return 1;
2929   if (mode == D63_PRED && best_intra_mode != V_PRED &&
2930       best_intra_mode != D45_PRED)
2931     return 1;
2932   if (mode == D207_PRED && best_intra_mode != H_PRED &&
2933       best_intra_mode != D45_PRED)
2934     return 1;
2935   if (mode == D153_PRED && best_intra_mode != H_PRED &&
2936       best_intra_mode != D135_PRED)
2937     return 1;
2938   return 0;
2939 }
2940 
2941 // Model-based RD estimation for luma intra blocks.
2942 static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
2943                                BLOCK_SIZE bsize, int mode_cost) {
2944   const AV1_COMMON *cm = &cpi->common;
2945   MACROBLOCKD *const xd = &x->e_mbd;
2946   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2947   assert(!is_inter_block(mbmi));
2948   RD_STATS this_rd_stats;
2949   int row, col;
2950   int64_t temp_sse, this_rd;
2951   const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0);
2952   const int stepr = tx_size_high_unit[tx_size];
2953   const int stepc = tx_size_wide_unit[tx_size];
2954   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
2955   const int max_blocks_high = max_block_high(xd, bsize, 0);
2956   mbmi->tx_size = tx_size;
2957   // Prediction.
2958   const int step = stepr * stepc;
2959   int block = 0;
2960   for (row = 0; row < max_blocks_high; row += stepr) {
2961     for (col = 0; col < max_blocks_wide; col += stepc) {
2962       av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size);
2963       block += step;
2964     }
2965   }
2966   // RD estimation.
2967   model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
2968                   &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse);
2969 #if CONFIG_EXT_INTRA
2970   if (av1_is_directional_mode(mbmi->mode, bsize) &&
2971       av1_use_angle_delta(bsize)) {
2972     mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
2973                                     MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
2974   }
2975 #endif  // CONFIG_EXT_INTRA
2976 #if CONFIG_FILTER_INTRA
2977   if (mbmi->mode == DC_PRED) {
2978     const aom_prob prob = cpi->common.fc->filter_intra_probs[0];
2979     if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
2980       const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0];
2981       mode_cost += (av1_cost_bit(prob, 1) +
2982                     write_uniform_cost(FILTER_INTRA_MODES, mode));
2983     } else {
2984       mode_cost += av1_cost_bit(prob, 0);
2985     }
2986   }
2987 #endif  // CONFIG_FILTER_INTRA
2988   this_rd =
2989       RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
2990   return this_rd;
2991 }
2992 
2993 // Extends the 'color_map' array from 'orig_width x orig_height' to
2994 // 'new_width x new_height'. Extra rows and columns are filled in by copying
2995 // the last valid row/column.
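// For example, a hypothetical 2x2 -> 4x3 extension (entries are palette
// indices):
//   a b        a b b b
//   c d   ->   c d d d
//              c d d d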
2996 static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
2997                                      int orig_height, int new_width,
2998                                      int new_height) {
2999   int j;
3000   assert(new_width >= orig_width);
3001   assert(new_height >= orig_height);
3002   if (new_width == orig_width && new_height == orig_height) return;
3003 
3004   for (j = orig_height - 1; j >= 0; --j) {
3005     memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
3006     // Copy last column to extra columns.
3007     memset(color_map + j * new_width + orig_width,
3008            color_map[j * new_width + orig_width - 1], new_width - orig_width);
3009   }
3010   // Copy last row to extra rows.
3011   for (j = orig_height; j < new_height; ++j) {
3012     memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
3013            new_width);
3014   }
3015 }
3016 
3017 #if CONFIG_PALETTE_DELTA_ENCODING
3018 // Bias toward using colors in the cache.
3019 // TODO(huisu): Try other schemes to improve compression.
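// As a worked example (hypothetical values): with color_cache = { 60, 128 }
// and a centroid of 127.4, the nearest cache entry is 128 at distance 0.6,
// which is below the 1.5 threshold, so the centroid is snapped to 128 and can
// later be signaled cheaply as a cached color.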
3020 static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
3021                                     int n_colors, int stride,
3022                                     float *centroids) {
3023   if (n_cache <= 0) return;
3024   for (int i = 0; i < n_colors * stride; i += stride) {
3025     float min_diff = fabsf(centroids[i] - color_cache[0]);
3026     int idx = 0;
3027     for (int j = 1; j < n_cache; ++j) {
3028       float this_diff = fabsf(centroids[i] - color_cache[j]);
3029       if (this_diff < min_diff) {
3030         min_diff = this_diff;
3031         idx = j;
3032       }
3033     }
3034     if (min_diff < 1.5) centroids[i] = color_cache[idx];
3035   }
3036 }
3037 #endif  // CONFIG_PALETTE_DELTA_ENCODING
3038 
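// Palette search for a luma block: count the distinct source colors, then for
// each candidate palette size n (from min(colors, PALETTE_MAX_SIZE) down to 2)
// run k-means on the pixel values, cost the resulting palette and color map,
// and keep the candidate whose total RD cost beats the best found so far.
// Returns the rate overhead of the chosen palette (0 if none is chosen).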
3039 static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
3040                                      BLOCK_SIZE bsize, int palette_ctx,
3041                                      int dc_mode_cost, MB_MODE_INFO *best_mbmi,
3042                                      uint8_t *best_palette_color_map,
3043                                      int64_t *best_rd, int64_t *best_model_rd,
3044                                      int *rate, int *rate_tokenonly,
3045                                      int64_t *distortion, int *skippable) {
3046   int rate_overhead = 0;
3047   MACROBLOCKD *const xd = &x->e_mbd;
3048   MODE_INFO *const mic = xd->mi[0];
3049   MB_MODE_INFO *const mbmi = &mic->mbmi;
3050   assert(!is_inter_block(mbmi));
3051   assert(bsize >= BLOCK_8X8);
3052   int this_rate, colors, n;
3053   const int src_stride = x->plane[0].src.stride;
3054   const uint8_t *const src = x->plane[0].src.buf;
3055   uint8_t *const color_map = xd->plane[0].color_index_map;
3056   int block_width, block_height, rows, cols;
3057   av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
3058                            &cols);
3059 
3060   assert(cpi->common.allow_screen_content_tools);
3061 
3062 #if CONFIG_HIGHBITDEPTH
3063   if (cpi->common.use_highbitdepth)
3064     colors = av1_count_colors_highbd(src, src_stride, rows, cols,
3065                                      cpi->common.bit_depth);
3066   else
3067 #endif  // CONFIG_HIGHBITDEPTH
3068     colors = av1_count_colors(src, src_stride, rows, cols);
3069 #if CONFIG_FILTER_INTRA
3070   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
3071 #endif  // CONFIG_FILTER_INTRA
3072 
3073   if (colors > 1 && colors <= 64) {
3074     int r, c, i, k, palette_mode_cost;
3075     const int max_itr = 50;
3076     float *const data = x->palette_buffer->kmeans_data_buf;
3077     float centroids[PALETTE_MAX_SIZE];
3078     float lb, ub, val;
3079     RD_STATS tokenonly_rd_stats;
3080     int64_t this_rd, this_model_rd;
3081     PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
3082 #if CONFIG_HIGHBITDEPTH
3083     uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
3084     if (cpi->common.use_highbitdepth)
3085       lb = ub = src16[0];
3086     else
3087 #endif  // CONFIG_HIGHBITDEPTH
3088       lb = ub = src[0];
3089 
3090 #if CONFIG_HIGHBITDEPTH
3091     if (cpi->common.use_highbitdepth) {
3092       for (r = 0; r < rows; ++r) {
3093         for (c = 0; c < cols; ++c) {
3094           val = src16[r * src_stride + c];
3095           data[r * cols + c] = val;
3096           if (val < lb)
3097             lb = val;
3098           else if (val > ub)
3099             ub = val;
3100         }
3101       }
3102     } else {
3103 #endif  // CONFIG_HIGHBITDEPTH
3104       for (r = 0; r < rows; ++r) {
3105         for (c = 0; c < cols; ++c) {
3106           val = src[r * src_stride + c];
3107           data[r * cols + c] = val;
3108           if (val < lb)
3109             lb = val;
3110           else if (val > ub)
3111             ub = val;
3112         }
3113       }
3114 #if CONFIG_HIGHBITDEPTH
3115     }
3116 #endif  // CONFIG_HIGHBITDEPTH
3117 
3118     mbmi->mode = DC_PRED;
3119 #if CONFIG_FILTER_INTRA
3120     mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
3121 #endif  // CONFIG_FILTER_INTRA
3122 
3123     if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
3124 
3125 #if CONFIG_PALETTE_DELTA_ENCODING
3126     uint16_t color_cache[2 * PALETTE_MAX_SIZE];
3127     const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
3128 #endif  // CONFIG_PALETTE_DELTA_ENCODING
3129 
3130     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
3131          --n) {
3132       if (colors == PALETTE_MIN_SIZE) {
3133         // Special case: These colors automatically become the centroids.
3134         assert(colors == n);
3135         assert(colors == 2);
3136         centroids[0] = lb;
3137         centroids[1] = ub;
3138         k = 2;
3139       } else {
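        // Seed the k-means centroids at the midpoints of n equal-width bins
        // spanning [lb, ub]:
        //   centroids[i] = lb + (2 * i + 1) * (ub - lb) / (2 * n).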
3140         for (i = 0; i < n; ++i) {
3141           centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
3142         }
3143         av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
3144 #if CONFIG_PALETTE_DELTA_ENCODING
3145         optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
3146 #endif  // CONFIG_PALETTE_DELTA_ENCODING
3147         k = av1_remove_duplicates(centroids, n);
3148         if (k < PALETTE_MIN_SIZE) {
3149           // Too few unique colors to create a palette, and DC_PRED will
3150           // work well for that case anyway, so skip this palette size.
3151           continue;
3152         }
3153       }
3154 
3155 #if CONFIG_HIGHBITDEPTH
3156       if (cpi->common.use_highbitdepth)
3157         for (i = 0; i < k; ++i)
3158           pmi->palette_colors[i] =
3159               clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
3160       else
3161 #endif  // CONFIG_HIGHBITDEPTH
3162         for (i = 0; i < k; ++i)
3163           pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
3164       pmi->palette_size[0] = k;
3165 
3166       av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
3167       extend_palette_color_map(color_map, cols, rows, block_width,
3168                                block_height);
3169       palette_mode_cost =
3170           dc_mode_cost +
3171           x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
3172           write_uniform_cost(k, color_map[0]) +
3173           av1_cost_bit(
3174               av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
3175               1);
3176       palette_mode_cost += av1_palette_color_cost_y(pmi,
3177 #if CONFIG_PALETTE_DELTA_ENCODING
3178                                                     color_cache, n_cache,
3179 #endif  // CONFIG_PALETTE_DELTA_ENCODING
3180                                                     cpi->common.bit_depth);
3181       palette_mode_cost +=
3182           av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP);
3183       this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
3184       if (*best_model_rd != INT64_MAX &&
3185           this_model_rd > *best_model_rd + (*best_model_rd >> 1))
3186         continue;
3187       if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
3188       super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
3189       if (tokenonly_rd_stats.rate == INT_MAX) continue;
3190       this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
3191       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
3192       if (!xd->lossless[mbmi->segment_id] &&
3193           block_signals_txsize(mbmi->sb_type)) {
3194         tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
3195       }
3196       if (this_rd < *best_rd) {
3197         *best_rd = this_rd;
3198         memcpy(best_palette_color_map, color_map,
3199                block_width * block_height * sizeof(color_map[0]));
3200         *best_mbmi = *mbmi;
3201         rate_overhead = this_rate - tokenonly_rd_stats.rate;
3202         if (rate) *rate = this_rate;
3203         if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
3204         if (distortion) *distortion = tokenonly_rd_stats.dist;
3205         if (skippable) *skippable = tokenonly_rd_stats.skip;
3206       }
3207     }
3208   }
3209 
3210   if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
3211     memcpy(color_map, best_palette_color_map,
3212            block_width * block_height * sizeof(best_palette_color_map[0]));
3213   }
3214   *mbmi = *best_mbmi;
3215   return rate_overhead;
3216 }
3217 
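// RD mode search for one sub-8x8 prediction block: each intra mode from
// DC_PRED to TM_PRED is predicted, transformed, and costed (with separate
// high-bitdepth and low-bitdepth paths), and the reconstruction of the best
// mode is kept in a small local buffer so it can be restored at the end.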
3218 static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
3219     const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
3220     PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
3221     ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
3222     BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) {
3223   const AV1_COMMON *const cm = &cpi->common;
3224   PREDICTION_MODE mode;
3225   MACROBLOCKD *const xd = &x->e_mbd;
3226   assert(!is_inter_block(&xd->mi[0]->mbmi));
3227   int64_t best_rd = rd_thresh;
3228   struct macroblock_plane *p = &x->plane[0];
3229   struct macroblockd_plane *pd = &xd->plane[0];
3230   const int src_stride = p->src.stride;
3231   const int dst_stride = pd->dst.stride;
3232   const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
3233   uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
3234 #if CONFIG_CHROMA_2X2
3235   // TODO(jingning): This is a temporary change. The whole function should
3236   // be removed when cb4x4 is enabled.
3237   ENTROPY_CONTEXT ta[4], tempa[4];
3238   ENTROPY_CONTEXT tl[4], templ[4];
3239 #else
3240   ENTROPY_CONTEXT ta[2], tempa[2];
3241   ENTROPY_CONTEXT tl[2], templ[2];
3242 #endif  // CONFIG_CHROMA_2X2
3243 
3244   const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
3245   const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
3246   const int tx_width_unit = tx_size_wide_unit[tx_size];
3247   const int tx_height_unit = tx_size_high_unit[tx_size];
3248   const int pred_block_width = block_size_wide[bsize];
3249   const int pred_block_height = block_size_high[bsize];
3250   const int tx_width = tx_size_wide[tx_size];
3251   const int tx_height = tx_size_high[tx_size];
3252   const int pred_width_in_transform_blocks = pred_block_width / tx_width;
3253   const int pred_height_in_transform_blocks = pred_block_height / tx_height;
3254   int idx, idy;
3255   int best_can_skip = 0;
3256   uint8_t best_dst[8 * 8];
3257 #if CONFIG_HIGHBITDEPTH
3258   uint16_t best_dst16[8 * 8];
3259 #endif  // CONFIG_HIGHBITDEPTH
3260   const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
3261 #if CONFIG_EXT_TX && CONFIG_RECT_TX
3262   const int sub_bsize = bsize;
3263 #else
3264   const int sub_bsize = BLOCK_4X4;
3265 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
3266 
3267 #if CONFIG_PVQ
3268   od_rollback_buffer pre_buf, post_buf;
3269   od_encode_checkpoint(&x->daala_enc, &pre_buf);
3270   od_encode_checkpoint(&x->daala_enc, &post_buf);
3271 #endif  // CONFIG_PVQ
3272 
3273   assert(bsize < BLOCK_8X8);
3274   assert(tx_width < 8 || tx_height < 8);
3275 #if CONFIG_EXT_TX && CONFIG_RECT_TX
3276   if (is_lossless)
3277     assert(tx_width == 4 && tx_height == 4);
3278   else
3279     assert(tx_width == pred_block_width && tx_height == pred_block_height);
3280 #else
3281   assert(tx_width == 4 && tx_height == 4);
3282 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
3283 
3284   memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0]));
3285   memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0]));
3286 
3287   xd->mi[0]->mbmi.tx_size = tx_size;
3288 
3289   xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
3290 
3291 #if CONFIG_HIGHBITDEPTH
3292   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
3293 #if CONFIG_PVQ
3294     od_encode_checkpoint(&x->daala_enc, &pre_buf);
3295 #endif
3296     for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
3297       int64_t this_rd;
3298       int ratey = 0;
3299       int64_t distortion = 0;
3300       int rate = bmode_costs[mode];
3301       int can_skip = 1;
3302 
3303       if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
3304             (1 << mode)))
3305         continue;
3306 
3307       // Only do the oblique modes if the best so far is
3308       // one of the neighboring directional modes
3309       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3310         if (conditional_skipintra(mode, *best_mode)) continue;
3311       }
3312 
3313       memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
3314       memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
3315 
3316       for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) {
3317         for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) {
3318           const int block_raster_idx = (row + idy) * 2 + (col + idx);
3319           const int block =
3320               av1_raster_order_to_block_index(tx_size, block_raster_idx);
3321           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
3322           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
3323 #if !CONFIG_PVQ
3324           int16_t *const src_diff = av1_raster_block_offset_int16(
3325               BLOCK_8X8, block_raster_idx, p->src_diff);
3326 #endif
3327           int skip;
3328           assert(block < 4);
3329           assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
3330                          idx == 0 && idy == 0));
3331           assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
3332                          block == 0 || block == 2));
3333           xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
3334           av1_predict_intra_block(
3335               cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode,
3336               dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0);
3337 #if !CONFIG_PVQ
3338           aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
3339                                     src_stride, dst, dst_stride, xd->bd);
3340 #endif
3341           if (is_lossless) {
3342             TX_TYPE tx_type =
3343                 av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
3344             const SCAN_ORDER *scan_order =
3345                 get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
3346             const int coeff_ctx =
3347                 combine_entropy_contexts(tempa[idx], templ[idy]);
3348 #if !CONFIG_PVQ
3349             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
3350                             tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
3351             ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size,
3352                                      scan_order, tempa + idx, templ + idy,
3353                                      cpi->sf.use_fast_coef_costing);
3354             skip = (p->eobs[block] == 0);
3355             can_skip &= skip;
3356             tempa[idx] = !skip;
3357             templ[idy] = !skip;
3358 #if CONFIG_EXT_TX
3359             if (tx_size == TX_8X4) {
3360               tempa[idx + 1] = tempa[idx];
3361             } else if (tx_size == TX_4X8) {
3362               templ[idy + 1] = templ[idy];
3363             }
3364 #endif  // CONFIG_EXT_TX
3365 #else
3366             (void)scan_order;
3367 
3368             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
3369                             tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
3370 
3371             ratey += x->rate;
3372             skip = x->pvq_skip[0];
3373             tempa[idx] = !skip;
3374             templ[idy] = !skip;
3375             can_skip &= skip;
3376 #endif
3377             if (RDCOST(x->rdmult, ratey, distortion) >= best_rd)
3378               goto next_highbd;
3379 #if CONFIG_PVQ
3380             if (!skip)
3381 #endif
3382               av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3383 #if CONFIG_LGT_FROM_PRED
3384                                           mode,
3385 #endif
3386 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3387                                           BLOCK_OFFSET(xd->mrc_mask, block),
3388 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3389                                           DCT_DCT, tx_size, dst, dst_stride,
3390                                           p->eobs[block]);
3391           } else {
3392             int64_t dist;
3393             unsigned int tmp;
3394             TX_TYPE tx_type =
3395                 av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
3396             const SCAN_ORDER *scan_order =
3397                 get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
3398             const int coeff_ctx =
3399                 combine_entropy_contexts(tempa[idx], templ[idy]);
3400 #if !CONFIG_PVQ
3401 #if DISABLE_TRELLISQ_SEARCH
3402             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
3403                             tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
3404 #else
3405             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
3406                             tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
3407             av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size,
3408                            tempa + idx, templ + idy, 1);
3409 #endif  // DISABLE_TRELLISQ_SEARCH
3410             ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size,
3411                                      scan_order, tempa + idx, templ + idy,
3412                                      cpi->sf.use_fast_coef_costing);
3413             skip = (p->eobs[block] == 0);
3414             can_skip &= skip;
3415             tempa[idx] = !skip;
3416             templ[idy] = !skip;
3417 #if CONFIG_EXT_TX
3418             if (tx_size == TX_8X4) {
3419               tempa[idx + 1] = tempa[idx];
3420             } else if (tx_size == TX_4X8) {
3421               templ[idy + 1] = templ[idy];
3422             }
3423 #endif  // CONFIG_EXT_TX
3424 #else
3425             (void)scan_order;
3426 
3427             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
3428                             tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
3429             ratey += x->rate;
3430             skip = x->pvq_skip[0];
3431             tempa[idx] = !skip;
3432             templ[idy] = !skip;
3433             can_skip &= skip;
3434 #endif
3435 #if CONFIG_PVQ
3436             if (!skip)
3437 #endif
3438               av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3439 #if CONFIG_LGT_FROM_PRED
3440                                           mode,
3441 #endif
3442 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3443                                           BLOCK_OFFSET(xd->mrc_mask, block),
3444 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3445                                           tx_type, tx_size, dst, dst_stride,
3446                                           p->eobs[block]);
3447             cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
3448             dist = (int64_t)tmp << 4;
3449             distortion += dist;
3450             if (RDCOST(x->rdmult, ratey, distortion) >= best_rd)
3451               goto next_highbd;
3452           }
3453         }
3454       }
3455 
3456       rate += ratey;
3457       this_rd = RDCOST(x->rdmult, rate, distortion);
3458 
3459       if (this_rd < best_rd) {
3460         *bestrate = rate;
3461         *bestratey = ratey;
3462         *bestdistortion = distortion;
3463         best_rd = this_rd;
3464         best_can_skip = can_skip;
3465         *best_mode = mode;
3466         memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
3467         memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
3468 #if CONFIG_PVQ
3469         od_encode_checkpoint(&x->daala_enc, &post_buf);
3470 #endif
3471         for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
3472           memcpy(best_dst16 + idy * 8,
3473                  CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
3474                  pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
3475         }
3476       }
3477     next_highbd : {}
3478 #if CONFIG_PVQ
3479       od_encode_rollback(&x->daala_enc, &pre_buf);
3480 #endif
3481     }
3482 
3483     if (best_rd >= rd_thresh) return best_rd;
3484 
3485 #if CONFIG_PVQ
3486     od_encode_rollback(&x->daala_enc, &post_buf);
3487 #endif
3488 
3489     if (y_skip) *y_skip &= best_can_skip;
3490 
3491     for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
3492       memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
3493              best_dst16 + idy * 8,
3494              pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
3495     }
3496 
3497     return best_rd;
3498   }
3499 #endif  // CONFIG_HIGHBITDEPTH
3500 
3501 #if CONFIG_PVQ
3502   od_encode_checkpoint(&x->daala_enc, &pre_buf);
3503 #endif  // CONFIG_PVQ
3504 
3505   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
3506     int64_t this_rd;
3507     int ratey = 0;
3508     int64_t distortion = 0;
3509     int rate = bmode_costs[mode];
3510     int can_skip = 1;
3511 
3512     if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
3513           (1 << mode))) {
3514       continue;
3515     }
3516 
3517     // Only do the oblique modes if the best so far is
3518     // one of the neighboring directional modes
3519     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3520       if (conditional_skipintra(mode, *best_mode)) continue;
3521     }
3522 
3523     memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
3524     memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
3525 
3526     for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) {
3527       for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) {
3528         const int block_raster_idx = (row + idy) * 2 + (col + idx);
3529         int block = av1_raster_order_to_block_index(tx_size, block_raster_idx);
3530         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
3531         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
3532 #if !CONFIG_PVQ
3533         int16_t *const src_diff = av1_raster_block_offset_int16(
3534             BLOCK_8X8, block_raster_idx, p->src_diff);
3535 #endif  // !CONFIG_PVQ
3536         int skip;
3537         assert(block < 4);
3538         assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
3539                        idx == 0 && idy == 0));
3540         assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
3541                        block == 0 || block == 2));
3542         xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
3543         av1_predict_intra_block(cm, xd, pd->width, pd->height,
3544                                 txsize_to_bsize[tx_size], mode, dst, dst_stride,
3545                                 dst, dst_stride,
3546 #if CONFIG_CB4X4
3547                                 2 * (col + idx), 2 * (row + idy),
3548 #else
3549                                 col + idx, row + idy,
3550 #endif  // CONFIG_CB4X4
3551                                 0);
3552 #if !CONFIG_PVQ
3553         aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride,
3554                            dst, dst_stride);
3555 #endif  // !CONFIG_PVQ
3556         TX_TYPE tx_type =
3557             av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
3558         const SCAN_ORDER *scan_order =
3559             get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
3560         const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]);
3561 #if CONFIG_CB4X4
3562         block = 4 * block;
3563 #endif  // CONFIG_CB4X4
3564 #if !CONFIG_PVQ
3565 #if DISABLE_TRELLISQ_SEARCH
3566         av1_xform_quant(cm, x, 0, block,
3567 #if CONFIG_CB4X4
3568                         2 * (row + idy), 2 * (col + idx),
3569 #else
3570                         row + idy, col + idx,
3571 #endif  // CONFIG_CB4X4
3572                         BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
3573 #else
3574         const AV1_XFORM_QUANT xform_quant =
3575             is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
3576         av1_xform_quant(cm, x, 0, block,
3577 #if CONFIG_CB4X4
3578                         2 * (row + idy), 2 * (col + idx),
3579 #else
3580                         row + idy, col + idx,
3581 #endif  // CONFIG_CB4X4
3582                         BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
3583 
3584         av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx,
3585                        templ + idy, 1);
3586 #endif  // DISABLE_TRELLISQ_SEARCH
3587         ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order,
3588                                  tempa + idx, templ + idy,
3589                                  cpi->sf.use_fast_coef_costing);
3590         skip = (p->eobs[block] == 0);
3591         can_skip &= skip;
3592         tempa[idx] = !skip;
3593         templ[idy] = !skip;
3594 #if CONFIG_EXT_TX
3595         if (tx_size == TX_8X4) {
3596           tempa[idx + 1] = tempa[idx];
3597         } else if (tx_size == TX_4X8) {
3598           templ[idy + 1] = templ[idy];
3599         }
3600 #endif  // CONFIG_EXT_TX
3601 #else
3602         (void)scan_order;
3603 
3604         av1_xform_quant(cm, x, 0, block,
3605 #if CONFIG_CB4X4
3606                         2 * (row + idy), 2 * (col + idx),
3607 #else
3608                         row + idy, col + idx,
3609 #endif  // CONFIG_CB4X4
3610                         BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
3611 
3612         ratey += x->rate;
3613         skip = x->pvq_skip[0];
3614         tempa[idx] = !skip;
3615         templ[idy] = !skip;
3616         can_skip &= skip;
3617 #endif  // !CONFIG_PVQ
3618 
3619         if (!is_lossless) {  // To use the pixel-domain distortion, the
3620                              // inverse txfm must be computed *before* the
3621                              // RD cost. Compared to computing the
3622                              // distortion in the frequency domain, the
3623                              // extra encoding effort is small.
3624 #if CONFIG_PVQ
3625           if (!skip)
3626 #endif  // CONFIG_PVQ
3627             av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3628 #if CONFIG_LGT_FROM_PRED
3629                                         mode,
3630 #endif
3631 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3632                                         BLOCK_OFFSET(xd->mrc_mask, block),
3633 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3634                                         tx_type, tx_size, dst, dst_stride,
3635                                         p->eobs[block]);
3636           unsigned int tmp;
3637           cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
3638           const int64_t dist = (int64_t)tmp << 4;
3639           distortion += dist;
3640         }
3641 
3642         if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next;
3643 
3644         if (is_lossless) {  // Calculate inverse txfm *after* RD cost.
3645 #if CONFIG_PVQ
3646           if (!skip)
3647 #endif  // CONFIG_PVQ
3648             av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3649 #if CONFIG_LGT_FROM_PRED
3650                                         mode,
3651 #endif
3652 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3653                                         BLOCK_OFFSET(xd->mrc_mask, block),
3654 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
3655                                         DCT_DCT, tx_size, dst, dst_stride,
3656                                         p->eobs[block]);
3657         }
3658       }
3659     }
3660 
3661     rate += ratey;
3662     this_rd = RDCOST(x->rdmult, rate, distortion);
3663 
3664     if (this_rd < best_rd) {
3665       *bestrate = rate;
3666       *bestratey = ratey;
3667       *bestdistortion = distortion;
3668       best_rd = this_rd;
3669       best_can_skip = can_skip;
3670       *best_mode = mode;
3671       memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
3672       memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
3673 #if CONFIG_PVQ
3674       od_encode_checkpoint(&x->daala_enc, &post_buf);
3675 #endif  // CONFIG_PVQ
3676       for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
3677         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
3678                pred_width_in_transform_blocks * 4);
3679     }
3680   next : {}
3681 #if CONFIG_PVQ
3682     od_encode_rollback(&x->daala_enc, &pre_buf);
3683 #endif  // CONFIG_PVQ
3684   }     // mode decision loop
3685 
3686   if (best_rd >= rd_thresh) return best_rd;
3687 
3688 #if CONFIG_PVQ
3689   od_encode_rollback(&x->daala_enc, &post_buf);
3690 #endif  // CONFIG_PVQ
3691 
3692   if (y_skip) *y_skip &= best_can_skip;
3693 
3694   for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
3695     memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
3696            pred_width_in_transform_blocks * 4);
3697 
3698   return best_rd;
3699 }
3700 
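// Pick an intra mode independently for each 4x4/4x8/8x4 prediction sub-block
// of an 8x8 coding block, accumulating rate and distortion across sub-blocks,
// and finally add the cost of the (fixed DCT_DCT) transform type.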
3701 static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
3702                                             MACROBLOCK *mb, int *rate,
3703                                             int *rate_y, int64_t *distortion,
3704                                             int *y_skip, int64_t best_rd) {
3705   const MACROBLOCKD *const xd = &mb->e_mbd;
3706   MODE_INFO *const mic = xd->mi[0];
3707   const MODE_INFO *above_mi = xd->above_mi;
3708   const MODE_INFO *left_mi = xd->left_mi;
3709   MB_MODE_INFO *const mbmi = &mic->mbmi;
3710   assert(!is_inter_block(mbmi));
3711   const BLOCK_SIZE bsize = mbmi->sb_type;
3712   const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
3713   const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
3714   int idx, idy;
3715   int cost = 0;
3716   int64_t total_distortion = 0;
3717   int tot_rate_y = 0;
3718   int64_t total_rd = 0;
3719   const int *bmode_costs = mb->mbmode_cost[0];
3720   const int is_lossless = xd->lossless[mbmi->segment_id];
3721 #if CONFIG_EXT_TX && CONFIG_RECT_TX
3722   const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
3723 #else
3724   const TX_SIZE tx_size = TX_4X4;
3725 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
3726 
3727 #if CONFIG_EXT_INTRA
3728 #if CONFIG_INTRA_INTERP
3729   mbmi->intra_filter = INTRA_FILTER_LINEAR;
3730 #endif  // CONFIG_INTRA_INTERP
3731 #endif  // CONFIG_EXT_INTRA
3732 #if CONFIG_FILTER_INTRA
3733   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
3734 #endif  // CONFIG_FILTER_INTRA
3735 
3736   // TODO(any): Add search of the tx_type to improve rd performance at the
3737   // expense of speed.
3738   mbmi->tx_type = DCT_DCT;
3739   mbmi->tx_size = tx_size;
3740 #if CONFIG_LGT_FROM_PRED
3741   mbmi->use_lgt = 0;
3742 #endif
3743 
3744   if (y_skip) *y_skip = 1;
3745 
3746   // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this
3747   // 8x8 coding block.
3748   for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) {
3749     for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) {
3750       PREDICTION_MODE best_mode = DC_PRED;
3751       int r = INT_MAX, ry = INT_MAX;
3752       int64_t d = INT64_MAX, this_rd = INT64_MAX;
3753       int j;
3754       const int pred_block_idx = idy * 2 + idx;
3755       if (cpi->common.frame_type == KEY_FRAME) {
3756         const PREDICTION_MODE A =
3757             av1_above_block_mode(mic, above_mi, pred_block_idx);
3758         const PREDICTION_MODE L =
3759             av1_left_block_mode(mic, left_mi, pred_block_idx);
3760 
3761 #if CONFIG_KF_CTX
3762         const int above_ctx = intra_mode_context[A];
3763         const int left_ctx = intra_mode_context[L];
3764         bmode_costs = mb->y_mode_costs[above_ctx][left_ctx];
3765 #else
3766         bmode_costs = mb->y_mode_costs[A][L];
3767 #endif
3768       }
3769       this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
3770           cpi, mb, idy, idx, &best_mode, bmode_costs,
3771           xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
3772           &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
3773 #if CONFIG_DIST_8X8
3774       if (!cpi->oxcf.using_dist_8x8)
3775 #endif
3776         if (this_rd >= best_rd - total_rd) return INT64_MAX;
3777 
3778       total_rd += this_rd;
3779       cost += r;
3780       total_distortion += d;
3781       tot_rate_y += ry;
3782 
3783       mic->bmi[pred_block_idx].as_mode = best_mode;
3784       for (j = 1; j < pred_height_in_4x4_blocks; ++j)
3785         mic->bmi[pred_block_idx + j * 2].as_mode = best_mode;
3786       for (j = 1; j < pred_width_in_4x4_blocks; ++j)
3787         mic->bmi[pred_block_idx + j].as_mode = best_mode;
3788 
3789       if (total_rd >= best_rd) return INT64_MAX;
3790     }
3791   }
3792   mbmi->mode = mic->bmi[3].as_mode;
3793 
3794 #if CONFIG_DIST_8X8
3795   if (cpi->oxcf.using_dist_8x8) {
3796     const struct macroblock_plane *p = &mb->plane[0];
3797     const struct macroblockd_plane *pd = &xd->plane[0];
3798     const int src_stride = p->src.stride;
3799     const int dst_stride = pd->dst.stride;
3800     uint8_t *src = p->src.buf;
3801     uint8_t *dst = pd->dst.buf;
3802 
3803     // Daala-defined distortion computed for the block of 8x8 pixels
3804     total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride,
3805                                     BLOCK_8X8, 8, 8, 8, 8, mb->qindex)
3806                        << 4;
3807   }
3808 #endif  // CONFIG_DIST_8X8
3809   // Add in the cost of the transform type
3810   if (!is_lossless) {
3811     int rate_tx_type = 0;
3812 #if CONFIG_EXT_TX
3813     if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) >
3814         1) {
3815       const int eset =
3816           get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
3817 #if CONFIG_LGT_FROM_PRED
3818       if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size))
3819         rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode]
3820                                           [mbmi->use_lgt];
3821       if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt)
3822 #endif  // CONFIG_LGT_FROM_PRED
3823         rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
3824                                                [mbmi->mode][mbmi->tx_type];
3825     }
3826 #else
3827     rate_tx_type =
3828         mb->intra_tx_type_costs[txsize_sqr_map[tx_size]]
3829                                [intra_mode_to_tx_type_context[mbmi->mode]]
3830                                [mbmi->tx_type];
3831 #endif  // CONFIG_EXT_TX
3832     assert(mbmi->tx_size == tx_size);
3833     cost += rate_tx_type;
3834     tot_rate_y += rate_tx_type;
3835   }
3836 
3837   *rate = cost;
3838   *rate_y = tot_rate_y;
3839   *distortion = total_distortion;
3840 
3841   return RDCOST(mb->rdmult, cost, total_distortion);
3842 }
3843 
3844 #if CONFIG_FILTER_INTRA
3845 // Return 1 if a filter intra mode is selected; return 0 otherwise.
3846 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
3847                                     int *rate, int *rate_tokenonly,
3848                                     int64_t *distortion, int *skippable,
3849                                     BLOCK_SIZE bsize, int mode_cost,
3850                                     int64_t *best_rd, int64_t *best_model_rd,
3851                                     uint16_t skip_mask) {
3852   MACROBLOCKD *const xd = &x->e_mbd;
3853   MODE_INFO *const mic = xd->mi[0];
3854   MB_MODE_INFO *mbmi = &mic->mbmi;
3855   int filter_intra_selected_flag = 0;
3856   FILTER_INTRA_MODE mode;
3857   TX_SIZE best_tx_size = TX_4X4;
3858   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
3859   TX_TYPE best_tx_type;
3860 #if CONFIG_LGT_FROM_PRED
3861   int use_lgt_when_selected;
3862 #endif
3863 
3864   av1_zero(filter_intra_mode_info);
3865   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
3866   mbmi->mode = DC_PRED;
3867   mbmi->palette_mode_info.palette_size[0] = 0;
3868 
3869   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
3870     int this_rate;
3871     int64_t this_rd, this_model_rd;
3872     RD_STATS tokenonly_rd_stats;
3873     if (skip_mask & (1 << mode)) continue;
3874     mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
3875     this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
3876     if (*best_model_rd != INT64_MAX &&
3877         this_model_rd > *best_model_rd + (*best_model_rd >> 1))
3878       continue;
3879     if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
3880     super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
3881     if (tokenonly_rd_stats.rate == INT_MAX) continue;
3882     this_rate = tokenonly_rd_stats.rate +
3883                 av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
3884                 write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
3885     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
3886 
3887     if (this_rd < *best_rd) {
3888       *best_rd = this_rd;
3889       best_tx_size = mic->mbmi.tx_size;
3890       filter_intra_mode_info = mbmi->filter_intra_mode_info;
3891       best_tx_type = mic->mbmi.tx_type;
3892 #if CONFIG_LGT_FROM_PRED
3893       use_lgt_when_selected = mic->mbmi.use_lgt;
3894 #endif
3895       *rate = this_rate;
3896       *rate_tokenonly = tokenonly_rd_stats.rate;
3897       *distortion = tokenonly_rd_stats.dist;
3898       *skippable = tokenonly_rd_stats.skip;
3899       filter_intra_selected_flag = 1;
3900     }
3901   }
3902 
3903   if (filter_intra_selected_flag) {
3904     mbmi->mode = DC_PRED;
3905     mbmi->tx_size = best_tx_size;
3906 #if CONFIG_LGT_FROM_PRED
3907     mbmi->use_lgt = use_lgt_when_selected;
3908 #endif
3909     mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
3910         filter_intra_mode_info.use_filter_intra_mode[0];
3911     mbmi->filter_intra_mode_info.filter_intra_mode[0] =
3912         filter_intra_mode_info.filter_intra_mode[0];
3913     mbmi->tx_type = best_tx_type;
3914     return 1;
3915   } else {
3916     return 0;
3917   }
3918 }
3919 #endif  // CONFIG_FILTER_INTRA
3920 
3921 #if CONFIG_EXT_INTRA
3922 // Run the RD calculation with the given luma intra prediction angle, return
3923 // the RD cost, and update the best mode info if that cost is the best so far.
3924 static int64_t calc_rd_given_intra_angle(
3925     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
3926     int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
3927     RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
3928     TX_TYPE *best_tx_type,
3929 #if CONFIG_LGT_FROM_PRED
3930     int *use_lgt_when_selected,
3931 #endif
3932 #if CONFIG_INTRA_INTERP
3933     INTRA_FILTER *best_filter,
3934 #endif  // CONFIG_INTRA_INTERP
3935     int64_t *best_rd, int64_t *best_model_rd) {
3936   int this_rate;
3937   RD_STATS tokenonly_rd_stats;
3938   int64_t this_rd, this_model_rd;
3939   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
3940   assert(!is_inter_block(mbmi));
3941 
3942   mbmi->angle_delta[0] = angle_delta;
3943   this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
3944   if (*best_model_rd != INT64_MAX &&
3945       this_model_rd > *best_model_rd + (*best_model_rd >> 1))
3946     return INT64_MAX;
3947   if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
3948   super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
3949   if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
3950 
3951   this_rate = tokenonly_rd_stats.rate + mode_cost +
3952               write_uniform_cost(2 * max_angle_delta + 1,
3953                                  mbmi->angle_delta[0] + max_angle_delta);
3954   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
3955 
3956   if (this_rd < *best_rd) {
3957     *best_rd = this_rd;
3958     *best_angle_delta = mbmi->angle_delta[0];
3959     *best_tx_size = mbmi->tx_size;
3960 #if CONFIG_INTRA_INTERP
3961     *best_filter = mbmi->intra_filter;
3962 #endif  // CONFIG_INTRA_INTERP
3963     *best_tx_type = mbmi->tx_type;
3964 #if CONFIG_LGT_FROM_PRED
3965     *use_lgt_when_selected = mbmi->use_lgt;
3966 #endif
3967     *rate = this_rate;
3968     rd_stats->rate = tokenonly_rd_stats.rate;
3969     rd_stats->dist = tokenonly_rd_stats.dist;
3970     rd_stats->skip = tokenonly_rd_stats.skip;
3971   }
3972   return this_rd;
3973 }
3974 
3975 // Given a luma directional intra prediction mode, pick the best angle delta.
3976 // Return its RD cost. (A control-flow sketch of this search follows below.)
3977 static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
3978                                        int *rate, RD_STATS *rd_stats,
3979                                        BLOCK_SIZE bsize, int mode_cost,
3980                                        int64_t best_rd,
3981                                        int64_t *best_model_rd) {
3982   MACROBLOCKD *const xd = &x->e_mbd;
3983   MODE_INFO *const mic = xd->mi[0];
3984   MB_MODE_INFO *mbmi = &mic->mbmi;
3985   assert(!is_inter_block(mbmi));
3986   int i, angle_delta, best_angle_delta = 0;
3987   int first_try = 1;
3988 #if CONFIG_INTRA_INTERP
3989   int p_angle;
3990   const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
3991   INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
3992 #endif  // CONFIG_INTRA_INTERP
3993   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
3994   TX_SIZE best_tx_size = mic->mbmi.tx_size;
3995   TX_TYPE best_tx_type = mbmi->tx_type;
3996 #if CONFIG_LGT_FROM_PRED
3997   int use_lgt_when_selected = mbmi->use_lgt;
3998 #endif
3999 
4000   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
4001 
4002   for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
4003 #if CONFIG_INTRA_INTERP
4004     for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
4005       if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
4006       mic->mbmi.intra_filter = filter;
4007 #endif  // CONFIG_INTRA_INTERP
4008       for (i = 0; i < 2; ++i) {
4009         best_rd_in = (best_rd == INT64_MAX)
4010                          ? INT64_MAX
4011                          : (best_rd + (best_rd >> (first_try ? 3 : 5)));
4012         this_rd = calc_rd_given_intra_angle(
4013             cpi, x, bsize,
4014 #if CONFIG_INTRA_INTERP
4015             mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
4016 #else
4017             mode_cost,
4018 #endif  // CONFIG_INTRA_INTERP
4019             best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
4020             rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
4021 #if CONFIG_LGT_FROM_PRED
4022             &use_lgt_when_selected,
4023 #endif
4024 #if CONFIG_INTRA_INTERP
4025             &best_filter,
4026 #endif  // CONFIG_INTRA_INTERP
4027             &best_rd, best_model_rd);
4028         rd_cost[2 * angle_delta + i] = this_rd;
4029         if (first_try && this_rd == INT64_MAX) return best_rd;
4030         first_try = 0;
4031         if (angle_delta == 0) {
4032           rd_cost[1] = this_rd;
4033           break;
4034         }
4035       }
4036 #if CONFIG_INTRA_INTERP
4037     }
4038 #endif  // CONFIG_INTRA_INTERP
4039   }
4040 
4041   assert(best_rd != INT64_MAX);
4042   for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
4043     int64_t rd_thresh;
4044 #if CONFIG_INTRA_INTERP
4045     for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
4046       if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
4047       mic->mbmi.intra_filter = filter;
4048 #endif  // CONFIG_INTRA_INTERP
4049       for (i = 0; i < 2; ++i) {
4050         int skip_search = 0;
4051         rd_thresh = best_rd + (best_rd >> 5);
4052         if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
4053             rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
4054           skip_search = 1;
4055         if (!skip_search) {
4056           calc_rd_given_intra_angle(
4057               cpi, x, bsize,
4058 #if CONFIG_INTRA_INTERP
4059               mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
4060 #else
4061               mode_cost,
4062 #endif  // CONFIG_INTRA_INTERP
4063               best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
4064               rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
4065 #if CONFIG_LGT_FROM_PRED
4066               &use_lgt_when_selected,
4067 #endif
4068 #if CONFIG_INTRA_INTERP
4069               &best_filter,
4070 #endif  // CONFIG_INTRA_INTERP
4071               &best_rd, best_model_rd);
4072         }
4073       }
4074 #if CONFIG_INTRA_INTERP
4075     }
4076 #endif  // CONFIG_INTRA_INTERP
4077   }
4078 
4079 #if CONFIG_INTRA_INTERP
4080   if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) {
4081     p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP;
4082     if (av1_is_intra_filter_switchable(p_angle)) {
4083       for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
4084         mic->mbmi.intra_filter = filter;
4085         this_rd = calc_rd_given_intra_angle(
4086             cpi, x, bsize,
4087             mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd,
4088             best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
4089             &best_angle_delta, &best_tx_size, &best_tx_type,
4090 #if CONFIG_LGT_FROM_PRED
4091             &use_lgt_when_selected,
4092 #endif
4093             &best_filter, &best_rd, best_model_rd);
4094       }
4095     }
4096   }
4097 #endif  // CONFIG_INTRA_INTERP
4098 
4099   mbmi->tx_size = best_tx_size;
4100   mbmi->angle_delta[0] = best_angle_delta;
4101 #if CONFIG_INTRA_INTERP
4102   mic->mbmi.intra_filter = best_filter;
4103 #endif  // CONFIG_INTRA_INTERP
4104   mbmi->tx_type = best_tx_type;
4105 #if CONFIG_LGT_FROM_PRED
4106   mbmi->use_lgt = use_lgt_when_selected;
4107 #endif
4108   return best_rd;
4109 }
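#if 0
// Illustrative sketch (not part of the encoder): the two-stage angle-delta
// search above, reduced to its control flow. eval_cost() is a hypothetical
// stand-in for calc_rd_given_intra_angle(); INT64_MAX marks a pruned or
// failed evaluation. Stage 1 visits even deltas (both signs); stage 2
// visits an odd delta only when an even neighbour was competitive.
static int64_t sketch_two_stage_delta_search(
    int64_t (*eval_cost)(int delta, int64_t budget), int64_t best_rd) {
  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
  int first_try = 1;
  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
  for (int delta = 0; delta <= MAX_ANGLE_DELTA; delta += 2) {
    for (int sign = 0; sign < 2; ++sign) {
      // A wider budget on the very first evaluation, tighter afterwards.
      const int64_t budget =
          (best_rd == INT64_MAX)
              ? INT64_MAX
              : best_rd + (best_rd >> (first_try ? 3 : 5));
      const int64_t this_rd = eval_cost((1 - 2 * sign) * delta, budget);
      rd_cost[2 * delta + sign] = this_rd;
      first_try = 0;
      if (this_rd < best_rd) best_rd = this_rd;
      if (delta == 0) {  // +0 and -0 are the same angle
        rd_cost[1] = this_rd;
        break;
      }
    }
  }
  for (int delta = 1; delta <= MAX_ANGLE_DELTA; delta += 2) {
    for (int sign = 0; sign < 2; ++sign) {
      const int64_t rd_thresh = best_rd + (best_rd >> 5);
      // Skip an odd delta when both even neighbours were clearly worse.
      if (rd_cost[2 * (delta + 1) + sign] > rd_thresh &&
          rd_cost[2 * (delta - 1) + sign] > rd_thresh)
        continue;
      const int64_t this_rd = eval_cost((1 - 2 * sign) * delta, best_rd);
      if (this_rd < best_rd) best_rd = this_rd;
    }
  }
  return best_rd;
}
#endif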
4110 
4111 // Indices are sign, integer, and fractional part of the gradient value
4112 static const uint8_t gradient_to_angle_bin[2][7][16] = {
4113   {
4114       { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
4115       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
4116       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
4117       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
4118       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
4119       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
4120       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
4121   },
4122   {
4123       { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
4124       { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
4125       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
4126       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
4127       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
4128       { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
4129       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
4130   },
4131 };
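#if 0
// Illustrative sketch (not part of the encoder): how a pixel gradient
// (dx, dy) indexes gradient_to_angle_bin[][][] above. The first index is
// the sign of the slope, the second the integer part of |dx|/|dy| (clamped
// to 6), the third its fractional part in 1/16th units (clamped to 15).
// This mirrors the lookup inside angle_estimation() below.
static int sketch_gradient_bin(int dx, int dy) {
  if (dy == 0) return 2;  // pure horizontal gradient -> vertical-mode bin
  const int sn = (dx > 0) ^ (dy > 0);
  dx = abs(dx);
  dy = abs(dy);
  const int quot = dx / dy;              // integer part of the slope
  const int remd = (dx % dy) * 16 / dy;  // fractional part, 0..15
  return gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
}
#endif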
4132 
4133 /* clang-format off */
4134 static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
4135   0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
4136   0,
4137 };
4138 /* clang-format on */
4139 
4140 static void angle_estimation(const uint8_t *src, int src_stride, int rows,
4141                              int cols, BLOCK_SIZE bsize,
4142                              uint8_t *directional_mode_skip_mask) {
4143   memset(directional_mode_skip_mask, 0,
4144          INTRA_MODES * sizeof(*directional_mode_skip_mask));
4145   // Check if angle_delta is used
4146   if (!av1_use_angle_delta(bsize)) return;
4147   uint64_t hist[DIRECTIONAL_MODES];
4148   memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
4149   src += src_stride;
4150   int r, c, dx, dy;
4151   for (r = 1; r < rows; ++r) {
4152     for (c = 1; c < cols; ++c) {
4153       dx = src[c] - src[c - 1];
4154       dy = src[c] - src[c - src_stride];
4155       int index;
4156       const int temp = dx * dx + dy * dy;
4157       if (dy == 0) {
4158         index = 2;
4159       } else {
4160         const int sn = (dx > 0) ^ (dy > 0);
4161         dx = abs(dx);
4162         dy = abs(dy);
4163         const int remd = (dx % dy) * 16 / dy;
4164         const int quot = dx / dy;
4165         index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
4166       }
4167       hist[index] += temp;
4168     }
4169     src += src_stride;
4170   }
4171 
4172   int i;
4173   uint64_t hist_sum = 0;
4174   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
4175   for (i = 0; i < INTRA_MODES; ++i) {
4176     if (av1_is_directional_mode(i, bsize)) {
4177       const uint8_t angle_bin = mode_to_angle_bin[i];
4178       uint64_t score = 2 * hist[angle_bin];
4179       int weight = 2;
4180       if (angle_bin > 0) {
4181         score += hist[angle_bin - 1];
4182         ++weight;
4183       }
4184       if (angle_bin < DIRECTIONAL_MODES - 1) {
4185         score += hist[angle_bin + 1];
4186         ++weight;
4187       }
4188       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
4189         directional_mode_skip_mask[i] = 1;
4190     }
4191   }
4192 }
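#if 0
// Illustrative sketch (not part of the encoder): the pruning rule applied
// at the end of angle_estimation() above, in isolation. A directional mode
// survives only when the average gradient energy of its angle bin and the
// neighbouring bins reaches hist_sum / ANGLE_SKIP_THRESH; cross-multiplying
// avoids an integer division.
static int sketch_prune_directional_mode(const uint64_t *hist, int num_bins,
                                         int angle_bin, uint64_t hist_sum,
                                         uint64_t skip_thresh) {
  uint64_t score = 2 * hist[angle_bin];
  uint64_t weight = 2;
  if (angle_bin > 0) {
    score += hist[angle_bin - 1];
    ++weight;
  }
  if (angle_bin < num_bins - 1) {
    score += hist[angle_bin + 1];
    ++weight;
  }
  // Prune when score / weight < hist_sum / skip_thresh.
  return score * skip_thresh < hist_sum * weight;
}
#endif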
4193 
4194 #if CONFIG_HIGHBITDEPTH
4195 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
4196                                     int rows, int cols, BLOCK_SIZE bsize,
4197                                     uint8_t *directional_mode_skip_mask) {
4198   memset(directional_mode_skip_mask, 0,
4199          INTRA_MODES * sizeof(*directional_mode_skip_mask));
4200   // Check if angle_delta is used
4201   if (!av1_use_angle_delta(bsize)) return;
4202   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
4203   uint64_t hist[DIRECTIONAL_MODES];
4204   memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
4205   src += src_stride;
4206   int r, c, dx, dy;
4207   for (r = 1; r < rows; ++r) {
4208     for (c = 1; c < cols; ++c) {
4209       dx = src[c] - src[c - 1];
4210       dy = src[c] - src[c - src_stride];
4211       int index;
4212       const int temp = dx * dx + dy * dy;
4213       if (dy == 0) {
4214         index = 2;
4215       } else {
4216         const int sn = (dx > 0) ^ (dy > 0);
4217         dx = abs(dx);
4218         dy = abs(dy);
4219         const int remd = (dx % dy) * 16 / dy;
4220         const int quot = dx / dy;
4221         index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
4222       }
4223       hist[index] += temp;
4224     }
4225     src += src_stride;
4226   }
4227 
4228   int i;
4229   uint64_t hist_sum = 0;
4230   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
4231   for (i = 0; i < INTRA_MODES; ++i) {
4232     if (av1_is_directional_mode(i, bsize)) {
4233       const uint8_t angle_bin = mode_to_angle_bin[i];
4234       uint64_t score = 2 * hist[angle_bin];
4235       int weight = 2;
4236       if (angle_bin > 0) {
4237         score += hist[angle_bin - 1];
4238         ++weight;
4239       }
4240       if (angle_bin < DIRECTIONAL_MODES - 1) {
4241         score += hist[angle_bin + 1];
4242         ++weight;
4243       }
4244       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
4245         directional_mode_skip_mask[i] = 1;
4246     }
4247   }
4248 }
4249 #endif  // CONFIG_HIGHBITDEPTH
4250 #endif  // CONFIG_EXT_INTRA
4251 
4252 // This function is used only for intra_only frames
4253 static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
4254                                       int *rate, int *rate_tokenonly,
4255                                       int64_t *distortion, int *skippable,
4256                                       BLOCK_SIZE bsize, int64_t best_rd) {
4257   MACROBLOCKD *const xd = &x->e_mbd;
4258   MODE_INFO *const mic = xd->mi[0];
4259   MB_MODE_INFO *const mbmi = &mic->mbmi;
4260   assert(!is_inter_block(mbmi));
4261   MB_MODE_INFO best_mbmi = *mbmi;
4262   int64_t best_model_rd = INT64_MAX;
4263 #if CONFIG_EXT_INTRA
4264   const int rows = block_size_high[bsize];
4265   const int cols = block_size_wide[bsize];
4266 #if CONFIG_INTRA_INTERP
4267   const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
4268 #endif  // CONFIG_INTRA_INTERP
4269   int is_directional_mode;
4270   uint8_t directional_mode_skip_mask[INTRA_MODES];
4271   const int src_stride = x->plane[0].src.stride;
4272   const uint8_t *src = x->plane[0].src.buf;
4273 #endif  // CONFIG_EXT_INTRA
4274 #if CONFIG_FILTER_INTRA
4275   int beat_best_rd = 0;
4276   uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
4277 #endif  // CONFIG_FILTER_INTRA
4278   const int *bmode_costs;
4279   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
4280   int palette_y_mode_ctx = 0;
4281   const int try_palette =
4282       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
4283   uint8_t *best_palette_color_map =
4284       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
4285   const MODE_INFO *above_mi = xd->above_mi;
4286   const MODE_INFO *left_mi = xd->left_mi;
4287   const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
4288   const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
4289   const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
4290 #if CONFIG_PVQ
4291   od_rollback_buffer pre_buf, post_buf;
4292 
4293   od_encode_checkpoint(&x->daala_enc, &pre_buf);
4294   od_encode_checkpoint(&x->daala_enc, &post_buf);
4295 #endif  // CONFIG_PVQ
4296 
4297 #if CONFIG_KF_CTX
4298   const int above_ctx = intra_mode_context[A];
4299   const int left_ctx = intra_mode_context[L];
4300   bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
4301 #else
4302   bmode_costs = x->y_mode_costs[A][L];
4303 #endif
4304 
4305 #if CONFIG_EXT_INTRA
4306   mbmi->angle_delta[0] = 0;
4307 #if CONFIG_HIGHBITDEPTH
4308   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
4309     highbd_angle_estimation(src, src_stride, rows, cols, bsize,
4310                             directional_mode_skip_mask);
4311   else
4312 #endif  // CONFIG_HIGHBITDEPTH
4313     angle_estimation(src, src_stride, rows, cols, bsize,
4314                      directional_mode_skip_mask);
4315 #endif  // CONFIG_EXT_INTRA
4316 #if CONFIG_FILTER_INTRA
4317   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
4318 #endif  // CONFIG_FILTER_INTRA
4319   pmi->palette_size[0] = 0;
4320   if (try_palette) {
4321     if (above_mi) {
4322       palette_y_mode_ctx +=
4323           (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
4324     }
4325     if (left_mi) {
4326       palette_y_mode_ctx +=
4327           (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
4328     }
4329   }
4330 
4331   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
4332     x->use_default_intra_tx_type = 1;
4333   else
4334     x->use_default_intra_tx_type = 0;
4335 
4336   /* Y Search for intra prediction mode */
4337   for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
4338     RD_STATS this_rd_stats;
4339     int this_rate, this_rate_tokenonly, s;
4340     int64_t this_distortion, this_rd, this_model_rd;
4341     if (mode_idx == FINAL_MODE_SEARCH) {
4342       if (x->use_default_intra_tx_type == 0) break;
4343       mbmi->mode = best_mbmi.mode;
4344       x->use_default_intra_tx_type = 0;
4345     } else {
4346       assert(mode_idx < INTRA_MODES);
4347       mbmi->mode = intra_rd_search_mode_order[mode_idx];
4348     }
4349 #if CONFIG_PVQ
4350     od_encode_rollback(&x->daala_enc, &pre_buf);
4351 #endif  // CONFIG_PVQ
4352 #if CONFIG_EXT_INTRA
4353     mbmi->angle_delta[0] = 0;
4354 #endif  // CONFIG_EXT_INTRA
4355     this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
4356     if (best_model_rd != INT64_MAX &&
4357         this_model_rd > best_model_rd + (best_model_rd >> 1))
4358       continue;
4359     if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
4360 #if CONFIG_EXT_INTRA
4361     is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
4362     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
4363     if (is_directional_mode && av1_use_angle_delta(bsize)) {
4364       this_rd_stats.rate = INT_MAX;
4365       rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
4366                               bmode_costs[mbmi->mode], best_rd, &best_model_rd);
4367     } else {
4368       super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
4369     }
4370 #else
4371     super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
4372 #endif  // CONFIG_EXT_INTRA
4373     this_rate_tokenonly = this_rd_stats.rate;
4374     this_distortion = this_rd_stats.dist;
4375     s = this_rd_stats.skip;
4376 
4377     if (this_rate_tokenonly == INT_MAX) continue;
4378 
4379     this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
4380 
4381     if (!xd->lossless[mbmi->segment_id] &&
4382         block_signals_txsize(mbmi->sb_type)) {
4383       // super_block_yrd above includes the cost of the tx_size in the
4384       // tokenonly rate, but for intra blocks, tx_size is always coded
4385       // (prediction granularity), so we account for it in the full rate,
4386       // not the tokenonly rate.
4387       this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
4388     }
4389     if (try_palette && mbmi->mode == DC_PRED) {
4390       this_rate +=
4391           av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
4392                                                       [palette_y_mode_ctx],
4393                        0);
4394     }
4395 #if CONFIG_FILTER_INTRA
4396     if (mbmi->mode == DC_PRED)
4397       this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
4398 #endif  // CONFIG_FILTER_INTRA
4399 #if CONFIG_EXT_INTRA
4400     if (is_directional_mode) {
4401 #if CONFIG_INTRA_INTERP
4402       const int p_angle =
4403           mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
4404       if (av1_is_intra_filter_switchable(p_angle))
4405         this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
4406 #endif  // CONFIG_INTRA_INTERP
4407       if (av1_use_angle_delta(bsize)) {
4408         this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
4409                                         MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
4410       }
4411     }
4412 #endif  // CONFIG_EXT_INTRA
4413 #if CONFIG_INTRABC
4414     if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools)
4415       this_rate += x->intrabc_cost[0];
4416 #endif  // CONFIG_INTRABC
4417     this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
4418 #if CONFIG_FILTER_INTRA
4419     if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
4420       filter_intra_mode_skip_mask ^= (1 << mbmi->mode);
4421     }
4422 #endif  // CONFIG_FILTER_INTRA
4423 
4424     if (this_rd < best_rd) {
4425       best_mbmi = *mbmi;
4426       best_rd = this_rd;
4427 #if CONFIG_FILTER_INTRA
4428       beat_best_rd = 1;
4429 #endif  // CONFIG_FILTER_INTRA
4430       *rate = this_rate;
4431       *rate_tokenonly = this_rate_tokenonly;
4432       *distortion = this_distortion;
4433       *skippable = s;
4434 #if CONFIG_PVQ
4435       od_encode_checkpoint(&x->daala_enc, &post_buf);
4436 #endif  // CONFIG_PVQ
4437     }
4438   }
4439 
4440 #if CONFIG_PVQ
4441   od_encode_rollback(&x->daala_enc, &post_buf);
4442 #endif  // CONFIG_PVQ
4443 
4444   if (try_palette) {
4445     rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
4446                               bmode_costs[DC_PRED], &best_mbmi,
4447                               best_palette_color_map, &best_rd, &best_model_rd,
4448                               rate, rate_tokenonly, distortion, skippable);
4449   }
4450 
4451 #if CONFIG_FILTER_INTRA
4452   if (beat_best_rd) {
4453     if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
4454                                  skippable, bsize, bmode_costs[DC_PRED],
4455                                  &best_rd, &best_model_rd,
4456                                  filter_intra_mode_skip_mask)) {
4457       best_mbmi = *mbmi;
4458     }
4459   }
4460 #endif  // CONFIG_FILTER_INTRA
4461 
4462   *mbmi = best_mbmi;
4463   return best_rd;
4464 }
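#if 0
// Illustrative sketch (not part of the encoder): the model-RD gate repeated
// throughout the intra searches above. A candidate is dropped when its
// cheap model estimate exceeds the best model estimate seen so far by more
// than 50% (best + (best >> 1)); otherwise the running best is updated and
// the full transform search is allowed to run.
static int sketch_model_rd_gate(int64_t this_model_rd,
                                int64_t *best_model_rd) {
  if (*best_model_rd != INT64_MAX &&
      this_model_rd > *best_model_rd + (*best_model_rd >> 1))
    return 0;  // prune: skip the expensive full RD evaluation
  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
  return 1;  // keep going
}
#endif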
4465 
4466 // Return value 0: early termination triggered, no valid rd cost available;
4467 //              1: rd cost values are valid.
4468 static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
4469                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
4470                             int64_t ref_best_rd) {
4471   MACROBLOCKD *const xd = &x->e_mbd;
4472   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4473   const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
4474   int plane;
4475   int is_cost_valid = 1;
4476   av1_init_rd_stats(rd_stats);
4477 
4478   if (ref_best_rd < 0) is_cost_valid = 0;
4479 
4480 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
4481   if (x->skip_chroma_rd) return is_cost_valid;
4482 
4483   bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x,
4484                              xd->plane[1].subsampling_y);
4485 #endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
4486 
4487 #if !CONFIG_PVQ
4488   if (is_inter_block(mbmi) && is_cost_valid) {
4489     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
4490       av1_subtract_plane(x, bsize, plane);
4491   }
4492 #endif  // !CONFIG_PVQ
4493 
4494   if (is_cost_valid) {
4495     for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
4496       RD_STATS pn_rd_stats;
4497       txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
4498                        uv_tx_size, cpi->sf.use_fast_coef_costing);
4499       if (pn_rd_stats.rate == INT_MAX) {
4500         is_cost_valid = 0;
4501         break;
4502       }
4503       av1_merge_rd_stats(rd_stats, &pn_rd_stats);
4504       if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
4505           RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) {
4506         is_cost_valid = 0;
4507         break;
4508       }
4509     }
4510   }
4511 
4512   if (!is_cost_valid) {
4513     // reset cost value
4514     av1_invalid_rd_stats(rd_stats);
4515   }
4516 
4517   return is_cost_valid;
4518 }
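#if 0
// Illustrative sketch (not part of the encoder): the early-termination test
// inside super_block_uvrd() above. A partial chroma cost is only fatal when
// both bounds exceed the budget: the RD cost actually accumulated so far,
// and the best case still reachable by skipping the residual entirely
// (rate 0, distortion == sse).
static int sketch_uv_early_exit(int rdmult, const RD_STATS *rd_stats,
                                int64_t ref_best_rd) {
  return RDCOST(rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
         RDCOST(rdmult, 0, rd_stats->sse) > ref_best_rd;
}
#endif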
4519 
4520 #if CONFIG_VAR_TX
4521 void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
4522                        int blk_row, int blk_col, int plane, int block,
4523                        int plane_bsize, const ENTROPY_CONTEXT *a,
4524                        const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) {
4525   const AV1_COMMON *const cm = &cpi->common;
4526   MACROBLOCKD *xd = &x->e_mbd;
4527   const struct macroblock_plane *const p = &x->plane[plane];
4528   struct macroblockd_plane *const pd = &xd->plane[plane];
4529 
4530 #if CONFIG_TXK_SEL
4531   av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
4532                       tx_size, a, l, 0, rd_stats);
4533   return;
4534 #endif
4535 
4536   int64_t tmp;
4537   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
4538 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
4539   uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
4540 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
4541   PLANE_TYPE plane_type = get_plane_type(plane);
4542   TX_TYPE tx_type =
4543       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
4544   const SCAN_ORDER *const scan_order =
4545       get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
4546   BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
4547   int bh = block_size_high[txm_bsize];
4548   int bw = block_size_wide[txm_bsize];
4549   int src_stride = p->src.stride;
4550   uint8_t *src =
4551       &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
4552   uint8_t *dst =
4553       &pd->dst
4554            .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
4555 #if CONFIG_HIGHBITDEPTH
4556   DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
4557   uint8_t *rec_buffer;
4558 #else
4559   DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
4560 #endif  // CONFIG_HIGHBITDEPTH
4561   const int diff_stride = block_size_wide[plane_bsize];
4562   const int16_t *diff =
4563       &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
4564   int txb_coeff_cost;
4565 
4566   assert(tx_size < TX_SIZES_ALL);
4567 
4568   int coeff_ctx = get_entropy_context(tx_size, a, l);
4569 
4570   tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
4571                         plane_bsize, txm_bsize);
4572 
4573 #if CONFIG_HIGHBITDEPTH
4574   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
4575     tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
4576 #endif  // CONFIG_HIGHBITDEPTH
4577   rd_stats->sse += tmp << 4;
4578 
4579   if (rd_stats->invalid_rate) {
4580     rd_stats->dist += tmp << 4;
4581     rd_stats->rate += rd_stats->zero_rate;
4582     rd_stats->skip = 1;
4583     return;
4584   }
4585 
4586 // TODO(any): Use av1_dist_block to compute distortion
4587 #if CONFIG_HIGHBITDEPTH
4588   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
4589     rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
4590     aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
4591                              0, NULL, 0, bw, bh, xd->bd);
4592   } else {
4593     rec_buffer = (uint8_t *)rec_buffer16;
4594     aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
4595                       NULL, 0, bw, bh);
4596   }
4597 #else
4598   aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
4599                     0, bw, bh);
4600 #endif  // CONFIG_HIGHBITDEPTH
4601 
4602 #if DISABLE_TRELLISQ_SEARCH
4603   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
4604                   coeff_ctx, AV1_XFORM_QUANT_B);
4605 
4606 #else
4607   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
4608                   coeff_ctx, AV1_XFORM_QUANT_FP);
4609 
4610   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
4611   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
4612   const int buffer_length = tx_size_2d[tx_size];
4613   int64_t tmp_dist, tmp_sse;
4614 #if CONFIG_DIST_8X8
4615   int disable_early_skip =
4616       x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
4617       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
4618       x->tune_metric != AOM_TUNE_PSNR;
4619 #endif  // CONFIG_DIST_8X8
4620 
4621 #if CONFIG_HIGHBITDEPTH
4622   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
4623     tmp_dist =
4624         av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd);
4625   else
4626 #endif
4627     tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse);
4628 
4629   tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
4630 
4631 #if CONFIG_MRC_TX
4632   if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) {
4633     av1_invalid_rd_stats(rd_stats);
4634     return;
4635   }
4636 #endif  // CONFIG_MRC_TX
4637   if (
4638 #if CONFIG_DIST_8X8
4639       disable_early_skip ||
4640 #endif
4641       RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
4642     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
4643                    a, l, 1);
4644   } else {
4645     rd_stats->rate += rd_stats->zero_rate;
4646     rd_stats->dist += tmp << 4;
4647     rd_stats->skip = 1;
4648     rd_stats->invalid_rate = 1;
4649     return;
4650   }
4651 #endif  // DISABLE_TRELLISQ_SEARCH
4652 
4653   const int eob = p->eobs[block];
4654 
4655   av1_inverse_transform_block(xd, dqcoeff,
4656 #if CONFIG_LGT_FROM_PRED
4657                               xd->mi[0]->mbmi.mode,
4658 #endif
4659 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
4660                               mrc_mask,
4661 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
4662                               tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob);
4663   if (eob > 0) {
4664 #if CONFIG_DIST_8X8
4665     if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) {
4666       // Save the sub-8x8 luma decoded pixels, since the 8x8 luma decoded
4667       // pixels are not available for daala-dist after the recursive split
4668       // of BLOCK_8X8 is done.
4669       const int pred_stride = block_size_wide[plane_bsize];
4670       const int pred_idx = (blk_row * pred_stride + blk_col)
4671                            << tx_size_wide_log2[0];
4672       int16_t *decoded = &pd->pred[pred_idx];
4673       int i, j;
4674 
4675 #if CONFIG_HIGHBITDEPTH
4676       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
4677         for (j = 0; j < bh; j++)
4678           for (i = 0; i < bw; i++)
4679             decoded[j * pred_stride + i] =
4680                 CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i];
4681       } else {
4682 #endif
4683         for (j = 0; j < bh; j++)
4684           for (i = 0; i < bw; i++)
4685             decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i];
4686 #if CONFIG_HIGHBITDEPTH
4687       }
4688 #endif  // CONFIG_HIGHBITDEPTH
4689     }
4690 #endif  // CONFIG_DIST_8X8
4691     tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE,
4692                      blk_row, blk_col, plane_bsize, txm_bsize);
4693   }
4694   rd_stats->dist += tmp * 16;
4695   txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block,
4696                                    tx_size, scan_order, a, l, 0);
4697   rd_stats->rate += txb_coeff_cost;
4698   rd_stats->skip &= (eob == 0);
4699 
4700 #if CONFIG_RD_DEBUG
4701   av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
4702                             txb_coeff_cost);
4703 #endif  // CONFIG_RD_DEBUG
4704 }
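#if 0
// Illustrative sketch (not part of the encoder): the distortion scaling
// used above, assuming the convention in this file that RD distortions are
// carried at 16x the pixel-domain SSE so that pixel-domain and
// transform-domain (av1_block_error) estimates share one fixed-point
// scale. High bit-depth SSE is first rounded back to an 8-bit-equivalent
// range.
static int64_t sketch_scaled_pixel_dist(int64_t pixel_sse, int bd) {
  if (bd > 8) pixel_sse = ROUND_POWER_OF_TWO(pixel_sse, (bd - 8) * 2);
  return pixel_sse * 16;  // same scale as rd_stats->dist and rd_stats->sse
}
#endif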
4705 
4706 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
4707                             int blk_col, int plane, int block, TX_SIZE tx_size,
4708                             int depth, BLOCK_SIZE plane_bsize,
4709                             ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
4710                             TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
4711                             RD_STATS *rd_stats, int64_t ref_best_rd,
4712                             int *is_cost_valid) {
4713   MACROBLOCKD *const xd = &x->e_mbd;
4714   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4715   struct macroblock_plane *const p = &x->plane[plane];
4716   struct macroblockd_plane *const pd = &xd->plane[plane];
4717   const int tx_row = blk_row >> (1 - pd->subsampling_y);
4718   const int tx_col = blk_col >> (1 - pd->subsampling_x);
4719   TX_SIZE(*const inter_tx_size)
4720   [MAX_MIB_SIZE] =
4721       (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
4722   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
4723   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
4724   const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
4725   int64_t this_rd = INT64_MAX;
4726   ENTROPY_CONTEXT *pta = ta + blk_col;
4727   ENTROPY_CONTEXT *ptl = tl + blk_row;
4728   int i;
4729   int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
4730                                    mbmi->sb_type, tx_size);
4731   int64_t sum_rd = INT64_MAX;
4732   int tmp_eob = 0;
4733   int zero_blk_rate;
4734   RD_STATS sum_rd_stats;
4735 #if CONFIG_TXK_SEL
4736   TX_TYPE best_tx_type = TX_TYPES;
4737   int txk_idx = (blk_row << 4) + blk_col;
4738 #endif
4739 #if CONFIG_RECT_TX_EXT
4740   TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type];
4741   int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) &&
4742                    tx_size == max_txsize_rect_lookup[mbmi->sb_type] &&
4743                    quarter_txsize != tx_size;
4744   int is_qttx_picked = 0;
4745   int eobs_qttx[2] = { 0, 0 };
4746   int skip_qttx[2] = { 0, 0 };
4747   int block_offset_qttx = check_qttx
4748                               ? tx_size_wide_unit[quarter_txsize] *
4749                                     tx_size_high_unit[quarter_txsize]
4750                               : 0;
4751   int blk_row_offset, blk_col_offset;
4752   int is_wide_qttx =
4753       tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize];
4754   blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0;
4755   blk_col_offset = is_wide_qttx ? 0 : tx_size_wide_unit[quarter_txsize];
4756 #endif
4757 
4758   av1_init_rd_stats(&sum_rd_stats);
4759 
4760   assert(tx_size < TX_SIZES_ALL);
4761 
4762   if (ref_best_rd < 0) {
4763     *is_cost_valid = 0;
4764     return;
4765   }
4766 
4767   av1_init_rd_stats(rd_stats);
4768 
4769   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
4770 
4771 #if CONFIG_LV_MAP
4772   TX_SIZE txs_ctx = get_txsize_context(tx_size);
4773   TXB_CTX txb_ctx;
4774   get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx);
4775 
4776 #if LV_MAP_PROB
4777   zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)]
4778                       .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
4779 #else
4780   zero_blk_rate =
4781       av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1);
4782 #endif  // LV_MAP_PROB
4783 #else
4784   TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
4785   int coeff_ctx = get_entropy_context(tx_size, pta, ptl);
4786   zero_blk_rate =
4787       x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
4788 #endif
4789 
4790   rd_stats->ref_rdcost = ref_best_rd;
4791   rd_stats->zero_rate = zero_blk_rate;
4792   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
4793     inter_tx_size[0][0] = tx_size;
4794     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
4795                       plane_bsize, pta, ptl, rd_stats);
4796     if (rd_stats->rate == INT_MAX) return;
4797 
4798     if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
4799              RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
4800          rd_stats->skip == 1) &&
4801         !xd->lossless[mbmi->segment_id]) {
4802 #if CONFIG_RD_DEBUG
4803       av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
4804                                 zero_blk_rate - rd_stats->rate);
4805 #endif  // CONFIG_RD_DEBUG
4806       rd_stats->rate = zero_blk_rate;
4807       rd_stats->dist = rd_stats->sse;
4808       rd_stats->skip = 1;
4809       x->blk_skip[plane][blk_row * bw + blk_col] = 1;
4810       p->eobs[block] = 0;
4811 #if CONFIG_TXK_SEL
4812       mbmi->txk_type[txk_idx] = DCT_DCT;
4813 #endif
4814     } else {
4815       x->blk_skip[plane][blk_row * bw + blk_col] = 0;
4816       rd_stats->skip = 0;
4817     }
4818 
4819     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
4820       rd_stats->rate +=
4821           av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
4822 #if CONFIG_RECT_TX_EXT
4823     if (check_qttx) {
4824       assert(blk_row == 0 && blk_col == 0);
4825       rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0);
4826     }
4827 #endif
4828     this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
4829 #if CONFIG_LV_MAP
4830     tmp_eob = p->txb_entropy_ctx[block];
4831 #else
4832     tmp_eob = p->eobs[block];
4833 #endif
4834 
4835 #if CONFIG_TXK_SEL
4836     best_tx_type = mbmi->txk_type[txk_idx];
4837 #endif
4838 
4839 #if CONFIG_RECT_TX_EXT
4840     if (check_qttx) {
4841       assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0);
4842 
4843       RD_STATS rd_stats_tmp, rd_stats_qttx;
4844       int64_t rd_qttx;
4845 
4846       av1_init_rd_stats(&rd_stats_qttx);
4847       av1_init_rd_stats(&rd_stats_tmp);
4848 
4849       av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize,
4850                         pta, ptl, &rd_stats_qttx);
4851       if (rd_stats_qttx.rate == INT_MAX) return;
4852 
4853       tx_size_ctx = txsize_sqr_map[quarter_txsize];
4854       coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl);
4855       zero_blk_rate =
4856           x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
4857       if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >=
4858                RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) ||
4859            rd_stats_qttx.skip == 1) &&
4860           !xd->lossless[mbmi->segment_id]) {
4861 #if CONFIG_RD_DEBUG
4862         av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0,
4863                                   zero_blk_rate - rd_stats_qttx.rate);
4864 #endif  // CONFIG_RD_DEBUG
4865         rd_stats_qttx.rate = zero_blk_rate;
4866         rd_stats_qttx.dist = rd_stats_qttx.sse;
4867         rd_stats_qttx.skip = 1;
4868         x->blk_skip[plane][blk_row * bw + blk_col] = 1;
4869         skip_qttx[0] = 1;
4870         p->eobs[block] = 0;
4871       } else {
4872         x->blk_skip[plane][blk_row * bw + blk_col] = 0;
4873         skip_qttx[0] = 0;
4874         rd_stats_qttx.skip = 0;
4875       }
4876 
4877       // Second tx block
4878       av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset,
4879                         plane, block_offset_qttx, plane_bsize, pta, ptl,
4880                         &rd_stats_tmp);
4881 
4882       if (rd_stats_tmp.rate == INT_MAX) return;
4883 
4884 #if !CONFIG_PVQ
4885       av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl);
4886 #endif  // !CONFIG_PVQ
4887       coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset,
4888                                       ptl + blk_row_offset);
4889       zero_blk_rate =
4890           x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
4891       if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >=
4892                RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) ||
4893            rd_stats_tmp.skip == 1) &&
4894           !xd->lossless[mbmi->segment_id]) {
4895 #if CONFIG_RD_DEBUG
4896         av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0,
4897                                   zero_blk_rate - rd_stats_tmp.rate);
4898 #endif  // CONFIG_RD_DEBUG
4899         rd_stats_tmp.rate = zero_blk_rate;
4900         rd_stats_tmp.dist = rd_stats_tmp.sse;
4901         rd_stats_tmp.skip = 1;
4902         x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1;
4903         skip_qttx[1] = 1;
4904         p->eobs[block_offset_qttx] = 0;
4905       } else {
4906         x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0;
4907         skip_qttx[1] = 0;
4908         rd_stats_tmp.skip = 0;
4909       }
4910 
4911       av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp);
4912 
4913       if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
4914         rd_stats_qttx.rate +=
4915             av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
4916       }
4917       rd_stats_qttx.rate +=
4918           av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1);
4919       rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist);
4920 #if CONFIG_LV_MAP
4921       eobs_qttx[0] = p->txb_entropy_ctx[0];
4922       eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx];
4923 #else
4924       eobs_qttx[0] = p->eobs[0];
4925       eobs_qttx[1] = p->eobs[block_offset_qttx];
4926 #endif
4927       if (rd_qttx < this_rd) {
4928         is_qttx_picked = 1;
4929         this_rd = rd_qttx;
4930         rd_stats->rate = rd_stats_qttx.rate;
4931         rd_stats->dist = rd_stats_qttx.dist;
4932         rd_stats->sse = rd_stats_qttx.sse;
4933         rd_stats->skip = rd_stats_qttx.skip;
4934         rd_stats->rdcost = rd_stats_qttx.rdcost;
4935       }
4936       av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl);
4937     }
4938 #endif
4939   }
4940 
4941   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH
4942 #if CONFIG_MRC_TX
4943       // If the tx type we are trying is MRC_DCT, we cannot partition the
4944       // transform into anything smaller than TX_32X32
4945       && mbmi->tx_type != MRC_DCT
4946 #endif  // CONFIG_MRC_TX
4947       ) {
4948     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
4949     const int bsl = tx_size_wide_unit[sub_txs];
4950     int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
4951     RD_STATS this_rd_stats;
4952     int this_cost_valid = 1;
4953     int64_t tmp_rd = 0;
4954 #if CONFIG_DIST_8X8
4955     int sub8x8_eob[4];
4956 #endif
4957     sum_rd_stats.rate =
4958         av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
4959 
4960     assert(tx_size < TX_SIZES_ALL);
4961 
4962     ref_best_rd = AOMMIN(this_rd, ref_best_rd);
4963 
4964     for (i = 0; i < 4 && this_cost_valid; ++i) {
4965       int offsetr = blk_row + (i >> 1) * bsl;
4966       int offsetc = blk_col + (i & 0x01) * bsl;
4967 
4968       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
4969 
4970       select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs,
4971                       depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
4972                       &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid);
4973 #if CONFIG_DIST_8X8
4974       if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) {
4975         sub8x8_eob[i] = p->eobs[block];
4976       }
4977 #endif  // CONFIG_DIST_8X8
4978       av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
4979 
4980       tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
4981 #if CONFIG_DIST_8X8
4982       if (!x->using_dist_8x8)
4983 #endif
4984         if (this_rd < tmp_rd) break;
4985       block += sub_step;
4986     }
4987 #if CONFIG_DIST_8X8
4988     if (x->using_dist_8x8 && this_cost_valid && plane == 0 &&
4989         tx_size == TX_8X8) {
4990       const int src_stride = p->src.stride;
4991       const int dst_stride = pd->dst.stride;
4992 
4993       const uint8_t *src =
4994           &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
4995       const uint8_t *dst =
4996           &pd->dst
4997                .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
4998 
4999       int64_t dist_8x8;
5000       int qindex = x->qindex;
5001       const int pred_stride = block_size_wide[plane_bsize];
5002       const int pred_idx = (blk_row * pred_stride + blk_col)
5003                            << tx_size_wide_log2[0];
5004       int16_t *pred = &pd->pred[pred_idx];
5005       int j;
5006       int row, col;
5007 
5008 #if CONFIG_HIGHBITDEPTH
5009       uint8_t *pred8;
5010       DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
5011 #else
5012       DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
5013 #endif  // CONFIG_HIGHBITDEPTH
5014 
5015       dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
5016                               BLOCK_8X8, 8, 8, 8, 8, qindex) *
5017                  16;
5018       sum_rd_stats.sse = dist_8x8;
5019 
5020 #if CONFIG_HIGHBITDEPTH
5021       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
5022         pred8 = CONVERT_TO_BYTEPTR(pred8_16);
5023       else
5024         pred8 = (uint8_t *)pred8_16;
5025 #endif
5026 
5027 #if CONFIG_HIGHBITDEPTH
5028       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
5029         for (row = 0; row < 2; ++row) {
5030           for (col = 0; col < 2; ++col) {
5031             int idx = row * 2 + col;
5032             int eob = sub8x8_eob[idx];
5033 
5034             if (eob > 0) {
5035               for (j = 0; j < 4; j++)
5036                 for (i = 0; i < 4; i++)
5037                   CONVERT_TO_SHORTPTR(pred8)
5038                   [(row * 4 + j) * 8 + 4 * col + i] =
5039                       pred[(row * 4 + j) * pred_stride + 4 * col + i];
5040             } else {
5041               for (j = 0; j < 4; j++)
5042                 for (i = 0; i < 4; i++)
5043                   CONVERT_TO_SHORTPTR(pred8)
5044                   [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
5045                       dst)[(row * 4 + j) * dst_stride + 4 * col + i];
5046             }
5047           }
5048         }
5049       } else {
5050 #endif
5051         for (row = 0; row < 2; ++row) {
5052           for (col = 0; col < 2; ++col) {
5053             int idx = row * 2 + col;
5054             int eob = sub8x8_eob[idx];
5055 
5056             if (eob > 0) {
5057               for (j = 0; j < 4; j++)
5058                 for (i = 0; i < 4; i++)
5059                   pred8[(row * 4 + j) * 8 + 4 * col + i] =
5060                       (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
5061             } else {
5062               for (j = 0; j < 4; j++)
5063                 for (i = 0; i < 4; i++)
5064                   pred8[(row * 4 + j) * 8 + 4 * col + i] =
5065                       dst[(row * 4 + j) * dst_stride + 4 * col + i];
5066             }
5067           }
5068         }
5069 #if CONFIG_HIGHBITDEPTH
5070       }
5071 #endif  // CONFIG_HIGHBITDEPTH
5072       dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8,
5073                               8, 8, 8, qindex) *
5074                  16;
5075       sum_rd_stats.dist = dist_8x8;
5076       tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
5077     }
5078 #endif  // CONFIG_DIST_8X8
5079     if (this_cost_valid) sum_rd = tmp_rd;
5080   }
5081 
5082   if (this_rd < sum_rd) {
5083     int idx, idy;
5084 #if CONFIG_RECT_TX_EXT
5085     TX_SIZE tx_size_selected = is_qttx_picked ? quarter_txsize : tx_size;
5086 #else
5087     TX_SIZE tx_size_selected = tx_size;
5088 #endif
5089 
5090 #if CONFIG_RECT_TX_EXT
5091     if (is_qttx_picked) {
5092       assert(blk_row == 0 && blk_col == 0 && plane == 0);
5093 #if CONFIG_LV_MAP
5094       p->txb_entropy_ctx[0] = eobs_qttx[0];
5095       p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1];
5096 #else
5097       p->eobs[0] = eobs_qttx[0];
5098       p->eobs[block_offset_qttx] = eobs_qttx[1];
5099 #endif
5100     } else {
5101 #endif
5102 #if CONFIG_LV_MAP
5103       p->txb_entropy_ctx[block] = tmp_eob;
5104 #else
5105     p->eobs[block] = tmp_eob;
5106 #endif
5107 #if CONFIG_RECT_TX_EXT
5108     }
5109 #endif
5110 
5111 #if !CONFIG_PVQ
5112     av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl);
5113 #if CONFIG_RECT_TX_EXT
5114     if (is_qttx_picked)
5115       av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected,
5116                           pta + blk_col_offset, ptl + blk_row_offset);
5117 #endif  // CONFIG_RECT_TX_EXT
5118 #endif  // !CONFIG_PVQ
5119 
5120     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
5121                           tx_size);
5122     inter_tx_size[0][0] = tx_size_selected;
5123     for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
5124       for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
5125         inter_tx_size[idy][idx] = tx_size_selected;
5126     mbmi->tx_size = tx_size_selected;
5127 #if CONFIG_TXK_SEL
5128     mbmi->txk_type[txk_idx] = best_tx_type;
5129 #endif
5130     if (this_rd == INT64_MAX) *is_cost_valid = 0;
5131 #if CONFIG_RECT_TX_EXT
5132     if (is_qttx_picked) {
5133       x->blk_skip[plane][0] = skip_qttx[0];
5134       x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1];
5135     } else {
5136 #endif
5137       x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
5138 #if CONFIG_RECT_TX_EXT
5139     }
5140 #endif
5141   } else {
5142     *rd_stats = sum_rd_stats;
5143     if (sum_rd == INT64_MAX) *is_cost_valid = 0;
5144   }
5145 }
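#if 0
// Illustrative sketch (not part of the encoder): the recursion in
// select_tx_block() above, stripped to its decision structure. Each level
// prices the whole transform against the sum of its four sub-transforms
// plus the partition flag, keeping the cheaper option; "size - 1" stands in
// for sub_tx_size_map[tx_size], and cost_whole() is a hypothetical stand-in
// for the per-block RD evaluation.
static int64_t sketch_select_tx_block(int size, int depth, int max_depth,
                                      int64_t (*cost_whole)(int size),
                                      int64_t split_flag_cost) {
  const int64_t this_rd = cost_whole(size);
  if (size == 0 || depth >= max_depth) return this_rd;  // cannot split
  int64_t sum_rd = split_flag_cost;
  for (int i = 0; i < 4; ++i)
    sum_rd += sketch_select_tx_block(size - 1, depth + 1, max_depth,
                                     cost_whole, split_flag_cost);
  return AOMMIN(this_rd, sum_rd);
}
#endif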
5146 
5147 static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
5148                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
5149                             int64_t ref_best_rd) {
5150   MACROBLOCKD *const xd = &x->e_mbd;
5151   int is_cost_valid = 1;
5152   int64_t this_rd = 0;
5153 
5154   if (ref_best_rd < 0) is_cost_valid = 0;
5155 
5156   av1_init_rd_stats(rd_stats);
5157 
5158   if (is_cost_valid) {
5159     const struct macroblockd_plane *const pd = &xd->plane[0];
5160     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
5161     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
5162     const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
5163     const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
5164     const int bh = tx_size_high_unit[max_tx_size];
5165     const int bw = tx_size_wide_unit[max_tx_size];
5166     int idx, idy;
5167     int block = 0;
5168     int init_depth =
5169         (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
5170     int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
5171     ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
5172     ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
5173     TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2];
5174     TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2];
5175 
5176     RD_STATS pn_rd_stats;
5177     av1_init_rd_stats(&pn_rd_stats);
5178 
5179     av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl);
5180     memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
5181     memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
5182 
5183     for (idy = 0; idy < mi_height; idy += bh) {
5184       for (idx = 0; idx < mi_width; idx += bw) {
5185         select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth,
5186                         plane_bsize, ctxa, ctxl, tx_above, tx_left,
5187                         &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid);
5188         if (pn_rd_stats.rate == INT_MAX) {
5189           av1_invalid_rd_stats(rd_stats);
5190           return;
5191         }
5192         av1_merge_rd_stats(rd_stats, &pn_rd_stats);
5193         this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
5194                           RDCOST(x->rdmult, 0, pn_rd_stats.sse));
5195         block += step;
5196       }
5197     }
5198   }
5199 
5200   this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
5201                    RDCOST(x->rdmult, 0, rd_stats->sse));
5202   if (this_rd > ref_best_rd) is_cost_valid = 0;
5203 
5204   if (!is_cost_valid) {
5205     // reset cost value
5206     av1_invalid_rd_stats(rd_stats);
5207   }
5208 }
5209 
5210 static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
5211                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
5212                                        int64_t ref_best_rd, TX_TYPE tx_type) {
5213   const AV1_COMMON *const cm = &cpi->common;
5214   MACROBLOCKD *const xd = &x->e_mbd;
5215   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5216   const int is_inter = is_inter_block(mbmi);
5217   aom_prob skip_prob = av1_get_skip_prob(cm, xd);
5218   int s0 = av1_cost_bit(skip_prob, 0);
5219   int s1 = av1_cost_bit(skip_prob, 1);
5220   int64_t rd;
5221   int row, col;
5222   const int max_blocks_high = max_block_high(xd, bsize, 0);
5223   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
5224 
5225   mbmi->tx_type = tx_type;
5226   inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd);
5227   mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]);
5228 
5229   if (rd_stats->rate == INT_MAX) return INT64_MAX;
5230 
5231   for (row = 0; row < max_blocks_high / 2; ++row)
5232     for (col = 0; col < max_blocks_wide / 2; ++col)
5233       mbmi->min_tx_size = AOMMIN(
5234           mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
5235 
5236 #if !CONFIG_TXK_SEL
5237 #if CONFIG_EXT_TX
5238   if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter,
5239                        cm->reduced_tx_set_used) > 1 &&
5240       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
5241     const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
5242                                           cm->reduced_tx_set_used);
5243 #if CONFIG_LGT_FROM_PRED
5244     if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) {
5245       if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
5246           ALLOW_INTRA_EXT_TX)
5247         rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]]
5248                                            [mbmi->mode][mbmi->use_lgt];
5249       if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0)
5250         rd_stats->rate +=
5251             x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt];
5252     }
5253     if (!mbmi->use_lgt) {
5254 #endif  // CONFIG_LGT_FROM_PRED
5255       if (is_inter) {
5256         if (ext_tx_set > 0)
5257           rd_stats->rate +=
5258               x->inter_tx_type_costs[ext_tx_set]
5259                                     [txsize_sqr_map[mbmi->min_tx_size]]
5260                                     [mbmi->tx_type];
5261       } else {
5262         if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
5263           rd_stats->rate +=
5264               x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
5265                                     [mbmi->tx_type];
5266       }
5267     }
5268 #if CONFIG_LGT_FROM_PRED
5269   }
5270 #endif
5271 #else
5272   if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
5273     rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
5274 #endif  // CONFIG_EXT_TX
5275 #endif  // CONFIG_TXK_SEL
5276 
5277   if (rd_stats->skip)
5278     rd = RDCOST(x->rdmult, s1, rd_stats->sse);
5279   else
5280     rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
5281 
5282   if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
5283       !(rd_stats->skip))
5284     rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
5285 
5286   return rd;
5287 }
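#if 0
// Illustrative sketch (not part of the encoder): the final skip decision at
// the end of select_tx_size_fix_type() above. s0/s1 are the costs of
// signalling "not skipped"/"skipped". A coded block pays its token rate
// plus s0; for inter blocks the encoder may still force skip when that is
// cheaper (the real code also excludes lossless segments from this).
static int64_t sketch_skip_rd(int rdmult, int s0, int s1,
                              const RD_STATS *rd_stats, int is_inter) {
  int64_t rd;
  if (rd_stats->skip)
    rd = RDCOST(rdmult, s1, rd_stats->sse);
  else
    rd = RDCOST(rdmult, rd_stats->rate + s0, rd_stats->dist);
  if (is_inter && !rd_stats->skip)
    rd = AOMMIN(rd, RDCOST(rdmult, s1, rd_stats->sse));
  return rd;
}
#endif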
5288 
5289 static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
5290   const int rows = block_size_high[bsize];
5291   const int cols = block_size_wide[bsize];
5292   const int diff_stride = cols;
5293   const struct macroblock_plane *const p = &x->plane[0];
5294   const int16_t *diff = &p->src_diff[0];
5295   uint8_t hash_data[MAX_SB_SQUARE];
5296   for (int r = 0; r < rows; ++r) {
5297     for (int c = 0; c < cols; ++c) {
5298       hash_data[cols * r + c] = clip_pixel(diff[c] + 128);
5299     }
5300     diff += diff_stride;
5301   }
5302   return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data,
5303                             rows * cols)
5304           << 7) +
5305          bsize;
5306 }
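
/* Layout of the hash returned above (illustrative): the CRC of the clipped
 * residual occupies the high bits and the block size the low 7 bits, so two
 * records can only collide when both the residual and bsize agree. E.g.,
 * assuming a CRC value of 0x1234 and bsize == 12:
 *
 *   (0x1234 << 7) + 12 == 0x91A0C
 */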
5307 
5308 static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
5309                             const RD_STATS *const rd_stats,
5310                             TX_RD_INFO *const tx_rd_info) {
5311   const MACROBLOCKD *const xd = &x->e_mbd;
5312   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5313   tx_rd_info->hash_value = hash;
5314   tx_rd_info->tx_type = mbmi->tx_type;
5315   tx_rd_info->tx_size = mbmi->tx_size;
5316 #if CONFIG_VAR_TX
5317   tx_rd_info->min_tx_size = mbmi->min_tx_size;
5318   memcpy(tx_rd_info->blk_skip, x->blk_skip[0],
5319          sizeof(tx_rd_info->blk_skip[0]) * n4);
5320   for (int idy = 0; idy < xd->n8_h; ++idy)
5321     for (int idx = 0; idx < xd->n8_w; ++idx)
5322       tx_rd_info->inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
5323 #endif  // CONFIG_VAR_TX
5324 #if CONFIG_TXK_SEL
5325   av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
5326 #endif  // CONFIG_TXK_SEL
5327   tx_rd_info->rd_stats = *rd_stats;
5328 }
5329 
5330 static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info,
5331                              RD_STATS *const rd_stats, MACROBLOCK *const x) {
5332   MACROBLOCKD *const xd = &x->e_mbd;
5333   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5334   mbmi->tx_type = tx_rd_info->tx_type;
5335   mbmi->tx_size = tx_rd_info->tx_size;
5336 #if CONFIG_VAR_TX
5337   mbmi->min_tx_size = tx_rd_info->min_tx_size;
5338   memcpy(x->blk_skip[0], tx_rd_info->blk_skip,
5339          sizeof(tx_rd_info->blk_skip[0]) * n4);
5340   for (int idy = 0; idy < xd->n8_h; ++idy)
5341     for (int idx = 0; idx < xd->n8_w; ++idx)
5342       mbmi->inter_tx_size[idy][idx] = tx_rd_info->inter_tx_size[idy][idx];
5343 #endif  // CONFIG_VAR_TX
5344 #if CONFIG_TXK_SEL
5345   av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
5346 #endif  // CONFIG_TXK_SEL
5347   *rd_stats = tx_rd_info->rd_stats;
5348 }
5349 
5350 // Uses simple features on top of DCT coefficients to quickly predict
5351 // whether the optimal RD decision is to skip encoding the residual.
5352 static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) {
5353   if (bsize > BLOCK_16X16) return 0;
5354   // Tuned for a target false-positive rate of 5% for all block sizes:
5355   const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 };
5356   const struct macroblock_plane *const p = &x->plane[0];
5357   const int bw = block_size_wide[bsize];
5358   const int bh = block_size_high[bsize];
5359   tran_low_t DCT_coefs[32 * 32];
5360   TxfmParam param;
5361   param.tx_type = DCT_DCT;
5362 #if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
5363   param.tx_size = max_txsize_rect_lookup[bsize];
5364 #else
5365   param.tx_size = max_txsize_lookup[bsize];
5366 #endif
5367   param.bd = 8;
5368   param.lossless = 0;
5369   av1_fwd_txfm(p->src_diff, DCT_coefs, bw, &param);
5370 
5371   uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8);
5372   uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8);
5373   uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc;
5374   for (int i = 1; i < bw * bh; i++) {
5375     uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac;
5376     if (cur_quantized_coef > max_quantized_coef)
5377       max_quantized_coef = cur_quantized_coef;
5378   }
5379 
5380   return max_quantized_coef < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)];
5381 }
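
/* Worked example for the predictor above (numbers are illustrative): with
 * DCT_coefs[0] == 90 and a DC quantizer step of 200, the scaled magnitude is
 * 100 * 90 / 200 == 45. If no AC coefficient scales higher and the table
 * threshold for this block size is 50, every coefficient is expected to
 * quantize to zero and the block is predicted to skip.
 */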
5382 
5383 // Used to set proper context for early termination with skip = 1.
5384 static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x,
5385                           RD_STATS *rd_stats, int bsize) {
5386   MACROBLOCKD *const xd = &x->e_mbd;
5387   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5388   const int n4 = bsize_to_num_blk(bsize);
5389 #if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
5390   const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
5391 #else
5392   const TX_SIZE tx_size = max_txsize_lookup[bsize];
5393 #endif
5394   mbmi->tx_type = DCT_DCT;
5395   for (int idy = 0; idy < xd->n8_h; ++idy)
5396     for (int idx = 0; idx < xd->n8_w; ++idx)
5397       mbmi->inter_tx_size[idy][idx] = tx_size;
5398   mbmi->tx_size = tx_size;
5399   mbmi->min_tx_size = get_min_tx_size(tx_size);
5400   memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4);
5401   rd_stats->skip = 1;
5402 
5403   // Rate.
5404   const int tx_size_ctx = txsize_sqr_map[tx_size];
5405   ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
5406   ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
5407   av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl);
5408   int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl);
5409   int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0];
5410   if (tx_size > TX_4X4) {
5411     int ctx = txfm_partition_context(
5412         xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
5413     rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
5414   }
5415 #if !CONFIG_TXK_SEL
5416 #if CONFIG_EXT_TX
5417   const AV1_COMMON *cm = &cpi->common;
5418   const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1,
5419                                         cm->reduced_tx_set_used);
5420   if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) >
5421           1 &&
5422       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
5423     if (ext_tx_set > 0)
5424       rate +=
5425           x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]]
5426                                 [mbmi->tx_type];
5427   }
5428 #else
5429   if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
5430     rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
5431 #endif  // CONFIG_EXT_TX
5432 #endif  // CONFIG_TXK_SEL
5433   rd_stats->rate = rate;
5434 
5435   // Distortion.
5436   int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff,
5437                                 block_size_wide[bsize], 0, 0, bsize, bsize);
5438 #if CONFIG_HIGHBITDEPTH
5439   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
5440     tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
5441 #endif  // CONFIG_HIGHBITDEPTH
5442   rd_stats->dist = rd_stats->sse = (tmp << 4);
5443 }
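
/* The << 4 above scales the pixel-domain SSE by 16, which appears to match
 * the transform-domain distortion scale used with RDCOST elsewhere in this
 * file; rd_stats->dist and rd_stats->sse are kept equal because a skipped
 * block reconstructs to the prediction.
 */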
5444 
5445 static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
5446                                RD_STATS *rd_stats, BLOCK_SIZE bsize,
5447                                int64_t ref_best_rd) {
5448   const AV1_COMMON *cm = &cpi->common;
5449   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
5450   MACROBLOCKD *const xd = &x->e_mbd;
5451   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5452   int64_t rd = INT64_MAX;
5453   int64_t best_rd = INT64_MAX;
5454   TX_TYPE tx_type, best_tx_type = DCT_DCT;
5455   const int is_inter = is_inter_block(mbmi);
5456   TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
5457   TX_SIZE best_tx = max_txsize_lookup[bsize];
5458   TX_SIZE best_min_tx_size = TX_SIZES_ALL;
5459   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
5460   TX_TYPE txk_start = DCT_DCT;
5461 #if CONFIG_TXK_SEL
5462   TX_TYPE txk_end = DCT_DCT + 1;
5463 #else
5464   TX_TYPE txk_end = TX_TYPES;
5465 #endif
5466   const int n4 = bsize_to_num_blk(bsize);
5467   int idx, idy;
5468   int prune = 0;
5469 #if CONFIG_EXT_TX
5470   const TxSetType tx_set_type = get_ext_tx_set_type(
5471       max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
5472   const int ext_tx_set =
5473       get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
5474 #endif  // CONFIG_EXT_TX
5475 
5476   av1_invalid_rd_stats(rd_stats);
5477 
5478 #if CONFIG_LGT_FROM_PRED
5479   mbmi->use_lgt = 0;
5480   int search_lgt = is_inter
5481                        ? LGT_FROM_PRED_INTER &&
5482                              !(cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
5483                        : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX;
5484 #endif  // CONFIG_LGT_FROM_PRED
5485 
5486   const uint32_t hash = get_block_residue_hash(x, bsize);
5487   TX_RD_RECORD *tx_rd_record = &x->tx_rd_record;
5488 
5489   if (ref_best_rd != INT64_MAX) {
5490     for (int i = 0; i < tx_rd_record->num; ++i) {
5491       const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
5492       // If there is a match in the tx_rd_record, fetch the RD decision and
5493       // terminate early.
5494       if (tx_rd_record->tx_rd_info[index].hash_value == hash) {
5495         TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index];
5496         fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
5497         return;
5498       }
5499     }
5500   }
5501 
5502 // If we predict that skip is the optimal RD decision, set the respective
5503 // context and terminate early.
5504 #if CONFIG_HIGHBITDEPTH
5505   if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
5506 #endif  // CONFIG_HIGHBITDEPTH
5507   {
5508     if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
5509         predict_skip_flag_8bit(x, bsize)) {
5510       set_skip_flag(cpi, x, rd_stats, bsize);
5511       return;
5512     }
5513   }
5514 
5515   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
5516 #if CONFIG_EXT_TX
5517     prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
5518 #else
5519     prune = prune_tx_types(cpi, bsize, x, xd, 0);
5520 #endif  // CONFIG_EXT_TX
5521 
5522   int found = 0;
5523 
5524   for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
5525     RD_STATS this_rd_stats;
5526     av1_init_rd_stats(&this_rd_stats);
5527 #if CONFIG_MRC_TX
5528     // MRC_DCT is only implemented for TX_32X32, so only include this tx
5529     // type in the search for TX_32X32.
5530     if (tx_type == MRC_DCT &&
5531         (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) ||
5532          (!is_inter && !USE_MRC_INTRA)))
5533       continue;
5534 #endif  // CONFIG_MRC_TX
5535 #if CONFIG_EXT_TX
5536     if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
5537     if (is_inter) {
5538       if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
5539         if (!do_tx_type_search(tx_type, prune)) continue;
5540       }
5541     } else {
5542       if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
5543         if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
5544       }
5545     }
5546 #else   // CONFIG_EXT_TX
5547     if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
5548         !do_tx_type_search(tx_type, prune))
5549       continue;
5550 #endif  // CONFIG_EXT_TX
5551     if (is_inter && x->use_default_inter_tx_type &&
5552         tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
5553       continue;
5554 
5555     if (xd->lossless[mbmi->segment_id])
5556       if (tx_type != DCT_DCT) continue;
5557 
5558     rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
5559                                  tx_type);
5560     ref_best_rd = AOMMIN(rd, ref_best_rd);
5561     if (rd < best_rd) {
5562       best_rd = rd;
5563       *rd_stats = this_rd_stats;
5564       best_tx_type = mbmi->tx_type;
5565       best_tx = mbmi->tx_size;
5566       best_min_tx_size = mbmi->min_tx_size;
5567       memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
5568       found = 1;
5569       for (idy = 0; idy < xd->n8_h; ++idy)
5570         for (idx = 0; idx < xd->n8_w; ++idx)
5571           best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
5572     }
5573   }
5574 
5575   // We should always find at least one candidate unless ref_best_rd is less
5576   // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
5577   // might have failed to find something better)
5578   assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
5579   if (!found) return;
5580 
5581 #if CONFIG_LGT_FROM_PRED
5582   if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) &&
5583       !cm->reduced_tx_set_used) {
5584     RD_STATS this_rd_stats;
5585     mbmi->use_lgt = 1;
5586     rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0);
5587     if (rd < best_rd) {
5588       best_rd = rd;
5589       *rd_stats = this_rd_stats;
5590       best_tx = mbmi->tx_size;
5591       best_min_tx_size = mbmi->min_tx_size;
5592       memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
5593       for (idy = 0; idy < xd->n8_h; ++idy)
5594         for (idx = 0; idx < xd->n8_w; ++idx)
5595           best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
5596     } else {
5597       mbmi->use_lgt = 0;
5598     }
5599   }
5600 #endif  // CONFIG_LGT_FROM_PRED
5601   // We found a candidate transform to use. Copy our results from the "best"
5602   // array into mbmi.
5603   mbmi->tx_type = best_tx_type;
5604   for (idy = 0; idy < xd->n8_h; ++idy)
5605     for (idx = 0; idx < xd->n8_w; ++idx)
5606       mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
5607   mbmi->tx_size = best_tx;
5608   mbmi->min_tx_size = best_min_tx_size;
5609   memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
5610 
5611   // Save the RD search results into tx_rd_record.
5612   int index;
5613   if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
5614     index =
5615         (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
5616     ++tx_rd_record->num;
5617   } else {
5618     index = tx_rd_record->index_start;
5619     tx_rd_record->index_start =
5620         (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
5621   }
5622   save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]);
5623 }
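
/* Sketch of the tx_rd_record ring buffer maintained above (illustrative;
 * assumes RD_RECORD_BUFFER_LEN == 8, the real value lives in the header):
 *
 *   not full: write at (index_start + num) % 8, then ++num
 *   full:     overwrite the oldest slot at index_start, then advance
 *             index_start = (index_start + 1) % 8
 *
 * so the lookup at the top of select_tx_type_yrd() always scans the num
 * most recent entries starting from index_start.
 */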
5624 
5625 static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
5626                         int blk_col, int plane, int block, TX_SIZE tx_size,
5627                         BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
5628                         ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
5629   MACROBLOCKD *const xd = &x->e_mbd;
5630   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5631   struct macroblockd_plane *const pd = &xd->plane[plane];
5632   BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
5633   const int tx_row = blk_row >> (1 - pd->subsampling_y);
5634   const int tx_col = blk_col >> (1 - pd->subsampling_x);
5635   TX_SIZE plane_tx_size;
5636   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
5637   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
5638 
5639   assert(tx_size < TX_SIZES_ALL);
5640 
5641   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
5642 
5643   plane_tx_size =
5644       plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
5645             : mbmi->inter_tx_size[tx_row][tx_col];
5646 
5647   if (tx_size == plane_tx_size) {
5648     ENTROPY_CONTEXT *ta = above_ctx + blk_col;
5649     ENTROPY_CONTEXT *tl = left_ctx + blk_row;
5650     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
5651                       plane_bsize, ta, tl, rd_stats);
5652 #if !CONFIG_PVQ
5653     av1_set_txb_context(x, plane, block, tx_size, ta, tl);
5654 #endif  // !CONFIG_PVQ
5655   } else {
5656     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
5657     const int bsl = tx_size_wide_unit[sub_txs];
5658     int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
5659     int i;
5660 
5661     assert(bsl > 0);
5662 
5663     for (i = 0; i < 4; ++i) {
5664       int offsetr = blk_row + (i >> 1) * bsl;
5665       int offsetc = blk_col + (i & 0x01) * bsl;
5666 
5667       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
5668 
5669       tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
5670                   above_ctx, left_ctx, rd_stats);
5671       block += step;
5672     }
5673   }
5674 }
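
/* Recursion pattern of tx_block_rd() (descriptive): when tx_size does not
 * match the signalled plane_tx_size, the block splits into four children of
 * size sub_txs, visited in raster order:
 *
 *   i == 0 -> (blk_row,       blk_col)
 *   i == 1 -> (blk_row,       blk_col + bsl)
 *   i == 2 -> (blk_row + bsl, blk_col)
 *   i == 3 -> (blk_row + bsl, blk_col + bsl)
 *
 * with the coefficient index advanced by `step` per visited child.
 */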
5675 
5676 // Return value 0: early termination triggered, no valid rd cost available;
5677 //              1: rd cost values are valid.
5678 static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
5679                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
5680                             int64_t ref_best_rd) {
5681   MACROBLOCKD *const xd = &x->e_mbd;
5682   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5683   int plane;
5684   int is_cost_valid = 1;
5685   int64_t this_rd;
5686 
5687   if (ref_best_rd < 0) is_cost_valid = 0;
5688 
5689   av1_init_rd_stats(rd_stats);
5690 
5691 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
5692   if (x->skip_chroma_rd) return is_cost_valid;
5693   bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x,
5694                              xd->plane[1].subsampling_y);
5695 #endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
5696 
5697 #if CONFIG_EXT_TX && CONFIG_RECT_TX
5698   if (is_rect_tx(mbmi->tx_size)) {
5699     return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
5700   }
5701 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
5702 
5703   if (is_inter_block(mbmi) && is_cost_valid) {
5704     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
5705       av1_subtract_plane(x, bsize, plane);
5706   }
5707 
5708   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
5709     const struct macroblockd_plane *const pd = &xd->plane[plane];
5710     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
5711     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
5712     const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
5713     const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
5714     const int bh = tx_size_high_unit[max_tx_size];
5715     const int bw = tx_size_wide_unit[max_tx_size];
5716     int idx, idy;
5717     int block = 0;
5718     const int step = bh * bw;
5719     ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
5720     ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
5721     RD_STATS pn_rd_stats;
5722     av1_init_rd_stats(&pn_rd_stats);
5723 
5724     av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
5725 
5726     for (idy = 0; idy < mi_height; idy += bh) {
5727       for (idx = 0; idx < mi_width; idx += bw) {
5728         tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
5729                     ta, tl, &pn_rd_stats);
5730         block += step;
5731       }
5732     }
5733 
5734     if (pn_rd_stats.rate == INT_MAX) {
5735       is_cost_valid = 0;
5736       break;
5737     }
5738 
5739     av1_merge_rd_stats(rd_stats, &pn_rd_stats);
5740 
5741     this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
5742                      RDCOST(x->rdmult, 0, rd_stats->sse));
5743 
5744     if (this_rd > ref_best_rd) {
5745       is_cost_valid = 0;
5746       break;
5747     }
5748   }
5749 
5750   if (!is_cost_valid) {
5751     // reset cost value
5752     av1_invalid_rd_stats(rd_stats);
5753   }
5754 
5755   return is_cost_valid;
5756 }
5757 #endif  // CONFIG_VAR_TX
5758 
5759 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
5760                                        int dc_mode_cost,
5761                                        uint8_t *best_palette_color_map,
5762                                        MB_MODE_INFO *const best_mbmi,
5763                                        int64_t *best_rd, int *rate,
5764                                        int *rate_tokenonly, int64_t *distortion,
5765                                        int *skippable) {
5766   MACROBLOCKD *const xd = &x->e_mbd;
5767   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
5768   assert(!is_inter_block(mbmi));
5769   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
5770   const BLOCK_SIZE bsize = mbmi->sb_type;
5771   assert(bsize >= BLOCK_8X8);
5772   int this_rate;
5773   int64_t this_rd;
5774   int colors_u, colors_v, colors;
5775   const int src_stride = x->plane[1].src.stride;
5776   const uint8_t *const src_u = x->plane[1].src.buf;
5777   const uint8_t *const src_v = x->plane[2].src.buf;
5778   uint8_t *const color_map = xd->plane[1].color_index_map;
5779   RD_STATS tokenonly_rd_stats;
5780   int plane_block_width, plane_block_height, rows, cols;
5781   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
5782                            &plane_block_height, &rows, &cols);
5783   if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
5784 
5785   mbmi->uv_mode = UV_DC_PRED;
5786 #if CONFIG_FILTER_INTRA
5787   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
5788 #endif  // CONFIG_FILTER_INTRA
5789 
5790 #if CONFIG_HIGHBITDEPTH
5791   if (cpi->common.use_highbitdepth) {
5792     colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
5793                                        cpi->common.bit_depth);
5794     colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
5795                                        cpi->common.bit_depth);
5796   } else {
5797 #endif  // CONFIG_HIGHBITDEPTH
5798     colors_u = av1_count_colors(src_u, src_stride, rows, cols);
5799     colors_v = av1_count_colors(src_v, src_stride, rows, cols);
5800 #if CONFIG_HIGHBITDEPTH
5801   }
5802 #endif  // CONFIG_HIGHBITDEPTH
5803 
5804 #if CONFIG_PALETTE_DELTA_ENCODING
5805   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
5806   const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
5807 #endif  // CONFIG_PALETTE_DELTA_ENCODING
5808 
5809   colors = colors_u > colors_v ? colors_u : colors_v;
5810   if (colors > 1 && colors <= 64) {
5811     int r, c, n, i, j;
5812     const int max_itr = 50;
5813     float lb_u, ub_u, val_u;
5814     float lb_v, ub_v, val_v;
5815     float *const data = x->palette_buffer->kmeans_data_buf;
5816     float centroids[2 * PALETTE_MAX_SIZE];
5817 
5818 #if CONFIG_HIGHBITDEPTH
5819     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
5820     uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
5821     if (cpi->common.use_highbitdepth) {
5822       lb_u = src_u16[0];
5823       ub_u = src_u16[0];
5824       lb_v = src_v16[0];
5825       ub_v = src_v16[0];
5826     } else {
5827 #endif  // CONFIG_HIGHBITDEPTH
5828       lb_u = src_u[0];
5829       ub_u = src_u[0];
5830       lb_v = src_v[0];
5831       ub_v = src_v[0];
5832 #if CONFIG_HIGHBITDEPTH
5833     }
5834 #endif  // CONFIG_HIGHBITDEPTH
5835 
5836     for (r = 0; r < rows; ++r) {
5837       for (c = 0; c < cols; ++c) {
5838 #if CONFIG_HIGHBITDEPTH
5839         if (cpi->common.use_highbitdepth) {
5840           val_u = src_u16[r * src_stride + c];
5841           val_v = src_v16[r * src_stride + c];
5842           data[(r * cols + c) * 2] = val_u;
5843           data[(r * cols + c) * 2 + 1] = val_v;
5844         } else {
5845 #endif  // CONFIG_HIGHBITDEPTH
5846           val_u = src_u[r * src_stride + c];
5847           val_v = src_v[r * src_stride + c];
5848           data[(r * cols + c) * 2] = val_u;
5849           data[(r * cols + c) * 2 + 1] = val_v;
5850 #if CONFIG_HIGHBITDEPTH
5851         }
5852 #endif  // CONFIG_HIGHBITDEPTH
5853         if (val_u < lb_u)
5854           lb_u = val_u;
5855         else if (val_u > ub_u)
5856           ub_u = val_u;
5857         if (val_v < lb_v)
5858           lb_v = val_v;
5859         else if (val_v > ub_v)
5860           ub_v = val_v;
5861       }
5862     }
5863 
5864     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
5865          --n) {
5866       for (i = 0; i < n; ++i) {
5867         centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
5868         centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
5869       }
5870       av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
5871 #if CONFIG_PALETTE_DELTA_ENCODING
5872       optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
5873       // Sort the U channel colors in ascending order.
5874       for (i = 0; i < 2 * (n - 1); i += 2) {
5875         int min_idx = i;
5876         float min_val = centroids[i];
5877         for (j = i + 2; j < 2 * n; j += 2)
5878           if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
5879         if (min_idx != i) {
5880           float temp_u = centroids[i], temp_v = centroids[i + 1];
5881           centroids[i] = centroids[min_idx];
5882           centroids[i + 1] = centroids[min_idx + 1];
5883           centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
5884         }
5885       }
5886       av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
5887 #endif  // CONFIG_PALETTE_DELTA_ENCODING
5888       extend_palette_color_map(color_map, cols, rows, plane_block_width,
5889                                plane_block_height);
5890       pmi->palette_size[1] = n;
5891       for (i = 1; i < 3; ++i) {
5892         for (j = 0; j < n; ++j) {
5893 #if CONFIG_HIGHBITDEPTH
5894           if (cpi->common.use_highbitdepth)
5895             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
5896                 (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
5897           else
5898 #endif  // CONFIG_HIGHBITDEPTH
5899             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
5900                 clip_pixel((int)centroids[j * 2 + i - 1]);
5901         }
5902       }
5903 
5904       super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
5905       if (tokenonly_rd_stats.rate == INT_MAX) continue;
5906       this_rate =
5907           tokenonly_rd_stats.rate + dc_mode_cost +
5908           x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
5909           write_uniform_cost(n, color_map[0]) +
5910           av1_cost_bit(
5911               av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
5912       this_rate += av1_palette_color_cost_uv(pmi,
5913 #if CONFIG_PALETTE_DELTA_ENCODING
5914                                              color_cache, n_cache,
5915 #endif  // CONFIG_PALETTE_DELTA_ENCODING
5916                                              cpi->common.bit_depth);
5917       this_rate +=
5918           av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP);
5919       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
5920       if (this_rd < *best_rd) {
5921         *best_rd = this_rd;
5922         *best_mbmi = *mbmi;
5923         memcpy(best_palette_color_map, color_map,
5924                plane_block_width * plane_block_height *
5925                    sizeof(best_palette_color_map[0]));
5926         *rate = this_rate;
5927         *distortion = tokenonly_rd_stats.dist;
5928         *rate_tokenonly = tokenonly_rd_stats.rate;
5929         *skippable = tokenonly_rd_stats.skip;
5930       }
5931     }
5932   }
5933   if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
5934     memcpy(color_map, best_palette_color_map,
5935            plane_block_width * plane_block_height *
5936                sizeof(best_palette_color_map[0]));
5937   }
5938 }
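
/* Centroid initialization example for the k-means loop above (numbers are
 * illustrative): the n starting centroids sit at the midpoints of n equal
 * sub-ranges of [lb, ub]. With lb_u == 40, ub_u == 200 and n == 4:
 *
 *   40 + (2*i + 1) * 160 / 4 / 2  ->  60, 100, 140, 180
 *
 * which av1_k_means() then refines jointly over the (U, V) pairs.
 */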
5939 
5940 #if CONFIG_FILTER_INTRA
5941 // Return 1 if a filter intra mode is selected; return 0 otherwise.
5942 static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
5943                                      int *rate, int *rate_tokenonly,
5944                                      int64_t *distortion, int *skippable,
5945                                      BLOCK_SIZE bsize, int64_t *best_rd) {
5946   MACROBLOCKD *const xd = &x->e_mbd;
5947   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
5948   int filter_intra_selected_flag = 0;
5949   int this_rate;
5950   int64_t this_rd;
5951   FILTER_INTRA_MODE mode;
5952   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
5953   RD_STATS tokenonly_rd_stats;
5954 
5955   av1_zero(filter_intra_mode_info);
5956   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
5957   mbmi->uv_mode = UV_DC_PRED;
5958   mbmi->palette_mode_info.palette_size[1] = 0;
5959 
5960   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
5961     mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
5962     if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd))
5963       continue;
5964 
5965     this_rate = tokenonly_rd_stats.rate +
5966                 av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
5967                 x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
5968                 write_uniform_cost(FILTER_INTRA_MODES, mode);
5969     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
5970     if (this_rd < *best_rd) {
5971       *best_rd = this_rd;
5972       *rate = this_rate;
5973       *rate_tokenonly = tokenonly_rd_stats.rate;
5974       *distortion = tokenonly_rd_stats.dist;
5975       *skippable = tokenonly_rd_stats.skip;
5976       filter_intra_mode_info = mbmi->filter_intra_mode_info;
5977       filter_intra_selected_flag = 1;
5978     }
5979   }
5980 
5981   if (filter_intra_selected_flag) {
5982     mbmi->uv_mode = UV_DC_PRED;
5983     mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
5984         filter_intra_mode_info.use_filter_intra_mode[1];
5985     mbmi->filter_intra_mode_info.filter_intra_mode[1] =
5986         filter_intra_mode_info.filter_intra_mode[1];
5987     return 1;
5988   } else {
5989     return 0;
5990   }
5991 }
5992 #endif  // CONFIG_FILTER_INTRA
5993 
5994 #if CONFIG_EXT_INTRA
5995 // Run RD calculation with the given chroma intra prediction angle, and return
5996 // the RD cost. Update the best mode info if the RD cost is the best so far.
5997 static int64_t pick_intra_angle_routine_sbuv(
5998     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
5999     int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
6000     int *best_angle_delta, int64_t *best_rd) {
6001   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
6002   assert(!is_inter_block(mbmi));
6003   int this_rate;
6004   int64_t this_rd;
6005   RD_STATS tokenonly_rd_stats;
6006 
6007   if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
6008     return INT64_MAX;
6009   this_rate = tokenonly_rd_stats.rate + rate_overhead;
6010   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
6011   if (this_rd < *best_rd) {
6012     *best_rd = this_rd;
6013     *best_angle_delta = mbmi->angle_delta[1];
6014     *rate = this_rate;
6015     rd_stats->rate = tokenonly_rd_stats.rate;
6016     rd_stats->dist = tokenonly_rd_stats.dist;
6017     rd_stats->skip = tokenonly_rd_stats.skip;
6018   }
6019   return this_rd;
6020 }
6021 
6022 // With the given chroma directional intra prediction mode, pick the best
6023 // angle delta. Return true if an RD cost smaller than the input one is found.
6024 static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
6025                                     BLOCK_SIZE bsize, int rate_overhead,
6026                                     int64_t best_rd, int *rate,
6027                                     RD_STATS *rd_stats) {
6028   MACROBLOCKD *const xd = &x->e_mbd;
6029   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6030   assert(!is_inter_block(mbmi));
6031   int i, angle_delta, best_angle_delta = 0;
6032   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
6033 
6034   rd_stats->rate = INT_MAX;
6035   rd_stats->skip = 0;
6036   rd_stats->dist = INT64_MAX;
6037   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
6038 
6039   for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
6040     for (i = 0; i < 2; ++i) {
6041       best_rd_in = (best_rd == INT64_MAX)
6042                        ? INT64_MAX
6043                        : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
6044       mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
6045       this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
6046                                               best_rd_in, rate, rd_stats,
6047                                               &best_angle_delta, &best_rd);
6048       rd_cost[2 * angle_delta + i] = this_rd;
6049       if (angle_delta == 0) {
6050         if (this_rd == INT64_MAX) return 0;
6051         rd_cost[1] = this_rd;
6052         break;
6053       }
6054     }
6055   }
6056 
6057   assert(best_rd != INT64_MAX);
6058   for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
6059     int64_t rd_thresh;
6060     for (i = 0; i < 2; ++i) {
6061       int skip_search = 0;
6062       rd_thresh = best_rd + (best_rd >> 5);
6063       if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
6064           rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
6065         skip_search = 1;
6066       if (!skip_search) {
6067         mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
6068         pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
6069                                       rate, rd_stats, &best_angle_delta,
6070                                       &best_rd);
6071       }
6072     }
6073   }
6074 
6075   mbmi->angle_delta[1] = best_angle_delta;
6076   return rd_stats->rate != INT_MAX;
6077 }
6078 #endif  // CONFIG_EXT_INTRA
6079 
6080 #if CONFIG_CFL
6081 static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3,
6082                                   const uint8_t *src, int src_stride, int width,
6083                                   int height, int dc_pred, int alpha_q3,
6084                                   int64_t *dist_neg_out) {
6085   int64_t dist = 0;
6086   int diff;
6087 
6088   if (alpha_q3 == 0) {
6089     for (int j = 0; j < height; j++) {
6090       for (int i = 0; i < width; i++) {
6091         diff = src[i] - dc_pred;
6092         dist += diff * diff;
6093       }
6094       src += src_stride;
6095     }
6096 
6097     if (dist_neg_out) *dist_neg_out = dist;
6098 
6099     return dist;
6100   }
6101 
6102   int64_t dist_neg = 0;
6103   for (int j = 0; j < height; j++) {
6104     for (int i = 0; i < width; i++) {
6105       const int uv = src[i];
6106       const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
6107 
6108       diff = uv - clip_pixel(scaled_luma + dc_pred);
6109       dist += diff * diff;
6110 
6111       diff = uv - clip_pixel(-scaled_luma + dc_pred);
6112       dist_neg += diff * diff;
6113     }
6114     pred_buf_q3 += MAX_SB_SIZE;
6115     src += src_stride;
6116   }
6117 
6118   if (dist_neg_out) *dist_neg_out = dist_neg;
6119 
6120   return dist;
6121 }
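
/* Sign-sharing trick above (descriptive): because the CfL prediction for
 * -alpha only negates scaled_luma, one pass over the block can accumulate
 * the distortion for both signs. E.g. with dc_pred == 128,
 * scaled_luma == 12 and a source sample uv == 133:
 *
 *   +alpha: diff = 133 - clip_pixel(128 + 12) = -7   -> dist     += 49
 *   -alpha: diff = 133 - clip_pixel(128 - 12) = 17   -> dist_neg += 289
 */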
6122 #if CONFIG_HIGHBITDEPTH
6123 static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3,
6124                                   const uint16_t *src, int src_stride,
6125                                   int width, int height, int dc_pred,
6126                                   int alpha_q3, int bit_depth,
6127                                   int64_t *dist_neg_out) {
6128   const int shift = 2 * (bit_depth - 8);
6129   const int rounding = shift > 0 ? (1 << shift) >> 1 : 0;
6130   int64_t dist = 0;
6131   int diff;
6132 
6133   if (alpha_q3 == 0) {
6134     for (int j = 0; j < height; j++) {
6135       for (int i = 0; i < width; i++) {
6136         diff = src[i] - dc_pred;
6137         dist += diff * diff;
6138       }
6139       src += src_stride;
6140     }
6141     dist = (dist + rounding) >> shift;
6142 
6143     if (dist_neg_out) *dist_neg_out = dist;
6144 
6145     return dist;
6146   }
6147 
6148   int64_t dist_neg = 0;
6149   for (int j = 0; j < height; j++) {
6150     for (int i = 0; i < width; i++) {
6151       const int uv = src[i];
6152       const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
6153 
6154       diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth);
6155       dist += diff * diff;
6156 
6157       diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth);
6158       dist_neg += diff * diff;
6159     }
6160     pred_buf_q3 += MAX_SB_SIZE;
6161     src += src_stride;
6162   }
6163 
6164   if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift;
6165 
6166   return (dist + rounding) >> shift;
6167 }
6168 #endif  // CONFIG_HIGHBITDEPTH
6169 static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src,
6170                               int src_stride, int width, int height,
6171                               int dc_pred, int alpha_q3, int use_hbd,
6172                               int bit_depth, int64_t *dist_neg_out) {
6173 #if CONFIG_HIGHBITDEPTH
6174   if (use_hbd) {
6175     const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src);
6176     return cfl_alpha_dist_hbd(pred_buf_q3, src_16, src_stride, width, height,
6177                               dc_pred, alpha_q3, bit_depth, dist_neg_out);
6178   }
6179 #endif  // CONFIG_HIGHBITDEPTH
6180   (void)use_hbd;
6181   (void)bit_depth;
6182   return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height,
6183                             dc_pred, alpha_q3, dist_neg_out);
6184 }
6185 
6186 static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
6187   const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U];
6188   const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V];
6189   const uint8_t *const src_u = p_u->src.buf;
6190   const uint8_t *const src_v = p_v->src.buf;
6191   const int src_stride_u = p_u->src.stride;
6192   const int src_stride_v = p_v->src.stride;
6193 
6194   MACROBLOCKD *const xd = &x->e_mbd;
6195   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6196 
6197   CFL_CTX *const cfl = xd->cfl;
6198   cfl_compute_parameters(xd, tx_size);
6199   const int width = cfl->uv_width;
6200   const int height = cfl->uv_height;
6201   const int dc_pred_u = cfl->dc_pred[CFL_PRED_U];
6202   const int dc_pred_v = cfl->dc_pred[CFL_PRED_V];
6203   const int16_t *pred_buf_q3 = cfl->pred_buf_q3;
6204   const int use_hbd = get_bitdepth_data_path_index(xd);
6205 
6206   int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
6207   sse[CFL_PRED_U][0] =
6208       cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
6209                      0, use_hbd, xd->bd, NULL);
6210   sse[CFL_PRED_V][0] =
6211       cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
6212                      0, use_hbd, xd->bd, NULL);
6213 
6214   for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
6215     const int m = c * 2 + 1;
6216     const int abs_alpha_q3 = c + 1;
6217     sse[CFL_PRED_U][m] = cfl_alpha_dist(
6218         pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
6219         abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]);
6220     sse[CFL_PRED_V][m] = cfl_alpha_dist(
6221         pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
6222         abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]);
6223   }
6224 
6225   int64_t dist;
6226   int64_t cost;
6227   int64_t best_cost = INT64_MAX;
6228   int best_rate = 0;
6229 
6230   // Pick the alpha signs and magnitudes that minimize RD cost over the block
6231   int ind = 0;
6232   int signs = 0;
6233 
6234   for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
6235     const int sign_u = CFL_SIGN_U(joint_sign);
6236     const int sign_v = CFL_SIGN_V(joint_sign);
6237     const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
6238     const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
6239     for (int u = 0; u < size_u; u++) {
6240       const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1;
6241       for (int v = 0; v < size_v; v++) {
6242         const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1;
6243         dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
6244                sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
6245         dist *= 16;
6246         const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] +
6247                          x->cfl_cost[joint_sign][CFL_PRED_V][v];
6248         cost = RDCOST(x->rdmult, rate, dist);
6249         if (cost < best_cost) {
6250           best_cost = cost;
6251           best_rate = rate;
6252           ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
6253           signs = joint_sign;
6254         }
6255       }
6256     }
6257   }
6258 
6259   mbmi->cfl_alpha_idx = ind;
6260   mbmi->cfl_alpha_signs = signs;
6261   return best_rate;
6262 }
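
/* Index layout used by the search above (descriptive): sse[plane][0] holds
 * the alpha == 0 distortion; for magnitude c (alpha_q3 == c + 1) the
 * positive-sign distortion sits at the odd index 2*c + 1 and the
 * negative-sign distortion at 2*c + 2, which is why the lookup adds
 * (sign == CFL_SIGN_NEG) to idx_u / idx_v.
 */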
6263 #endif  // CONFIG_CFL
6264 
6265 static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
6266   mbmi->uv_mode = UV_DC_PRED;
6267   mbmi->palette_mode_info.palette_size[1] = 0;
6268 #if CONFIG_FILTER_INTRA
6269   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
6270 #endif  // CONFIG_FILTER_INTRA
6271 }
6272 
6273 static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
6274                                        int *rate, int *rate_tokenonly,
6275                                        int64_t *distortion, int *skippable,
6276                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
6277   MACROBLOCKD *xd = &x->e_mbd;
6278   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6279   assert(!is_inter_block(mbmi));
6280   MB_MODE_INFO best_mbmi = *mbmi;
6281   int64_t best_rd = INT64_MAX, this_rd;
6282 #if CONFIG_PVQ
6283   od_rollback_buffer buf;
6284   od_encode_checkpoint(&x->daala_enc, &buf);
6285 #endif  // CONFIG_PVQ
6286   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
6287   const int try_palette =
6288       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
6289 
6290   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
6291     int this_rate;
6292     RD_STATS tokenonly_rd_stats;
6293     UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
6294 #if CONFIG_EXT_INTRA
6295     const int is_directional_mode =
6296         av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type);
6297 #endif  // CONFIG_EXT_INTRA
6298     if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
6299           (1 << mode)))
6300       continue;
6301 
6302     mbmi->uv_mode = mode;
6303 #if CONFIG_CFL
6304     int cfl_alpha_rate = 0;
6305     if (mode == UV_CFL_PRED) {
6306       assert(!is_directional_mode);
6307       const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
6308       cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
6309     }
6310 #endif
6311 #if CONFIG_EXT_INTRA
6312     mbmi->angle_delta[1] = 0;
6313     if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
6314       const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] +
6315                                 write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
6316       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
6317                                     &this_rate, &tokenonly_rd_stats))
6318         continue;
6319     } else {
6320 #endif  // CONFIG_EXT_INTRA
6321       if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
6322 #if CONFIG_PVQ
6323         od_encode_rollback(&x->daala_enc, &buf);
6324 #endif  // CONFIG_PVQ
6325         continue;
6326       }
6327 #if CONFIG_EXT_INTRA
6328     }
6329 #endif  // CONFIG_EXT_INTRA
6330     this_rate =
6331         tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode];
6332 
6333 #if CONFIG_CFL
6334     if (mode == UV_CFL_PRED) {
6335       this_rate += cfl_alpha_rate;
6336     }
6337 #endif
6338 #if CONFIG_EXT_INTRA
6339     if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
6340       this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
6341                                       MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
6342     }
6343 #endif  // CONFIG_EXT_INTRA
6344 #if CONFIG_FILTER_INTRA
6345     if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED)
6346       this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
6347 #endif  // CONFIG_FILTER_INTRA
6348     if (try_palette && mode == UV_DC_PRED)
6349       this_rate += av1_cost_bit(
6350           av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
6351 
6352 #if CONFIG_PVQ
6353     od_encode_rollback(&x->daala_enc, &buf);
6354 #endif  // CONFIG_PVQ
6355     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
6356 
6357     if (this_rd < best_rd) {
6358       best_mbmi = *mbmi;
6359       best_rd = this_rd;
6360       *rate = this_rate;
6361       *rate_tokenonly = tokenonly_rd_stats.rate;
6362       *distortion = tokenonly_rd_stats.dist;
6363       *skippable = tokenonly_rd_stats.skip;
6364     }
6365   }
6366 
6367   if (try_palette) {
6368     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
6369     rd_pick_palette_intra_sbuv(cpi, x,
6370                                x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED],
6371                                best_palette_color_map, &best_mbmi, &best_rd,
6372                                rate, rate_tokenonly, distortion, skippable);
6373   }
6374 
6375 #if CONFIG_FILTER_INTRA
6376   if (mbmi->sb_type >= BLOCK_8X8) {
6377     if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
6378                                   skippable, bsize, &best_rd))
6379       best_mbmi = *mbmi;
6380   }
6381 #endif  // CONFIG_FILTER_INTRA
6382 
6383   *mbmi = best_mbmi;
6384   // Make sure we actually chose a mode
6385   assert(best_rd < INT64_MAX);
6386   return best_rd;
6387 }
6388 
6389 static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
6390                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
6391                                  int *rate_uv, int *rate_uv_tokenonly,
6392                                  int64_t *dist_uv, int *skip_uv,
6393                                  UV_PREDICTION_MODE *mode_uv) {
6394   MACROBLOCKD *xd = &x->e_mbd;
6395   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6396   // Use an estimated rd for uv_intra based on DC_PRED if the
6397   // appropriate speed flag is set.
6398   init_sbuv_mode(mbmi);
6399 #if CONFIG_CB4X4
6400 #if !CONFIG_CHROMA_2X2
6401   if (x->skip_chroma_rd) {
6402     *rate_uv = 0;
6403     *rate_uv_tokenonly = 0;
6404     *dist_uv = 0;
6405     *skip_uv = 1;
6406     *mode_uv = UV_DC_PRED;
6407     return;
6408   }
6409   bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
6410                              xd->plane[AOM_PLANE_U].subsampling_y);
6411 #endif  // !CONFIG_CHROMA_2X2
6412 #if CONFIG_CFL
6413   // Only store reconstructed luma when there's chroma RDO. When there's no
6414   // chroma RDO, the reconstructed luma will be stored in encode_superblock().
6415   xd->cfl->store_y = !x->skip_chroma_rd;
6416 #endif  // CONFIG_CFL
6417 #else
6418   bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
6419 #if CONFIG_CFL
6420   xd->cfl->store_y = 1;
6421 #endif  // CONFIG_CFL
6422 #endif  // CONFIG_CB4X4
6423 #if CONFIG_CFL
6424   if (xd->cfl->store_y) {
6425     // Perform one extra call to txfm_rd_in_plane(), with the values chosen
6426     // during luma RDO, so we can store reconstructed luma values
6427     RD_STATS this_rd_stats;
6428     txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
6429                      mbmi->sb_type, mbmi->tx_size,
6430                      cpi->sf.use_fast_coef_costing);
6431     xd->cfl->store_y = 0;
6432   }
6433 #endif  // CONFIG_CFL
6434   rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
6435                           bsize, max_tx_size);
6436   *mode_uv = mbmi->uv_mode;
6437 }
6438 
6439 static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
6440                        int16_t mode_context) {
6441   if (is_inter_compound_mode(mode)) {
6442     return x
6443         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
6444 #if CONFIG_COMPOUND_SINGLEREF
6445   } else if (is_inter_singleref_comp_mode(mode)) {
6446     return x->inter_singleref_comp_mode_cost[mode_context]
6447                                             [INTER_SINGLEREF_COMP_OFFSET(mode)];
6448 #endif  // CONFIG_COMPOUND_SINGLEREF
6449   }
6450 
6451   int mode_cost = 0;
6452   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
6453   int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
6454 
6455   assert(is_inter_mode(mode));
6456 
6457   if (mode == NEWMV) {
6458     mode_cost = x->newmv_mode_cost[mode_ctx][0];
6459     return mode_cost;
6460   } else {
6461     mode_cost = x->newmv_mode_cost[mode_ctx][1];
6462     mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
6463 
6464     if (is_all_zero_mv) return mode_cost;
6465 
6466     if (mode == ZEROMV) {
6467       mode_cost += x->zeromv_mode_cost[mode_ctx][0];
6468       return mode_cost;
6469     } else {
6470       mode_cost += x->zeromv_mode_cost[mode_ctx][1];
6471       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
6472 
6473       if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
6474       if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
6475       if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
6476 
6477       mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
6478       return mode_cost;
6479     }
6480   }
6481 }
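
/* Bit layout of mode_context consumed above (from the masks used here):
 *
 *   mode_context & NEWMV_CTX_MASK                      -> NEWMV context
 *   (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK  -> ZEROMV context
 *   (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK    -> REFMV context
 *
 * plus single-bit flags (ALL_ZERO_FLAG_OFFSET, SKIP_NEARESTMV_OFFSET,
 * SKIP_NEARMV_OFFSET, SKIP_NEARESTMV_SUB8X8_OFFSET) that either end the
 * cost computation early or force a specific REFMV context.
 */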
6482 
6483 #if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
6484 static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
6485                                              COMPOUND_TYPE comp_type) {
6486   (void)bsize;
6487   switch (comp_type) {
6488     case COMPOUND_AVERAGE: return 0;
6489 #if CONFIG_WEDGE
6490     case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize);
6491 #endif  // CONFIG_WEDGE
6492 #if CONFIG_COMPOUND_SEGMENT
6493     case COMPOUND_SEG: return 1;
6494 #endif  // CONFIG_COMPOUND_SEGMENT
6495     default: assert(0); return 0;
6496   }
6497 }
6498 #endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
6499 
6500 typedef struct {
6501   int eobs;
6502   int brate;
6503   int byrate;
6504   int64_t bdist;
6505   int64_t bsse;
6506   int64_t brdcost;
6507   int_mv mvs[2];
6508   int_mv pred_mv[2];
6509   int_mv ref_mv[2];
6510 
6511 #if CONFIG_CHROMA_2X2
6512   ENTROPY_CONTEXT ta[4];
6513   ENTROPY_CONTEXT tl[4];
6514 #else
6515   ENTROPY_CONTEXT ta[2];
6516   ENTROPY_CONTEXT tl[2];
6517 #endif  // CONFIG_CHROMA_2X2
6518 } SEG_RDSTAT;
6519 
6520 typedef struct {
6521   int_mv *ref_mv[2];
6522   int_mv mvp;
6523 
6524   int64_t segment_rd;
6525   int r;
6526   int64_t d;
6527   int64_t sse;
6528   int segment_yrate;
6529   PREDICTION_MODE modes[4];
6530 #if CONFIG_COMPOUND_SINGLEREF
6531   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES +
6532                        INTER_COMPOUND_MODES];
6533 #else   // !CONFIG_COMPOUND_SINGLEREF
6534   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
6535 #endif  // CONFIG_COMPOUND_SINGLEREF
6536   int mvthresh;
6537 } BEST_SEG_INFO;
6538 
6539 static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
6540   return (mv->row >> 3) < mv_limits->row_min ||
6541          (mv->row >> 3) > mv_limits->row_max ||
6542          (mv->col >> 3) < mv_limits->col_min ||
6543          (mv->col >> 3) > mv_limits->col_max;
6544 }
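
/* MVs are stored in 1/8-pel units while mv_limits are full-pel, hence the
 * >> 3 above. E.g. mv->row == -36 (-4.5 pel) compares as -36 >> 3 == -5
 * against row_min.
 */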
6545 
6546 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
6547 // TODO(aconverse): Find out if this is still productive, then clean up or remove.
6548 static int check_best_zero_mv(
6549     const AV1_COMP *const cpi, const MACROBLOCK *const x,
6550     const int16_t mode_context[TOTAL_REFS_PER_FRAME],
6551     const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
6552     int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
6553     const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
6554     int mi_row, int mi_col) {
6555   int_mv zeromv[2] = { {.as_int = 0 } };
6556 #if CONFIG_GLOBAL_MOTION
6557   int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
6558 #endif
6559   (void)mi_row;
6560   (void)mi_col;
6561   (void)cpi;
6562 #if CONFIG_GLOBAL_MOTION
6563   if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
6564     for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
6565       zeromv[cur_frm].as_int =
6566           gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
6567                                cpi->common.allow_high_precision_mv, bsize,
6568                                mi_col, mi_row, block
6569 #if CONFIG_AMVR
6570                                ,
6571                                cpi->common.cur_frame_mv_precision_level
6572 #endif
6573                                )
6574               .as_int;
6575     }
6576   }
6577 #endif  // CONFIG_GLOBAL_MOTION
6578 
6579   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
6580       frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
6581       (ref_frames[1] <= INTRA_FRAME ||
6582        frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
6583     int16_t rfc =
6584         av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
6585     int c1 = cost_mv_ref(x, NEARMV, rfc);
6586     int c2 = cost_mv_ref(x, NEARESTMV, rfc);
6587     int c3 = cost_mv_ref(x, ZEROMV, rfc);
6588 
6589     if (this_mode == NEARMV) {
6590       if (c1 > c3) return 0;
6591     } else if (this_mode == NEARESTMV) {
6592       if (c2 > c3) return 0;
6593     } else {
6594       assert(this_mode == ZEROMV);
6595       if (ref_frames[1] <= INTRA_FRAME) {
6596         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
6597             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
6598           return 0;
6599       } else {
6600         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
6601              frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
6602             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
6603              frame_mv[NEARMV][ref_frames[1]].as_int == 0))
6604           return 0;
6605       }
6606     }
6607   } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
6608               this_mode == ZERO_ZEROMV) &&
6609              frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
6610              frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
6611     int16_t rfc = compound_mode_context[ref_frames[0]];
6612     int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc);
6613     int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc);
6614     int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc);
6615 
6616     if (this_mode == NEAREST_NEARESTMV) {
6617       if (c2 > c3) return 0;
6618     } else if (this_mode == NEAR_NEARMV) {
6619       if (c5 > c3) return 0;
6620     } else {
6621       assert(this_mode == ZERO_ZEROMV);
6622       if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
6623            frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
6624           (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
6625            frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0))
6626         return 0;
6627     }
6628   }
6629   return 1;
6630 }
6631 
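// Editorial sketch of the pruning rule in check_best_zero_mv() above: when
// two modes would code the same (zero) motion, distortion is identical, so
// the mode that costs more rate bits cannot win the RD comparison and can
// be skipped. Stand-alone illustration, not encoder code.
static INLINE int sketch_keep_zero_motion_mode(int this_mode_bits,
                                               int cheapest_equivalent_bits) {
  // Mirrors the c1/c2/c3 comparisons above: keep the mode only if no
  // cheaper mode reaches the same motion.
  return this_mode_bits <= cheapest_equivalent_bits;
}
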
6632 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
6633                                 BLOCK_SIZE bsize, int_mv *frame_mv,
6634 #if CONFIG_COMPOUND_SINGLEREF
6635                                 int_mv *frame_comp_mv,
6636 #endif  // CONFIG_COMPOUND_SINGLEREF
6637                                 int mi_row, int mi_col,
6638                                 int_mv *ref_mv_sub8x8[2], const uint8_t *mask,
6639                                 int mask_stride, int *rate_mv,
6640                                 const int block) {
6641   const AV1_COMMON *const cm = &cpi->common;
6642   const int pw = block_size_wide[bsize];
6643   const int ph = block_size_high[bsize];
6644   MACROBLOCKD *xd = &x->e_mbd;
6645   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6646 // This function should only ever be called for compound modes
6647 #if CONFIG_COMPOUND_SINGLEREF
6648   if (!has_second_ref(mbmi)) {
6649     assert(is_inter_singleref_comp_mode(mbmi->mode));
6650     assert(frame_comp_mv);
6651   }
6652   assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
6653   const int refs[2] = { mbmi->ref_frame[0],
6654                         has_second_ref(mbmi) ? mbmi->ref_frame[1]
6655                                              : mbmi->ref_frame[0] };
6656 #else
6657   assert(has_second_ref(mbmi));
6658   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
6659 #endif  // CONFIG_COMPOUND_SINGLEREF
6660   int_mv ref_mv[2];
6661   int ite, ref;
6662   struct scale_factors sf;
6663 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6664   // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
6665   const int ic = block & 1;
6666   const int ir = (block - ic) >> 1;
6667   struct macroblockd_plane *const pd = &xd->plane[0];
6668   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
6669   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
6670 #if CONFIG_GLOBAL_MOTION
6671   int is_global[2];
6672 #if CONFIG_COMPOUND_SINGLEREF
6673   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
6674 #else
6675   for (ref = 0; ref < 2; ++ref)
6676 #endif  // CONFIG_COMPOUND_SINGLEREF
6677   {
6678     WarpedMotionParams *const wm =
6679         &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
6680     is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
6681   }
6682 #if CONFIG_COMPOUND_SINGLEREF
6683   if (!has_second_ref(mbmi)) is_global[1] = is_global[0];
6684 #endif  // CONFIG_COMPOUND_SINGLEREF
6685 #endif  // CONFIG_GLOBAL_MOTION
6686 #else   // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6687   (void)block;
6688 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6689 
6690   // Do joint motion search in compound mode to get more accurate mv.
6691   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
6692   int last_besterr[2] = { INT_MAX, INT_MAX };
6693   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
6694     av1_get_scaled_ref_frame(cpi, refs[0]),
6695     av1_get_scaled_ref_frame(cpi, refs[1])
6696   };
6697 
6698 // Prediction buffer from second frame.
6699 #if CONFIG_HIGHBITDEPTH
6700   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
6701   uint8_t *second_pred;
6702 #else
6703   DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
6704 #endif  // CONFIG_HIGHBITDEPTH
6705 
6706 #if CONFIG_CB4X4
6707   (void)ref_mv_sub8x8;
6708 #endif  // CONFIG_CB4X4
6709 
6710 #if CONFIG_COMPOUND_SINGLEREF
6711   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
6712 #else
6713   for (ref = 0; ref < 2; ++ref)
6714 #endif  // CONFIG_COMPOUND_SINGLEREF
6715   {
6716 #if !CONFIG_CB4X4
6717     if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
6718       ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
6719     else
6720 #endif  // !CONFIG_CB4X4
6721       ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
6722 
6723     if (scaled_ref_frame[ref]) {
6724       int i;
6725       // Swap out the reference frame for a version that's been scaled to
6726       // match the resolution of the current frame, allowing the existing
6727       // motion search code to be used without additional modifications.
6728       for (i = 0; i < MAX_MB_PLANE; i++)
6729         backup_yv12[ref][i] = xd->plane[i].pre[ref];
6730       av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
6731                            NULL);
6732     }
6733   }
6734 
6735 #if CONFIG_COMPOUND_SINGLEREF
6736   if (!has_second_ref(mbmi)) {
6737     assert(is_inter_singleref_comp_mode(mbmi->mode));
6738     // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes
6739     //       all from the 1st reference frame, i.e. refs[0].
6740     ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0];
6741     if (scaled_ref_frame[0]) {
6742       int i;
6743       // Swap out the reference frame for a version that's been scaled to
6744       // match the resolution of the current frame, allowing the existing
6745       // motion search code to be used without additional modifications.
6746       for (i = 0; i < MAX_MB_PLANE; i++)
6747         backup_yv12[1][i] = xd->plane[i].pre[1];
6748       av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL);
6749     }
6750   }
6751 #endif  // CONFIG_COMPOUND_SINGLEREF
6752 
6753 // Since we have scaled the reference frames to match the size of the current
6754 // frame we must use a unit scaling factor during mode selection.
6755 #if CONFIG_HIGHBITDEPTH
6756   av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
6757                                     cm->height, cm->use_highbitdepth);
6758 #else
6759   av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
6760                                     cm->height);
6761 #endif  // CONFIG_HIGHBITDEPTH
6762 
6763 // Run the joint search iteratively, alternating between the reference
6764 // frames; break out of the loop once an iteration fails to find a better mv.
6765 #if CONFIG_COMPOUND_SINGLEREF
6766   const int num_ites =
6767       (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1;
6768   const int start_ite = has_second_ref(mbmi) ? 0 : 1;
6769   for (ite = start_ite; ite < (start_ite + num_ites); ite++)
6770 #else
6771   for (ite = 0; ite < 4; ite++)
6772 #endif  // CONFIG_COMPOUND_SINGLEREF
6773   {
6774     struct buf_2d ref_yv12[2];
6775     int bestsme = INT_MAX;
6776     int sadpb = x->sadperbit16;
6777     MV *const best_mv = &x->best_mv.as_mv;
6778     int search_range = 3;
6779 
6780     MvLimits tmp_mv_limits = x->mv_limits;
6781     int id = ite % 2;  // Even iterations search in the first reference frame,
6782                        // odd iterations search in the second. The predictor
6783                        // found for the 'other' reference frame is factored in.
6784     const int plane = 0;
6785     ConvolveParams conv_params = get_conv_params(!id, 0, plane);
6786 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6787     WarpTypesAllowed warp_types;
6788 #if CONFIG_GLOBAL_MOTION
6789     warp_types.global_warp_allowed = is_global[!id];
6790 #endif  // CONFIG_GLOBAL_MOTION
6791 #if CONFIG_WARPED_MOTION
6792     warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
6793 #endif  // CONFIG_WARPED_MOTION
6794 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6795 
6796     // Initialized here because of a compiler problem in Visual Studio.
6797     ref_yv12[0] = xd->plane[plane].pre[0];
6798     ref_yv12[1] = xd->plane[plane].pre[1];
6799 
6800 // Get the prediction block from the 'other' reference frame.
6801 #if CONFIG_COMPOUND_SINGLEREF
6802     MV *const the_other_mv = (has_second_ref(mbmi) || id)
6803                                  ? &frame_mv[refs[!id]].as_mv
6804                                  : &frame_comp_mv[refs[0]].as_mv;
6805 #endif  // CONFIG_COMPOUND_SINGLEREF
6806 
6807 #if CONFIG_HIGHBITDEPTH
6808     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
6809       second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
6810       av1_highbd_build_inter_predictor(
6811           ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
6812 #if CONFIG_COMPOUND_SINGLEREF
6813           the_other_mv,
6814 #else   // !(CONFIG_COMPOUND_SINGLEREF)
6815           &frame_mv[refs[!id]].as_mv,
6816 #endif  // CONFIG_COMPOUND_SINGLEREF
6817           &sf, pw, ph, 0, mbmi->interp_filters,
6818 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6819           &warp_types, p_col, p_row,
6820 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6821           plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
6822     } else {
6823       second_pred = (uint8_t *)second_pred_alloc_16;
6824 #endif  // CONFIG_HIGHBITDEPTH
6825       av1_build_inter_predictor(
6826           ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
6827 #if CONFIG_COMPOUND_SINGLEREF
6828           the_other_mv,
6829 #else   // !(CONFIG_COMPOUND_SINGLEREF)
6830           &frame_mv[refs[!id]].as_mv,
6831 #endif  // CONFIG_COMPOUND_SINGLEREF
6832           &sf, pw, ph, &conv_params, mbmi->interp_filters,
6833 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6834           &warp_types, p_col, p_row, plane, !id,
6835 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6836           MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
6837 #if CONFIG_HIGHBITDEPTH
6838     }
6839 #endif  // CONFIG_HIGHBITDEPTH
6840 
6841     // Do compound motion search on the current reference frame.
6842     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
6843     av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
6844 
6845 // Use the mv result from the single mode as mv predictor.
6847 #if CONFIG_COMPOUND_SINGLEREF
6848     if (!has_second_ref(mbmi) && id)
6849       *best_mv = frame_comp_mv[refs[0]].as_mv;
6850     else
6851 #endif  // CONFIG_COMPOUND_SINGLEREF
6852       *best_mv = frame_mv[refs[id]].as_mv;
6853 
6854     best_mv->col >>= 3;
6855     best_mv->row >>= 3;
6856 
6857 #if CONFIG_COMPOUND_SINGLEREF
6858     if (!has_second_ref(mbmi))
6859       av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
6860     else
6861 #endif  // CONFIG_COMPOUND_SINGLEREF
6862       av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
6863 
6864     // Small-range full-pixel motion search.
6865     bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
6866                                        &cpi->fn_ptr[bsize], mask, mask_stride,
6867                                        id, &ref_mv[id].as_mv, second_pred);
6868     if (bestsme < INT_MAX) {
6869       if (mask)
6870         bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
6871                                           second_pred, mask, mask_stride, id,
6872                                           &cpi->fn_ptr[bsize], 1);
6873       else
6874         bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
6875                                         second_pred, &cpi->fn_ptr[bsize], 1);
6876     }
6877 
6878     x->mv_limits = tmp_mv_limits;
6879 
6880 #if CONFIG_AMVR
6881     if (cpi->common.cur_frame_mv_precision_level) {
6882       x->best_mv.as_mv.row *= 8;
6883       x->best_mv.as_mv.col *= 8;
6884     }
6885     if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0)
6886 #else
6887     if (bestsme < INT_MAX)
6888 #endif
6889     {
6890       int dis; /* TODO: use dis in distortion calculation later. */
6891       unsigned int sse;
6892       bestsme = cpi->find_fractional_mv_step(
6893           x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
6894           x->errorperbit, &cpi->fn_ptr[bsize], 0,
6895           cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
6896           &dis, &sse, second_pred, mask, mask_stride, id, pw, ph,
6897           cpi->sf.use_upsampled_references);
6898     }
6899 
6900     // Restore the pointer to the first (possibly scaled) prediction buffer.
6901     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
6902 
6903     if (bestsme < last_besterr[id]) {
6904 #if CONFIG_COMPOUND_SINGLEREF
6905       // NOTE: For single ref comp mode, frame_mv stores the first mv and
6906       //       frame_comp_mv stores the second mv.
6907       if (!has_second_ref(mbmi) && id)
6908         frame_comp_mv[refs[0]].as_mv = *best_mv;
6909       else
6910 #endif  // CONFIG_COMPOUND_SINGLEREF
6911         frame_mv[refs[id]].as_mv = *best_mv;
6912       last_besterr[id] = bestsme;
6913 #if CONFIG_COMPOUND_SINGLEREF
6914       if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id];
6915 #endif  // CONFIG_COMPOUND_SINGLEREF
6916     } else {
6917       break;
6918     }
6919   }
6920 
6921   *rate_mv = 0;
6922 
6923 #if CONFIG_COMPOUND_SINGLEREF
6924   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
6925 #else
6926   for (ref = 0; ref < 2; ++ref)
6927 #endif  // CONFIG_COMPOUND_SINGLEREF
6928   {
6929     if (scaled_ref_frame[ref]) {
6930       // Restore the prediction frame pointers to their unscaled versions.
6931       int i;
6932       for (i = 0; i < MAX_MB_PLANE; i++)
6933         xd->plane[i].pre[ref] = backup_yv12[ref][i];
6934     }
6935 
6936 #if CONFIG_COMPOUND_SINGLEREF
6937     if (!has_second_ref(mbmi))
6938       av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
6939     else
6940 #endif  // CONFIG_COMPOUND_SINGLEREF
6941       av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
6942 
6943 #if CONFIG_COMPOUND_SINGLEREF
6944     if (!has_second_ref(mbmi)) {
6945       // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the
6946       //       first mv is stored in frame_mv[] and the second mv is stored in
6947       //       frame_comp_mv[].
6948       if (compound_ref0_mode(mbmi->mode) == NEWMV)  // SR_NEW_NEWMV
6949         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
6950                                     &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
6951                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6952       assert(compound_ref1_mode(mbmi->mode) == NEWMV);
6953       *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv,
6954                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
6955                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6956     } else {
6957 #endif  // CONFIG_COMPOUND_SINGLEREF
6958 #if !CONFIG_CB4X4
6959       if (bsize >= BLOCK_8X8)
6960 #endif  // !CONFIG_CB4X4
6961         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
6962                                     &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
6963                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6964 #if !CONFIG_CB4X4
6965       else
6966         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
6967                                     &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
6968                                     x->mvcost, MV_COST_WEIGHT);
6969 #endif  // !CONFIG_CB4X4
6970 #if CONFIG_COMPOUND_SINGLEREF
6971     }
6972 #endif  // CONFIG_COMPOUND_SINGLEREF
6973   }
6974 
6975 #if CONFIG_COMPOUND_SINGLEREF
6976   if (!has_second_ref(mbmi)) {
6977     if (scaled_ref_frame[0]) {
6978       // Restore the prediction frame pointers to their unscaled versions.
6979       int i;
6980       for (i = 0; i < MAX_MB_PLANE; i++)
6981         xd->plane[i].pre[1] = backup_yv12[1][i];
6982     }
6983   }
6984 #endif  // CONFIG_COMPOUND_SINGLEREF
6985 }
6986 
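// Editorial sketch (not encoder code) of the control flow inside
// joint_motion_search() above: even iterations refine the first mv with the
// second held fixed, odd iterations do the reverse, and the loop exits as
// soon as an iteration fails to improve the best error recorded for the mv
// it is refining. search_one() is a hypothetical stand-in for the refining
// search plus the sub-pel step.
static void sketch_alternating_joint_search(int (*search_one)(int id),
                                            int num_iterations) {
  int last_besterr[2] = { INT_MAX, INT_MAX };
  for (int ite = 0; ite < num_iterations; ++ite) {
    const int id = ite % 2;  // which of the two mvs is refined this pass
    const int err = search_one(id);
    if (err >= last_besterr[id]) break;  // no improvement: stop early
    last_besterr[id] = err;
  }
}
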
6987 static void estimate_ref_frame_costs(
6988     const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
6989     unsigned int *ref_costs_single,
6990 #if CONFIG_EXT_COMP_REFS
6991     unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME],
6992 #else
6993     unsigned int *ref_costs_comp,
6994 #endif  // CONFIG_EXT_COMP_REFS
6995     aom_prob *comp_mode_p) {
6996   int seg_ref_active =
6997       segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
6998   if (seg_ref_active) {
6999     memset(ref_costs_single, 0,
7000            TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
7001 #if CONFIG_EXT_COMP_REFS
7002     int ref_frame;
7003     for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
7004       memset(ref_costs_comp[ref_frame], 0,
7005              TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0]));
7006 #else
7007     memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
7008 #endif  // CONFIG_EXT_COMP_REFS
7009 
7010     *comp_mode_p = 128;
7011   } else {
7012     aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
7013     aom_prob comp_inter_p = 128;
7014 
7015     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
7016       comp_inter_p = av1_get_reference_mode_prob(cm, xd);
7017       *comp_mode_p = comp_inter_p;
7018     } else {
7019       *comp_mode_p = 128;
7020     }
7021 
7022     ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0);
7023 
7024     if (cm->reference_mode != COMPOUND_REFERENCE) {
7025       aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
7026       aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
7027 #if CONFIG_EXT_REFS
7028       aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
7029       aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
7030       aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
7031       aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd);
7032 #endif  // CONFIG_EXT_REFS
7033 
7034       unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
7035 
7036       ref_costs_single[LAST_FRAME] =
7037 #if CONFIG_EXT_REFS
7038           ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
7039               ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] =
7040 #endif  // CONFIG_EXT_REFS
7041                   ref_costs_single[GOLDEN_FRAME] =
7042                       ref_costs_single[ALTREF_FRAME] = base_cost;
7043 
7044 #if CONFIG_EXT_REFS
7045       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
7046       ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0);
7047       ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
7048       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
7049       ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
7050       ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1);
7051       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
7052 
7053       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
7054       ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0);
7055       ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1);
7056       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
7057 
7058       ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
7059       ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0);
7060       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
7061 
7062       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
7063       ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1);
7064 
7065       ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
7066       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
7067 
7068       ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0);
7069       ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p6, 1);
7070 #else   // !CONFIG_EXT_REFS
7071       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
7072       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
7073       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
7074 
7075       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
7076       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
7077 #endif  // CONFIG_EXT_REFS
7078     } else {
7079       ref_costs_single[LAST_FRAME] = 512;
7080 #if CONFIG_EXT_REFS
7081       ref_costs_single[LAST2_FRAME] = 512;
7082       ref_costs_single[LAST3_FRAME] = 512;
7083       ref_costs_single[BWDREF_FRAME] = 512;
7084       ref_costs_single[ALTREF2_FRAME] = 512;
7085 #endif  // CONFIG_EXT_REFS
7086       ref_costs_single[GOLDEN_FRAME] = 512;
7087       ref_costs_single[ALTREF_FRAME] = 512;
7088     }
7089 
7090     if (cm->reference_mode != SINGLE_REFERENCE) {
7091       aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
7092 #if CONFIG_EXT_REFS
7093       aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
7094       aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
7095       aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
7096       aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd);
7097 #endif  // CONFIG_EXT_REFS
7098 
7099       unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
7100 
7101 #if CONFIG_EXT_COMP_REFS
7102       aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd);
7103       unsigned int ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 };
7104 
7105       ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
7106           ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
7107 #if USE_UNI_COMP_REFS
7108               base_cost + av1_cost_bit(comp_ref_type_p, 1);
7109 #else
7110               base_cost;
7111 #endif  // USE_UNI_COMP_REFS
7112       ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
7113       ref_bicomp_costs[ALTREF_FRAME] = 0;
7114 
7115       ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
7116       ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
7117       ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
7118       ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
7119 
7120       ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
7121       ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
7122 
7123       ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
7124       ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
7125 
7126       ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
7127       ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
7128       ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
7129 
7130       ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
7131       ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
7132 
7133       int ref0, ref1;
7134       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
7135         for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
7136           ref_costs_comp[ref0][ref1] =
7137               ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
7138         }
7139       }
7140 
7141       aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd);
7142       aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd);
7143       aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd);
7144 
7145       ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
7146           base_cost + av1_cost_bit(comp_ref_type_p, 0) +
7147           av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0);
7148       ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
7149           base_cost + av1_cost_bit(comp_ref_type_p, 0) +
7150           av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
7151           av1_cost_bit(uni_comp_ref_p2, 0);
7152       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
7153           base_cost + av1_cost_bit(comp_ref_type_p, 0) +
7154           av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
7155           av1_cost_bit(uni_comp_ref_p2, 1);
7156 
7157       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
7158           base_cost + av1_cost_bit(comp_ref_type_p, 0) +
7159           av1_cost_bit(uni_comp_ref_p, 1);
7160 
7161 #else  // !CONFIG_EXT_COMP_REFS
7162 
7163       ref_costs_comp[LAST_FRAME] =
7164 #if CONFIG_EXT_REFS
7165           ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
7166 #endif  // CONFIG_EXT_REFS
7167               ref_costs_comp[GOLDEN_FRAME] = base_cost;
7168 
7169 #if CONFIG_EXT_REFS
7170       ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] =
7171           ref_costs_comp[ALTREF_FRAME] = 0;
7172 #endif  // CONFIG_EXT_REFS
7173 
7174 #if CONFIG_EXT_REFS
7175       ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
7176       ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
7177       ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
7178       ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
7179 
7180       ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
7181       ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
7182 
7183       ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
7184       ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
7185 
7186       // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
7187       //               more bit.
7188       ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
7189       ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
7190       ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
7191 
7192       ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
7193       ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
7194 #else   // !CONFIG_EXT_REFS
7195       ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
7196       ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
7197 #endif  // CONFIG_EXT_REFS
7198 #endif  // CONFIG_EXT_COMP_REFS
7199     } else {
7200 #if CONFIG_EXT_COMP_REFS
7201       int ref0, ref1;
7202       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
7203         for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
7204           ref_costs_comp[ref0][ref1] = 512;
7205       }
7206       ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
7207       ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
7208       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
7209       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
7210 #else  // !CONFIG_EXT_COMP_REFS
7211       ref_costs_comp[LAST_FRAME] = 512;
7212 #if CONFIG_EXT_REFS
7213       ref_costs_comp[LAST2_FRAME] = 512;
7214       ref_costs_comp[LAST3_FRAME] = 512;
7215       ref_costs_comp[BWDREF_FRAME] = 512;
7216       ref_costs_comp[ALTREF2_FRAME] = 512;
7217       ref_costs_comp[ALTREF_FRAME] = 512;
7218 #endif  // CONFIG_EXT_REFS
7219       ref_costs_comp[GOLDEN_FRAME] = 512;
7220 #endif  // CONFIG_EXT_COMP_REFS
7221     }
7222   }
7223 }
7224 
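// Editorial sketch: estimate_ref_frame_costs() above fills its tables by
// walking a binary decision tree -- each reference frame's total cost is a
// base cost plus the bit cost of every decision along its path. This is a
// generic form of that accumulation; cost_bit() is a hypothetical stand-in
// for av1_cost_bit().
static unsigned int sketch_ref_tree_path_cost(
    unsigned int base_cost, const aom_prob *probs, const int *bits, int depth,
    unsigned int (*cost_bit)(aom_prob, int)) {
  unsigned int cost = base_cost;
  for (int i = 0; i < depth; ++i) cost += cost_bit(probs[i], bits[i]);
  return cost;
}
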
7225 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
7226                                  int mode_index,
7227                                  int64_t comp_pred_diff[REFERENCE_MODES],
7228                                  int skippable) {
7229   MACROBLOCKD *const xd = &x->e_mbd;
7230 
7231   // Take a snapshot of the coding context so it can be
7232   // restored if we decide to encode this way
7233   ctx->skip = x->skip;
7234   ctx->skippable = skippable;
7235   ctx->best_mode_index = mode_index;
7236   ctx->mic = *xd->mi[0];
7237   ctx->mbmi_ext = *x->mbmi_ext;
7238   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
7239   ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
7240   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
7241 }
7242 
7243 static void setup_buffer_inter(
7244     const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
7245     BLOCK_SIZE block_size, int mi_row, int mi_col,
7246     int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
7247     int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
7248     struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
7249   const AV1_COMMON *cm = &cpi->common;
7250   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
7251   MACROBLOCKD *const xd = &x->e_mbd;
7252   MODE_INFO *const mi = xd->mi[0];
7253   int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
7254   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
7255   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
7256 
7257   assert(yv12 != NULL);
7258 
7259   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
7260   // use the UV scaling factors.
7261   av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
7262 
7263   // Gets an initial list of candidate vectors from neighbours and orders them
7264   av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
7265                    mbmi_ext->ref_mv_stack[ref_frame],
7266                    mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
7267                    NULL, NULL, mbmi_ext->mode_context);
7268 
7269 // Candidate refinement carried out at encoder and decoder
7270 #if CONFIG_AMVR
7271   av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
7272                         &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
7273                         cm->cur_frame_mv_precision_level);
7274 #else
7275   av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
7276                         &frame_nearest_mv[ref_frame],
7277                         &frame_near_mv[ref_frame]);
7278 #endif
7279 // Further refinement that is encoder-side only: test the top few candidates
7280 // in full and choose the best as the centre point for subsequent searches.
7281 // The current implementation doesn't support scaling.
7282 #if CONFIG_CB4X4
7283   av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
7284               block_size);
7285 #else
7286   if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
7287     av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
7288                 block_size);
7289 #endif  // CONFIG_CB4X4
7290 }
7291 
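// Editorial sketch of the three-stage candidate pipeline implemented by
// setup_buffer_inter() above. The callbacks are descriptive stand-ins for
// av1_find_mv_refs(), av1_find_best_ref_mvs() and av1_mv_pred().
static void sketch_ref_mv_pipeline(void (*gather_candidates)(void),
                                   void (*refine_common)(void),
                                   void (*pick_search_centre)(void)) {
  gather_candidates();   // neighbours -> ordered candidate list
  refine_common();       // refinement shared by encoder and decoder
  pick_search_centre();  // encoder-only full test of the top candidates
}
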
7292 static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
7293                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
7294                                  int ref_idx, int *rate_mv) {
7295   MACROBLOCKD *xd = &x->e_mbd;
7296   const AV1_COMMON *cm = &cpi->common;
7297   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7298   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
7299   int bestsme = INT_MAX;
7300   int step_param;
7301   int sadpb = x->sadperbit16;
7302   MV mvp_full;
7303 #if CONFIG_COMPOUND_SINGLEREF
7304   int ref =
7305       has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
7306 #else   // !CONFIG_COMPOUND_SINGLEREF
7307   int ref = mbmi->ref_frame[ref_idx];
7308 #endif  // CONFIG_COMPOUND_SINGLEREF
7309   MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
7310 
7311   MvLimits tmp_mv_limits = x->mv_limits;
7312   int cost_list[5];
7313 
7314   const YV12_BUFFER_CONFIG *scaled_ref_frame =
7315       av1_get_scaled_ref_frame(cpi, ref);
7316 
7317   MV pred_mv[3];
7318   pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
7319   pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
7320   pred_mv[2] = x->pred_mv[ref];
7321 
7322   if (scaled_ref_frame) {
7323     int i;
7324     // Swap out the reference frame for a version that's been scaled to
7325     // match the resolution of the current frame, allowing the existing
7326     // motion search code to be used without additional modifications.
7327     for (i = 0; i < MAX_MB_PLANE; i++)
7328       backup_yv12[i] = xd->plane[i].pre[ref_idx];
7329 
7330     av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
7331   }
7332 
7333   av1_set_mv_search_range(&x->mv_limits, &ref_mv);
7334 
7335   av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
7336 
7337   // Work out the size of the first step in the mv step search.
7338   // 0 gives the maximum-length first step; 1 gives that maximum >> 1, etc.
7339   if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
7340     // Take the weighted average of the step_params based on the last frame's
7341     // max mv magnitude and that based on the best ref mvs of the current
7342     // block for the given reference.
7343     step_param =
7344         (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
7345         2;
7346   } else {
7347     step_param = cpi->mv_step_param;
7348   }
7349 
7350   if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
7351     int boffset =
7352         2 * (b_width_log2_lookup[cm->sb_size] -
7353              AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
7354     step_param = AOMMAX(step_param, boffset);
7355   }
7356 
7357   if (cpi->sf.adaptive_motion_search) {
7358     int bwl = b_width_log2_lookup[bsize];
7359     int bhl = b_height_log2_lookup[bsize];
7360     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
7361 
7362     if (tlevel < 5) {
7363       step_param += 2;
7364       step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
7365     }
7366 
7367     // pred_mv_sad is not set up for dynamically scaled frames.
7368     if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
7369       int i;
7370       for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
7371         if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
7372           x->pred_mv[ref].row = 0;
7373           x->pred_mv[ref].col = 0;
7374           x->best_mv.as_int = INVALID_MV;
7375 
7376           if (scaled_ref_frame) {
7377             int j;
7378             for (j = 0; j < MAX_MB_PLANE; ++j)
7379               xd->plane[j].pre[ref_idx] = backup_yv12[j];
7380           }
7381           return;
7382         }
7383       }
7384     }
7385   }
7386 
7387   av1_set_mv_search_range(&x->mv_limits, &ref_mv);
7388 
7389 #if CONFIG_MOTION_VAR
7390   if (mbmi->motion_mode != SIMPLE_TRANSLATION)
7391     mvp_full = mbmi->mv[0].as_mv;
7392   else
7393 #endif  // CONFIG_MOTION_VAR
7394     mvp_full = pred_mv[x->mv_best_ref_index[ref]];
7395 
7396   mvp_full.col >>= 3;
7397   mvp_full.row >>= 3;
7398 
7399   x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
7400 
7401 #if CONFIG_MOTION_VAR
7402   switch (mbmi->motion_mode) {
7403     case SIMPLE_TRANSLATION:
7404 #endif  // CONFIG_MOTION_VAR
7405 #if CONFIG_HASH_ME
7406       bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
7407                                       sadpb, cond_cost_list(cpi, cost_list),
7408                                       &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
7409                                       (MI_SIZE * mi_row), 0);
7410 #else
7411       bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
7412                                       sadpb, cond_cost_list(cpi, cost_list),
7413                                       &ref_mv, INT_MAX, 1);
7414 #endif
7415 #if CONFIG_MOTION_VAR
7416       break;
7417     case OBMC_CAUSAL:
7418       bestsme = av1_obmc_full_pixel_diamond(
7419           cpi, x, &mvp_full, step_param, sadpb,
7420           MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
7421           &(x->best_mv.as_mv), 0);
7422       break;
7423     default: assert(0 && "Invalid motion mode!\n");
7424   }
7425 #endif  // CONFIG_MOTION_VAR
7426 
7427   x->mv_limits = tmp_mv_limits;
7428 
7429 #if CONFIG_AMVR
7430   if (cpi->common.cur_frame_mv_precision_level) {
7431     x->best_mv.as_mv.row *= 8;
7432     x->best_mv.as_mv.col *= 8;
7433   }
7434   if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
7435 #else
7436   if (bestsme < INT_MAX) {
7437 #endif
7438     int dis; /* TODO: use dis in distortion calculation later. */
7439 #if CONFIG_MOTION_VAR
7440     switch (mbmi->motion_mode) {
7441       case SIMPLE_TRANSLATION:
7442 #endif  // CONFIG_MOTION_VAR
7443         if (cpi->sf.use_upsampled_references) {
7444           int best_mv_var;
7445           const int try_second = x->second_best_mv.as_int != INVALID_MV &&
7446                                  x->second_best_mv.as_int != x->best_mv.as_int;
7447           const int pw = block_size_wide[bsize];
7448           const int ph = block_size_high[bsize];
7449 
7450           best_mv_var = cpi->find_fractional_mv_step(
7451               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
7452               &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
7453               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
7454               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
7455               0, 0, pw, ph, 1);
7456 
7457           if (try_second) {
7458             const int minc =
7459                 AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
7460             const int maxc =
7461                 AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
7462             const int minr =
7463                 AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
7464             const int maxr =
7465                 AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
7466             int this_var;
7467             MV best_mv = x->best_mv.as_mv;
7468 
7469             x->best_mv = x->second_best_mv;
7470             if (x->best_mv.as_mv.row * 8 <= maxr &&
7471                 x->best_mv.as_mv.row * 8 >= minr &&
7472                 x->best_mv.as_mv.col * 8 <= maxc &&
7473                 x->best_mv.as_mv.col * 8 >= minc) {
7474               this_var = cpi->find_fractional_mv_step(
7475                   x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
7476                   &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
7477                   cpi->sf.mv.subpel_iters_per_step,
7478                   cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
7479                   &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
7480               if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
7481               x->best_mv.as_mv = best_mv;
7482             }
7483           }
7484         } else {
7485           cpi->find_fractional_mv_step(
7486               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
7487               &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
7488               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
7489               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
7490               0, 0, 0, 0, 0);
7491         }
7492 #if CONFIG_MOTION_VAR
7493         break;
7494       case OBMC_CAUSAL:
7495         av1_find_best_obmc_sub_pixel_tree_up(
7496             x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv,
7497             x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
7498             cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis,
7499             &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references);
7500         break;
7501       default: assert(0 && "Invalid motion mode!\n");
7502     }
7503 #endif  // CONFIG_MOTION_VAR
7504   }
7505   *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
7506                              x->mvcost, MV_COST_WEIGHT);
7507 
7508 #if CONFIG_MOTION_VAR
7509   if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
7510 #else
7511   if (cpi->sf.adaptive_motion_search)
7512 #endif  // CONFIG_MOTION_VAR
7513     x->pred_mv[ref] = x->best_mv.as_mv;
7514 
7515   if (scaled_ref_frame) {
7516     int i;
7517     for (i = 0; i < MAX_MB_PLANE; i++)
7518       xd->plane[i].pre[ref_idx] = backup_yv12[i];
7519   }
7520 }
7521 
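// Editorial sketch of the first-step heuristic at the top of
// single_motion_search() above: average a range derived from the last
// frame's maximum mv magnitude with the encoder default, then allow a
// larger first step when the predicted SAD is small relative to the block
// area (easy motion). Parameter names are descriptive, not real fields.
static int sketch_initial_step_param(int mv_context_range, int default_step,
                                     int pred_mv_sad, int bwl, int bhl) {
  int step = (mv_context_range + default_step) / 2;
  if ((pred_mv_sad >> (bwl + bhl + 4)) < 5)  // mirrors the tlevel test above
    step = AOMMIN(step + 2, MAX_MVSEARCH_STEPS - 1);
  return step;
}
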
7522 static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
7523   int i;
7524   for (i = 0; i < MAX_MB_PLANE; i++) {
7525     xd->plane[i].dst.buf = dst.plane[i];
7526     xd->plane[i].dst.stride = dst.stride[i];
7527   }
7528 }
7529 
7530 static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
7531                                     BLOCK_SIZE bsize, const MV *other_mv,
7532                                     int mi_row, int mi_col, const int block,
7533                                     int ref_idx, uint8_t *second_pred) {
7534   const AV1_COMMON *const cm = &cpi->common;
7535   const int pw = block_size_wide[bsize];
7536   const int ph = block_size_high[bsize];
7537   MACROBLOCKD *xd = &x->e_mbd;
7538   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7539 #if CONFIG_COMPOUND_SINGLEREF
7540   const int other_ref =
7541       has_second_ref(mbmi) ? mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0];
7542 #else  // !CONFIG_COMPOUND_SINGLEREF
7543   const int other_ref = mbmi->ref_frame[!ref_idx];
7544 #endif  // CONFIG_COMPOUND_SINGLEREF
7545   struct scale_factors sf;
7546 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7547   struct macroblockd_plane *const pd = &xd->plane[0];
7548   // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
7549   const int ic = block & 1;
7550   const int ir = (block - ic) >> 1;
7551   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
7552   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
7553 #if CONFIG_GLOBAL_MOTION
7554   WarpedMotionParams *const wm = &xd->global_motion[other_ref];
7555   int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype);
7556 #endif  // CONFIG_GLOBAL_MOTION
7557 #else
7558   (void)block;
7559 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7560 
7561 // This function should only ever be called for compound modes
7562 #if CONFIG_COMPOUND_SINGLEREF
7563   assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
7564 #else   // !CONFIG_COMPOUND_SINGLEREF
7565   assert(has_second_ref(mbmi));
7566 #endif  // CONFIG_COMPOUND_SINGLEREF
7567 
7568   struct buf_2d backup_yv12[MAX_MB_PLANE];
7569   const YV12_BUFFER_CONFIG *const scaled_ref_frame =
7570       av1_get_scaled_ref_frame(cpi, other_ref);
7571 
7572   if (scaled_ref_frame) {
7573     int i;
7574     // Swap out the reference frame for a version that's been scaled to
7575     // match the resolution of the current frame, allowing the existing
7576     // motion search code to be used without additional modifications.
7577     for (i = 0; i < MAX_MB_PLANE; i++)
7578       backup_yv12[i] = xd->plane[i].pre[!ref_idx];
7579     av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
7580   }
7581 
7582 // Since we have scaled the reference frames to match the size of the current
7583 // frame we must use a unit scaling factor during mode selection.
7584 #if CONFIG_HIGHBITDEPTH
7585   av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
7586                                     cm->height, cm->use_highbitdepth);
7587 #else
7588   av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
7589                                     cm->height);
7590 #endif  // CONFIG_HIGHBITDEPTH
7591 
7592   struct buf_2d ref_yv12;
7593 
7594   const int plane = 0;
7595   ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane);
7596 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7597   WarpTypesAllowed warp_types;
7598 #if CONFIG_GLOBAL_MOTION
7599   warp_types.global_warp_allowed = is_global;
7600 #endif  // CONFIG_GLOBAL_MOTION
7601 #if CONFIG_WARPED_MOTION
7602   warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
7603 #endif  // CONFIG_WARPED_MOTION
7604 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7605 
7606   // Initialized here because of a compiler problem in Visual Studio.
7607   ref_yv12 = xd->plane[plane].pre[!ref_idx];
7608 
7609 // Get the prediction block from the 'other' reference frame.
7610 #if CONFIG_HIGHBITDEPTH
7611   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
7612     av1_highbd_build_inter_predictor(
7613         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
7614         0, mbmi->interp_filters,
7615 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7616         &warp_types, p_col, p_row,
7617 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7618         plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
7619   } else {
7620 #endif  // CONFIG_HIGHBITDEPTH
7621     av1_build_inter_predictor(
7622         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
7623         &conv_params, mbmi->interp_filters,
7624 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7625         &warp_types, p_col, p_row, plane, !ref_idx,
7626 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
7627         MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
7628 #if CONFIG_HIGHBITDEPTH
7629   }
7630 #endif  // CONFIG_HIGHBITDEPTH
7631 
7632   if (scaled_ref_frame) {
7633     // Restore the prediction frame pointers to their unscaled versions.
7634     int i;
7635     for (i = 0; i < MAX_MB_PLANE; i++)
7636       xd->plane[i].pre[!ref_idx] = backup_yv12[i];
7637   }
7638 }
7639 
7640 // Search for the best mv for one component of a compound prediction,
7641 // given that the mv of the other component is held fixed.
7642 static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
7643                                           BLOCK_SIZE bsize, MV *this_mv,
7644                                           int mi_row, int mi_col,
7645                                           const uint8_t *second_pred,
7646                                           const uint8_t *mask, int mask_stride,
7647                                           int *rate_mv, int ref_idx) {
7648   const int pw = block_size_wide[bsize];
7649   const int ph = block_size_high[bsize];
7650   MACROBLOCKD *xd = &x->e_mbd;
7651   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7652 #if CONFIG_COMPOUND_SINGLEREF
7653   const int ref =
7654       has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
7655 #else
7656   const int ref = mbmi->ref_frame[ref_idx];
7657 #endif  // CONFIG_COMPOUND_SINGLEREF
7658   int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0];
7659   struct macroblockd_plane *const pd = &xd->plane[0];
7660 
7661   struct buf_2d backup_yv12[MAX_MB_PLANE];
7662   const YV12_BUFFER_CONFIG *const scaled_ref_frame =
7663       av1_get_scaled_ref_frame(cpi, ref);
7664 
7665 // Check that this is either an interinter or an interintra block
7666 #if CONFIG_COMPOUND_SINGLEREF
7667   assert(has_second_ref(mbmi) ||
7668          // or a single ref comp pred mode
7669          is_inter_singleref_comp_mode(mbmi->mode) ||
7670          (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
7671 #else
7672   assert(has_second_ref(mbmi) ||
7673          (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
7674 #endif  // CONFIG_COMPOUND_SINGLEREF
7675 
7676   if (scaled_ref_frame) {
7677     int i;
7678     // Swap out the reference frame for a version that's been scaled to
7679     // match the resolution of the current frame, allowing the existing
7680     // motion search code to be used without additional modifications.
7681     for (i = 0; i < MAX_MB_PLANE; i++)
7682       backup_yv12[i] = xd->plane[i].pre[ref_idx];
7683     av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
7684   }
7685 
7686   struct buf_2d orig_yv12;
7687   int bestsme = INT_MAX;
7688   int sadpb = x->sadperbit16;
7689   MV *const best_mv = &x->best_mv.as_mv;
7690   int search_range = 3;
7691 
7692   MvLimits tmp_mv_limits = x->mv_limits;
7693 
7694   // Initialized here because of a compiler problem in Visual Studio.
7695   if (ref_idx) {
7696     orig_yv12 = pd->pre[0];
7697     pd->pre[0] = pd->pre[ref_idx];
7698   }
7699 
7700   // Do compound motion search on the current reference frame.
7701   av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
7702 
7703   // Use the mv result from the single mode as mv predictor.
7704   *best_mv = *this_mv;
7705 
7706   best_mv->col >>= 3;
7707   best_mv->row >>= 3;
7708 
7709 #if CONFIG_COMPOUND_SINGLEREF
7710   if (!has_second_ref(mbmi))
7711     av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
7712   else
7713 #endif  // CONFIG_COMPOUND_SINGLEREF
7714     av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
7715 
7716   // Small-range full-pixel motion search.
7717   bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
7718                                      &cpi->fn_ptr[bsize], mask, mask_stride,
7719                                      ref_idx, &ref_mv.as_mv, second_pred);
7720   if (bestsme < INT_MAX) {
7721     if (mask)
7722       bestsme =
7723           av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask,
7724                                   mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1);
7725     else
7726       bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred,
7727                                       &cpi->fn_ptr[bsize], 1);
7728   }
7729 
7730   x->mv_limits = tmp_mv_limits;
7731 
7732 #if CONFIG_AMVR
7733   if (cpi->common.cur_frame_mv_precision_level) {
7734     x->best_mv.as_mv.row *= 8;
7735     x->best_mv.as_mv.col *= 8;
7736   }
7737   if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
7738 #else
7739   if (bestsme < INT_MAX) {
7740 #endif
7741     int dis; /* TODO: use dis in distortion calculation later. */
7742     unsigned int sse;
7743     bestsme = cpi->find_fractional_mv_step(
7744         x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
7745         &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
7746         x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
7747         ref_idx, pw, ph, cpi->sf.use_upsampled_references);
7748   }
7749 
7750   // Restore the pointer to the first (possibly scaled) prediction buffer.
7751   if (ref_idx) pd->pre[0] = orig_yv12;
7752 
7753   if (bestsme < INT_MAX) *this_mv = *best_mv;
7754 
7755   *rate_mv = 0;
7756 
7757   if (scaled_ref_frame) {
7758     // Restore the prediction frame pointers to their unscaled versions.
7759     int i;
7760     for (i = 0; i < MAX_MB_PLANE; i++)
7761       xd->plane[i].pre[ref_idx] = backup_yv12[i];
7762   }
7763 
7764 #if CONFIG_COMPOUND_SINGLEREF
7765   if (!has_second_ref(mbmi))
7766     av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
7767   else
7768 #endif  // CONFIG_COMPOUND_SINGLEREF
7769     av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
7770   *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
7771                               x->mvcost, MV_COST_WEIGHT);
7772 }
7773 
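// Editorial sketch of the flow shared by build_second_inter_pred() and
// compound_single_motion_search() above: synthesize the prediction for the
// fixed component of the pair, then run a small-range search on the other
// component against that prediction. The callbacks are hypothetical
// stand-ins for the real helpers.
static void sketch_refine_one_of_pair(MV mv[2], int ref_idx,
                                      void (*build_other_pred)(const MV *),
                                      void (*refine_against_pred)(MV *)) {
  build_other_pred(&mv[!ref_idx]);    // second_pred from the fixed mv
  refine_against_pred(&mv[ref_idx]);  // search only the chosen component
}
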
7774 // Wrapper for compound_single_motion_search, for the common case
7775 // where the second prediction is also an inter mode.
7776 static void compound_single_motion_search_interinter(
7777     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv,
7778 #if CONFIG_COMPOUND_SINGLEREF
7779     int_mv *frame_comp_mv,
7780 #endif  // CONFIG_COMPOUND_SINGLEREF
7781     int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
7782     const int block, int ref_idx) {
7783   MACROBLOCKD *xd = &x->e_mbd;
7784   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7785 
7786 // This function should only ever be called for compound modes
7787 #if CONFIG_COMPOUND_SINGLEREF
7788   int is_singleref_comp_mode =
7789       !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode);
7790   assert(has_second_ref(mbmi) || is_singleref_comp_mode);
7791   if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv);
7792 #else   // !CONFIG_COMPOUND_SINGLEREF
7793   assert(has_second_ref(mbmi));
7794 #endif  // CONFIG_COMPOUND_SINGLEREF
7795 
7796 // Prediction buffer from second frame.
7797 #if CONFIG_HIGHBITDEPTH
7798   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
7799   uint8_t *second_pred;
7800   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
7801     second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
7802   else
7803     second_pred = (uint8_t *)second_pred_alloc_16;
7804 #else
7805   DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
7806 #endif  // CONFIG_HIGHBITDEPTH
7807 
7808 #if CONFIG_COMPOUND_SINGLEREF
7809   MV *this_mv = has_second_ref(mbmi)
7810                     ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv
7811                     : (ref_idx ? &frame_comp_mv[mbmi->ref_frame[0]].as_mv
7812                                : &frame_mv[mbmi->ref_frame[0]].as_mv);
7813   const MV *other_mv =
7814       has_second_ref(mbmi)
7815           ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv
7816           : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv
7817                      : &frame_comp_mv[mbmi->ref_frame[0]].as_mv);
7818 #else   // !CONFIG_COMPOUND_SINGLEREF
7819   MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv;
7820   const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv;
7821 #endif  // CONFIG_COMPOUND_SINGLEREF
7822 
7823   build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
7824                           ref_idx, second_pred);
7825 
7826   compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col,
7827                                 second_pred, mask, mask_stride, rate_mv,
7828                                 ref_idx);
7829 }
7830 
7831 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
7832 static void do_masked_motion_search_indexed(
7833     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
7834     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
7835     int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
7836   // NOTE: 'which': 0 - refine mv 0 only, 1 - refine mv 1 only, 2 - both.
7837   MACROBLOCKD *xd = &x->e_mbd;
7838   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7839   BLOCK_SIZE sb_type = mbmi->sb_type;
7840   const uint8_t *mask;
7841   const int mask_stride = block_size_wide[bsize];
7842 
7843   mask = av1_get_compound_type_mask(comp_data, sb_type);
7844 
7845   int_mv frame_mv[TOTAL_REFS_PER_FRAME];
7846 #if CONFIG_COMPOUND_SINGLEREF
7847   int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME];
7848 #endif  // CONFIG_COMPOUND_SINGLEREF
7849   MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
7850   assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4);
7851 
7852   frame_mv[rf[0]].as_int = cur_mv[0].as_int;
7853 #if CONFIG_COMPOUND_SINGLEREF
7854   if (!has_second_ref(mbmi))
7855     frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int;
7856   else
7857 #endif  // CONFIG_COMPOUND_SINGLEREF
7858     frame_mv[rf[1]].as_int = cur_mv[1].as_int;
7859   if (which == 0 || which == 1) {
7860     compound_single_motion_search_interinter(
7861         cpi, x, bsize, frame_mv,
7862 #if CONFIG_COMPOUND_SINGLEREF
7863         has_second_ref(mbmi) ? NULL : frame_comp_mv,
7864 #endif  // CONFIG_COMPOUND_SINGLEREF
7865         mi_row, mi_col, mask, mask_stride, rate_mv, 0, which);
7866   } else if (which == 2) {
7867     joint_motion_search(cpi, x, bsize, frame_mv,
7868 #if CONFIG_COMPOUND_SINGLEREF
7869                         has_second_ref(mbmi) ? NULL : frame_comp_mv,
7870 #endif  // CONFIG_COMPOUND_SINGLEREF
7871                         mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0);
7872   }
7873   tmp_mv[0].as_int = frame_mv[rf[0]].as_int;
7874 #if CONFIG_COMPOUND_SINGLEREF
7875   if (!has_second_ref(mbmi))
7876     tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int;
7877   else  // comp ref
7878 #endif  // CONFIG_COMPOUND_SINGLEREF
7879     tmp_mv[1].as_int = frame_mv[rf[1]].as_int;
7880 }
7881 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
7882 
7883 // In some situations we want to discount the apparent cost of a new motion
// vector. Where there is a subtle motion field, and especially where there is
// low spatial complexity, it can be hard to cover the cost of a new motion
// vector in a single block, even if that motion vector reduces distortion.
// However, once established, that vector may be usable through the nearest and
// near mv modes to reduce distortion in subsequent blocks and also to improve
// visual quality.
7890 static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
7891                                int_mv this_mv,
7892                                int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
7893                                int ref_frame) {
7894   return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
7895           (this_mv.as_int != 0) &&
7896           ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
7897            (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
7898           ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
7899            (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
7900 }
7901 
7902 #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
7903 #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
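// Note: the margins above are expressed in 1/8-pel units (hence the << 3),
// matching the units of the motion vectors and of the xd->mb_to_*_edge
// values used in clamp_mv2() below.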
7904 
7905 // TODO(jingning): this mv clamping function should be block size dependent.
7906 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
7907   clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
7908            xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
7909            xd->mb_to_top_edge - LEFT_TOP_MARGIN,
7910            xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
7911 }
7912 
7913 #if CONFIG_WEDGE
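// Cheaply estimates which wedge sign to use: esq[0][.] / esq[1][.] collect
// the per-quadrant SSE of pred0 / pred1 against the source (f_index maps
// bsize to its quadrant-sized BLOCK_SIZE), and the result is 1 when pred1
// fits better toward the top-left while pred0 fits better toward the
// bottom-right.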
7914 static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
7915                                const BLOCK_SIZE bsize, const uint8_t *pred0,
7916                                int stride0, const uint8_t *pred1, int stride1) {
7917   const struct macroblock_plane *const p = &x->plane[0];
7918   const uint8_t *src = p->src.buf;
7919   int src_stride = p->src.stride;
7920   const int f_index = bsize - BLOCK_8X8;
7921   const int bw = block_size_wide[bsize];
7922   const int bh = block_size_high[bsize];
7923   uint32_t esq[2][4];
7924   int64_t tl, br;
7925 
7926 #if CONFIG_HIGHBITDEPTH
7927   if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
7928     pred0 = CONVERT_TO_BYTEPTR(pred0);
7929     pred1 = CONVERT_TO_BYTEPTR(pred1);
7930   }
7931 #endif  // CONFIG_HIGHBITDEPTH
7932 
7933   cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
7934   cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
7935                           &esq[0][1]);
7936   cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
7937                           pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
7938   cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
7939                           pred0 + bh / 2 * stride0 + bw / 2, stride0,
7940                           &esq[0][3]);
7941   cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
7942   cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1,
7943                           &esq[1][1]);
  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
                          pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
                          pred1 + bh / 2 * stride1 + bw / 2, stride1,
                          &esq[1][3]);
7949 
7950   tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
7951        (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
7952   br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
7953        (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
7954   return (tl + br > 0);
7955 }
7956 #endif  // CONFIG_WEDGE
7957 
7958 #if !CONFIG_DUAL_FILTER
7959 static InterpFilter predict_interp_filter(
7960     const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
7961     const int mi_row, const int mi_col,
7962     InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
7963   InterpFilter best_filter = SWITCHABLE;
7964   const AV1_COMMON *cm = &cpi->common;
7965   const MACROBLOCKD *xd = &x->e_mbd;
7966   int bsl = mi_width_log2_lookup[bsize];
7967   int pred_filter_search =
7968       cpi->sf.cb_pred_filter_search
7969           ? (((mi_row + mi_col) >> bsl) +
7970              get_chessboard_index(cm->current_video_frame)) &
7971                 0x1
7972           : 0;
7973   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
7974   const int is_comp_pred = has_second_ref(mbmi);
7975   const int this_mode = mbmi->mode;
7976   int refs[2] = { mbmi->ref_frame[0],
7977                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
7978   if (pred_filter_search) {
7979     InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
7980     if (xd->up_available)
7981       af = av1_extract_interp_filter(
7982           xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0);
7983     if (xd->left_available)
7984       lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0);
7985 
7986     if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
7987       best_filter = af;
7988   }
7989   if (is_comp_pred) {
7990     if (cpi->sf.adaptive_mode_search) {
7991       switch (this_mode) {
7992         case NEAREST_NEARESTMV:
7993           if (single_filter[NEARESTMV][refs[0]] ==
7994               single_filter[NEARESTMV][refs[1]])
7995             best_filter = single_filter[NEARESTMV][refs[0]];
7996           break;
7997         case NEAR_NEARMV:
7998           if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
7999             best_filter = single_filter[NEARMV][refs[0]];
8000           break;
8001         case ZERO_ZEROMV:
8002           if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
8003             best_filter = single_filter[ZEROMV][refs[0]];
8004           break;
8005         case NEW_NEWMV:
8006           if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
8007             best_filter = single_filter[NEWMV][refs[0]];
8008           break;
8009         case NEAREST_NEWMV:
8010           if (single_filter[NEARESTMV][refs[0]] ==
8011               single_filter[NEWMV][refs[1]])
8012             best_filter = single_filter[NEARESTMV][refs[0]];
8013           break;
8014         case NEAR_NEWMV:
8015           if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
8016             best_filter = single_filter[NEARMV][refs[0]];
8017           break;
8018         case NEW_NEARESTMV:
8019           if (single_filter[NEWMV][refs[0]] ==
8020               single_filter[NEARESTMV][refs[1]])
8021             best_filter = single_filter[NEWMV][refs[0]];
8022           break;
8023         case NEW_NEARMV:
8024           if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
8025             best_filter = single_filter[NEWMV][refs[0]];
8026           break;
8027         default:
8028           if (single_filter[this_mode][refs[0]] ==
8029               single_filter[this_mode][refs[1]])
8030             best_filter = single_filter[this_mode][refs[0]];
8031           break;
8032       }
8033     }
8034   }
8035   if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
8036     best_filter = EIGHTTAP_REGULAR;
8037   }
8038   return best_filter;
8039 }
8040 #endif  // !CONFIG_DUAL_FILTER
8041 
8042 // Choose the best wedge index and sign
8043 #if CONFIG_WEDGE
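// Exhaustive search over all wedge indices for this block size: for each
// index the wedge sign is derived from the residuals, the masked SSE is
// modeled into an RD cost, and the best (index, sign) pair is returned
// through the out parameters along with its RD cost.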
8044 static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
8045                           const BLOCK_SIZE bsize, const uint8_t *const p0,
8046                           const uint8_t *const p1, int *const best_wedge_sign,
8047                           int *const best_wedge_index) {
8048   const MACROBLOCKD *const xd = &x->e_mbd;
8049   const struct buf_2d *const src = &x->plane[0].src;
8050   const int bw = block_size_wide[bsize];
8051   const int bh = block_size_high[bsize];
8052   const int N = bw * bh;
8053   int rate;
8054   int64_t dist;
8055   int64_t rd, best_rd = INT64_MAX;
8056   int wedge_index;
8057   int wedge_sign;
8058   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
8059   const uint8_t *mask;
8060   uint64_t sse;
8061 #if CONFIG_HIGHBITDEPTH
8062   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
8063   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
8064 #else
8065   const int bd_round = 0;
8066 #endif  // CONFIG_HIGHBITDEPTH
8067 
8068   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
8069   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
8070   DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
8071   DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
8072 
8073   int64_t sign_limit;
8074 
8075 #if CONFIG_HIGHBITDEPTH
8076   if (hbd) {
8077     aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
8078                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
8079     aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
8080                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
8081     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
8082                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
8083   } else  // NOLINT
8084 #endif    // CONFIG_HIGHBITDEPTH
8085   {
8086     aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
8087     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
8088     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
8089   }
8090 
8091   sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
8092                 (int64_t)aom_sum_squares_i16(r1, N)) *
8093                (1 << WEDGE_WEIGHT_BITS) / 2;
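  // sign_limit is the decision threshold used by
  // av1_wedge_sign_from_residuals() below: comparing the mask-weighted sum of
  // per-pixel squared-residual deltas (ds = r0^2 - r1^2) against it selects
  // the sign with the smaller blended SSE.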
8094 
8095   if (N < 64)
8096     av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
8097   else
8098     av1_wedge_compute_delta_squares(ds, r0, r1, N);
8099 
8100   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
8101     mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
8102 
8103     // TODO(jingning): Make sse2 functions support N = 16 case
8104     if (N < 64)
8105       wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
8106     else
8107       wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
8108 
8109     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
8110     if (N < 64)
8111       sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
8112     else
8113       sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
8114     sse = ROUND_POWER_OF_TWO(sse, bd_round);
8115 
8116     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
8117     rd = RDCOST(x->rdmult, rate, dist);
8118 
8119     if (rd < best_rd) {
8120       *best_wedge_index = wedge_index;
8121       *best_wedge_sign = wedge_sign;
8122       best_rd = rd;
8123     }
8124   }
8125 
8126   return best_rd;
8127 }
8128 
// Choose the best wedge index with the specified sign
8130 static int64_t pick_wedge_fixed_sign(
8131     const AV1_COMP *const cpi, const MACROBLOCK *const x,
8132     const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
8133     const int wedge_sign, int *const best_wedge_index) {
8134   const MACROBLOCKD *const xd = &x->e_mbd;
8135   const struct buf_2d *const src = &x->plane[0].src;
8136   const int bw = block_size_wide[bsize];
8137   const int bh = block_size_high[bsize];
8138   const int N = bw * bh;
8139   int rate;
8140   int64_t dist;
8141   int64_t rd, best_rd = INT64_MAX;
8142   int wedge_index;
8143   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
8144   const uint8_t *mask;
8145   uint64_t sse;
8146 #if CONFIG_HIGHBITDEPTH
8147   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
8148   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
8149 #else
8150   const int bd_round = 0;
8151 #endif  // CONFIG_HIGHBITDEPTH
8152 
8153   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
8154   DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
8155 
8156 #if CONFIG_HIGHBITDEPTH
8157   if (hbd) {
8158     aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
8159                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
8160     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
8161                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
8162   } else  // NOLINT
8163 #endif    // CONFIG_HIGHBITDEPTH
8164   {
8165     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
8166     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
8167   }
8168 
8169   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
8170     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
8171     if (N < 64)
8172       sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
8173     else
8174       sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
8175     sse = ROUND_POWER_OF_TWO(sse, bd_round);
8176 
8177     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
8178     rd = RDCOST(x->rdmult, rate, dist);
8179 
8180     if (rd < best_rd) {
8181       *best_wedge_index = wedge_index;
8182       best_rd = rd;
8183     }
8184   }
8185 
8186   return best_rd;
8187 }
8188 
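// Picks the interinter wedge (index and sign) for the block. When
// fast_wedge_sign_estimate is enabled, the sign is estimated up front and
// only the index is searched; otherwise both are searched jointly.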
8189 static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
8190                                      MACROBLOCK *const x,
8191                                      const BLOCK_SIZE bsize,
8192                                      const uint8_t *const p0,
8193                                      const uint8_t *const p1) {
8194   MACROBLOCKD *const xd = &x->e_mbd;
8195   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8196   const int bw = block_size_wide[bsize];
8197 
8198   int64_t rd;
8199   int wedge_index = -1;
8200   int wedge_sign = 0;
8201 
8202   assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
8203   assert(cpi->common.allow_masked_compound);
8204 
8205   if (cpi->sf.fast_wedge_sign_estimate) {
8206     wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
8207     rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
8208   } else {
8209     rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
8210   }
8211 
8212   mbmi->wedge_sign = wedge_sign;
8213   mbmi->wedge_index = wedge_index;
8214   return rd;
8215 }
8216 #endif  // CONFIG_WEDGE
8217 
8218 #if CONFIG_COMPOUND_SEGMENT
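// Picks the best segmentation-based compound mask: each SEG_MASK_TYPE is
// built in turn, its masked-residual SSE is modeled into an RD cost, and the
// winning type is stored in mbmi->mask_type with its mask rebuilt into
// xd->seg_mask.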
8219 static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
8220                                    MACROBLOCK *const x, const BLOCK_SIZE bsize,
8221                                    const uint8_t *const p0,
8222                                    const uint8_t *const p1) {
8223   MACROBLOCKD *const xd = &x->e_mbd;
8224   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8225   const struct buf_2d *const src = &x->plane[0].src;
8226   const int bw = block_size_wide[bsize];
8227   const int bh = block_size_high[bsize];
8228   const int N = bw * bh;
8229   int rate;
8230   uint64_t sse;
8231   int64_t dist;
8232   int64_t rd0;
8233   SEG_MASK_TYPE cur_mask_type;
8234   int64_t best_rd = INT64_MAX;
8235   SEG_MASK_TYPE best_mask_type = 0;
8236 #if CONFIG_HIGHBITDEPTH
8237   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
8238   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
8239 #else
8240   const int bd_round = 0;
8241 #endif  // CONFIG_HIGHBITDEPTH
8242   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
8243   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
8244   DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
8245 
8246 #if CONFIG_HIGHBITDEPTH
8247   if (hbd) {
8248     aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
8249                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
8250     aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
8251                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
8252     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
8253                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
8254   } else  // NOLINT
8255 #endif    // CONFIG_HIGHBITDEPTH
8256   {
8257     aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
8258     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
8259     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
8260   }
8261 
8262   // try each mask type and its inverse
8263   for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) {
8264 // build mask and inverse
8265 #if CONFIG_HIGHBITDEPTH
8266     if (hbd)
8267       build_compound_seg_mask_highbd(
8268           xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
8269           CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
8270     else
8271 #endif  // CONFIG_HIGHBITDEPTH
8272       build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw,
8273                               bsize, bh, bw);
8274 
8275     // compute rd for mask
8276     sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
8277     sse = ROUND_POWER_OF_TWO(sse, bd_round);
8278 
8279     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
8280     rd0 = RDCOST(x->rdmult, rate, dist);
8281 
8282     if (rd0 < best_rd) {
8283       best_mask_type = cur_mask_type;
8284       best_rd = rd0;
8285     }
8286   }
8287 
8288   // make final mask
8289   mbmi->mask_type = best_mask_type;
8290 #if CONFIG_HIGHBITDEPTH
8291   if (hbd)
8292     build_compound_seg_mask_highbd(
8293         xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw,
8294         CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
8295   else
8296 #endif  // CONFIG_HIGHBITDEPTH
8297     build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw,
8298                             bsize, bh, bw);
8299 
8300   return best_rd;
8301 }
8302 #endif  // CONFIG_COMPOUND_SEGMENT
8303 
8304 #if CONFIG_WEDGE && CONFIG_INTERINTRA
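// Picks the interintra wedge index; the wedge sign is fixed to 0 for
// interintra prediction, so only the index needs to be searched.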
8305 static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
8306                                      const MACROBLOCK *const x,
8307                                      const BLOCK_SIZE bsize,
8308                                      const uint8_t *const p0,
8309                                      const uint8_t *const p1) {
8310   const MACROBLOCKD *const xd = &x->e_mbd;
8311   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8312 
8313   int64_t rd;
8314   int wedge_index = -1;
8315 
8316   assert(is_interintra_wedge_used(bsize));
8317   assert(cpi->common.allow_interintra_compound);
8318 
8319   rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
8320 
8321   mbmi->interintra_wedge_sign = 0;
8322   mbmi->interintra_wedge_index = wedge_index;
8323   return rd;
8324 }
8325 #endif  // CONFIG_WEDGE && CONFIG_INTERINTRA
8326 
8327 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
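// Dispatches the mask search to pick_interinter_wedge() or
// pick_interinter_seg() based on the block's interinter compound type.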
8328 static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
8329                                     const BLOCK_SIZE bsize,
8330                                     const uint8_t *const p0,
8331                                     const uint8_t *const p1) {
8332   const COMPOUND_TYPE compound_type =
8333       x->e_mbd.mi[0]->mbmi.interinter_compound_type;
8334   switch (compound_type) {
8335 #if CONFIG_WEDGE
8336     case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
8337 #endif  // CONFIG_WEDGE
8338 #if CONFIG_COMPOUND_SEGMENT
8339     case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1);
8340 #endif  // CONFIG_COMPOUND_SEGMENT
8341     default: assert(0); return 0;
8342   }
8343 }
8344 
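// Runs the masked motion search appropriate to the compound mode: both MVs
// for NEW_NEWMV, only mv[0] for NEW_NEAREST/NEW_NEAR, only mv[1] for
// NEAREST_NEW/NEAR_NEW. Returns the rate of the refined motion vector(s).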
8345 static int interinter_compound_motion_search(
8346     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
8347     const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
8348   MACROBLOCKD *const xd = &x->e_mbd;
8349   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8350   int_mv tmp_mv[2];
8351   int tmp_rate_mv = 0;
8352   const INTERINTER_COMPOUND_DATA compound_data = {
8353 #if CONFIG_WEDGE
8354     mbmi->wedge_index,
8355     mbmi->wedge_sign,
8356 #endif  // CONFIG_WEDGE
8357 #if CONFIG_COMPOUND_SEGMENT
8358     mbmi->mask_type,
8359     xd->seg_mask,
8360 #endif  // CONFIG_COMPOUND_SEGMENT
8361     mbmi->interinter_compound_type
8362   };
8363 #if CONFIG_COMPOUND_SINGLEREF
8364   // NOTE: Mode is needed to identify the compound mode prediction, regardless
8365   //       of comp refs or single ref.
8366   mbmi->mode = this_mode;
8367 #endif  // CONFIG_COMPOUND_SINGLEREF
8368 
8369   if (this_mode == NEW_NEWMV
8370 #if CONFIG_COMPOUND_SINGLEREF
8371       || this_mode == SR_NEW_NEWMV
8372 #endif  // CONFIG_COMPOUND_SINGLEREF
8373       ) {
8374     do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
8375                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2);
8376     mbmi->mv[0].as_int = tmp_mv[0].as_int;
8377     mbmi->mv[1].as_int = tmp_mv[1].as_int;
8378   } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
8379     do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
8380                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0);
8381     mbmi->mv[0].as_int = tmp_mv[0].as_int;
8382   } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
8383 #if CONFIG_COMPOUND_SINGLEREF
8384              // || this_mode == SR_NEAREST_NEWMV
8385              || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV
8386 #endif  // CONFIG_COMPOUND_SINGLEREF
8387              ) {
8388     do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
8389                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1);
8390     mbmi->mv[1].as_int = tmp_mv[1].as_int;
8391   }
8392   return tmp_rate_mv;
8393 }
8394 
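// Picks the best mask for the current compound type, optionally re-runs the
// masked motion search for NEWMV-class modes (reverting to the original MVs
// when the refinement does not beat the unrefined RD estimate), builds the
// final predictor, and returns the estimated RD cost.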
8395 static int64_t build_and_cost_compound_type(
8396     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
8397     const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv,
8398     BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
8399     int *strides, int mi_row, int mi_col) {
8400   const AV1_COMMON *const cm = &cpi->common;
8401   MACROBLOCKD *xd = &x->e_mbd;
8402   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8403   int rate_sum;
8404   int64_t dist_sum;
8405   int64_t best_rd_cur = INT64_MAX;
8406   int64_t rd = INT64_MAX;
8407   int tmp_skip_txfm_sb;
8408   int64_t tmp_skip_sse_sb;
8409   const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type;
8410 
8411   best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
8412   best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0);
8413 
8414   if (have_newmv_in_inter_mode(this_mode) &&
8415       use_masked_motion_search(compound_type)) {
8416     *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
8417                                                      this_mode, mi_row, mi_col);
8418     av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
8419     model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
8420                     &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
8421     rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum);
8422     if (rd >= best_rd_cur) {
8423       mbmi->mv[0].as_int = cur_mv[0].as_int;
8424       mbmi->mv[1].as_int = cur_mv[1].as_int;
8425       *out_rate_mv = rate_mv;
8426       av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
8427 #if CONFIG_SUPERTX
8428                                                0, 0,
8429 #endif  // CONFIG_SUPERTX
8430                                                preds0, strides, preds1,
8431                                                strides);
8432     }
8433     av1_subtract_plane(x, bsize, 0);
8434     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
8435                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
8436     if (rd != INT64_MAX)
8437       rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum);
8438     best_rd_cur = rd;
8439 
8440   } else {
8441     av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
8442 #if CONFIG_SUPERTX
8443                                              0, 0,
8444 #endif  // CONFIG_SUPERTX
8445                                              preds0, strides, preds1, strides);
8446     av1_subtract_plane(x, bsize, 0);
8447     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
8448                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
8449     if (rd != INT64_MAX)
8450       rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
8451     best_rd_cur = rd;
8452   }
8453   return best_rd_cur;
8454 }
8455 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
8456 
8457 typedef struct {
8458 #if CONFIG_MOTION_VAR
8459   // Inter prediction buffers and respective strides
8460   uint8_t *above_pred_buf[MAX_MB_PLANE];
8461   int above_pred_stride[MAX_MB_PLANE];
8462   uint8_t *left_pred_buf[MAX_MB_PLANE];
8463   int left_pred_stride[MAX_MB_PLANE];
8464 #endif  // CONFIG_MOTION_VAR
  // Pointer to array of NEWMV motion vectors searched, one per ref frame
  int_mv *single_newmv;
  // Pointer to array of the rates of the motion vectors above
  // Should point to first of 2 arrays in 2D array
  int *single_newmv_rate;
8469   // Pointer to array of predicted rate-distortion
8470   // Should point to first of 2 arrays in 2D array
8471   int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
8472   InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
8473 } HandleInterModeArgs;
8474 
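// Computes the NEWMV motion vector(s) and their rate for the current mode:
// a joint or single-sided compound search for compound modes, a plain single
// motion search otherwise. Returns INT64_MAX when the single-reference
// search yields an invalid MV, and 0 otherwise.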
8475 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
8476                             const BLOCK_SIZE bsize,
8477                             int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
8478 #if CONFIG_COMPOUND_SINGLEREF
8479                             int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME],
8480 #endif  // CONFIG_COMPOUND_SINGLEREF
8481                             const int mi_row, const int mi_col,
8482                             int *const rate_mv, int_mv *const single_newmv,
8483                             HandleInterModeArgs *const args) {
8484   const MACROBLOCKD *const xd = &x->e_mbd;
8485   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8486   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
8487   const int is_comp_pred = has_second_ref(mbmi);
8488   const PREDICTION_MODE this_mode = mbmi->mode;
8489   const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
8490   int_mv *const frame_mv = mode_mv[this_mode];
8491 #if CONFIG_COMPOUND_SINGLEREF
8492   int_mv *const frame_comp_mv = mode_comp_mv[this_mode];
8493 #endif  // CONFIG_COMPOUND_SINGLEREF
8494   const int refs[2] = { mbmi->ref_frame[0],
8495                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
8496   int i;
8497 
8498   (void)args;
8499 
8500   if (is_comp_pred) {
8501     for (i = 0; i < 2; ++i) {
8502       single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
8503     }
8504 
8505     if (this_mode == NEW_NEWMV) {
8506       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
8507       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
8508 
8509       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
8510         joint_motion_search(cpi, x, bsize, frame_mv,
8511 #if CONFIG_COMPOUND_SINGLEREF
8512                             NULL,  // int_mv *frame_comp_mv
8513 #endif                             // CONFIG_COMPOUND_SINGLEREF
8514                             mi_row, mi_col, NULL, NULL, 0, rate_mv, 0);
8515       } else {
8516         *rate_mv = 0;
8517         for (i = 0; i < 2; ++i) {
8518           av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
8519           *rate_mv += av1_mv_bit_cost(
8520               &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
8521               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
8522         }
8523       }
8524     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
8525       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
8526       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
8527         frame_mv[refs[0]].as_int =
8528             mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int;
8529         compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
8530 #if CONFIG_COMPOUND_SINGLEREF
8531                                                  NULL,
8532 #endif  // CONFIG_COMPOUND_SINGLEREF
8533                                                  mi_row, mi_col, NULL, 0,
8534                                                  rate_mv, 0, 1);
8535       } else {
8536         av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx);
8537         *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
8538                                    &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
8539                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
8540       }
8541     } else {
8542       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
8543       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
8544       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
8545         frame_mv[refs[1]].as_int =
8546             mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int;
8547         compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
8548 #if CONFIG_COMPOUND_SINGLEREF
8549                                                  NULL,
8550 #endif  // CONFIG_COMPOUND_SINGLEREF
8551                                                  mi_row, mi_col, NULL, 0,
8552                                                  rate_mv, 0, 0);
8553       } else {
8554         av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
8555         *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
8556                                    &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
8557                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
8558       }
8559     }
8560 #if CONFIG_COMPOUND_SINGLEREF
8561   } else if (is_inter_singleref_comp_mode(this_mode)) {
8562     // Single ref comp mode
8563     const int mode0 = compound_ref0_mode(this_mode);
8564 
8565     single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int;
8566     frame_mv[refs[0]].as_int = (mode0 == NEWMV)
8567                                    ? single_newmv[refs[0]].as_int
8568                                    : mode_mv[mode0][refs[0]].as_int;
8569     assert(compound_ref1_mode(this_mode) == NEWMV);
8570     frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
8571 
8572     if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
8573       if (this_mode == SR_NEW_NEWMV) {
8574         joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row,
8575                             mi_col, NULL, NULL, 0, rate_mv, 0);
8576       } else {
8577         assert(  // this_mode == SR_NEAREST_NEWMV ||
8578             this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV);
8579         compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
8580                                                  frame_comp_mv, mi_row, mi_col,
8581                                                  NULL, 0, rate_mv, 0, 1);
8582       }
8583     } else {
8584       *rate_mv = 0;
8585       av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
8586       if (mode0 == NEWMV)
8587         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
8588                                     &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
8589                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
8590       *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv,
8591                                   &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
8592                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
8593     }
8594 #endif  // CONFIG_COMPOUND_SINGLEREF
8595   } else {
8596     if (is_comp_interintra_pred) {
8597       x->best_mv = args->single_newmv[refs[0]];
8598       *rate_mv = args->single_newmv_rate[refs[0]];
8599     } else {
8600       single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
8601       args->single_newmv[refs[0]] = x->best_mv;
8602       args->single_newmv_rate[refs[0]] = *rate_mv;
8603     }
8604 
8605     if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
8606 
8607     frame_mv[refs[0]] = x->best_mv;
8608     xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
8609 
8610     // Estimate the rate implications of a new mv but discount this
8611     // under certain circumstances where we want to help initiate a weak
8612     // motion field, where the distortion gain for a single block may not
8613     // be enough to overcome the cost of a new mv.
8614     if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
8615       *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
8616     }
8617   }
8618 
8619   return 0;
8620 }
8621 
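// Searches the switchable interpolation filters (all DUAL_FILTER_SET_SIZE
// pairs when CONFIG_DUAL_FILTER is on), keeping the combination with the
// lowest modeled RD cost in mbmi->interp_filters. EIGHTTAP_REGULAR serves as
// the baseline and is evaluated before the loop.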
8622 int64_t interpolation_filter_search(
8623     MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
8624     int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
8625     BUFFER_SET *const orig_dst,
8626     InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME],
8627     int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
8628     int64_t *const skip_sse_sb) {
8629   const AV1_COMMON *cm = &cpi->common;
8630   MACROBLOCKD *const xd = &x->e_mbd;
8631   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8632   int i;
8633   int tmp_rate;
8634   int64_t tmp_dist;
8635 
8636   (void)single_filter;
8637 
8638   InterpFilter assign_filter = SWITCHABLE;
8639 
8640   if (cm->interp_filter == SWITCHABLE) {
8641 #if !CONFIG_DUAL_FILTER
8642     assign_filter = av1_is_interp_needed(xd)
8643                         ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
8644                                                 single_filter)
8645                         : cm->interp_filter;
8646 #endif  // !CONFIG_DUAL_FILTER
8647   } else {
8648     assign_filter = cm->interp_filter;
8649   }
8650 
8651   set_default_interp_filters(mbmi, assign_filter);
8652 
8653   *switchable_rate = av1_get_switchable_rate(cm, x, xd);
8654   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
8655   model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
8656                   skip_txfm_sb, skip_sse_sb);
8657   *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
8658 
8659   if (assign_filter == SWITCHABLE) {
8660     // do interp_filter search
8661     if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) {
8662 #if CONFIG_DUAL_FILTER
8663       const int filter_set_size = DUAL_FILTER_SET_SIZE;
8664 #else
8665       const int filter_set_size = SWITCHABLE_FILTERS;
8666 #endif  // CONFIG_DUAL_FILTER
8667       int best_in_temp = 0;
8668       InterpFilters best_filters = mbmi->interp_filters;
8669       restore_dst_buf(xd, *tmp_dst);
8670       // EIGHTTAP_REGULAR mode is calculated beforehand
8671       for (i = 1; i < filter_set_size; ++i) {
8672         int tmp_skip_sb = 0;
8673         int64_t tmp_skip_sse = INT64_MAX;
8674         int tmp_rs;
8675         int64_t tmp_rd;
8676 #if CONFIG_DUAL_FILTER
8677         mbmi->interp_filters =
8678             av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
8679 #else
8680         mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i);
8681 #endif  // CONFIG_DUAL_FILTER
8682         tmp_rs = av1_get_switchable_rate(cm, x, xd);
8683         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
8684         model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
8685                         &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
8686         tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
8687 
8688         if (tmp_rd < *rd) {
8689           *rd = tmp_rd;
8690           *switchable_rate = av1_get_switchable_rate(cm, x, xd);
8691           best_filters = mbmi->interp_filters;
8692           *skip_txfm_sb = tmp_skip_sb;
8693           *skip_sse_sb = tmp_skip_sse;
8694           best_in_temp = !best_in_temp;
8695           if (best_in_temp) {
8696             restore_dst_buf(xd, *orig_dst);
8697           } else {
8698             restore_dst_buf(xd, *tmp_dst);
8699           }
8700         }
8701       }
8702       if (best_in_temp) {
8703         restore_dst_buf(xd, *tmp_dst);
8704       } else {
8705         restore_dst_buf(xd, *orig_dst);
8706       }
8707       mbmi->interp_filters = best_filters;
8708     } else {
8709       assert(mbmi->interp_filters ==
8710              av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
8711     }
8712   }
8713 
8714   return 0;
8715 }
8716 
8717 #if CONFIG_DUAL_FILTER
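// Falls back to EIGHTTAP_REGULAR in each direction whose MV component has no
// subpel part, since no interpolation is applied in that direction.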
8718 static InterpFilters condition_interp_filters_on_mv(
8719     InterpFilters interp_filters, const MACROBLOCKD *xd) {
8720   InterpFilter filters[2];
8721   for (int i = 0; i < 2; ++i)
8722     filters[i] = (has_subpel_mv_component(xd->mi[0], xd, i))
8723                      ? av1_extract_interp_filter(interp_filters, i)
8724                      : EIGHTTAP_REGULAR;
8725 
8726   return av1_make_interp_filters(filters[0], filters[1]);
8727 }
8728 #endif
8729 
// TODO(afergs): Refactor the MBMI references in here - there are four
// TODO(afergs): Refactor optional args - add them to a struct or remove
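// Evaluates every allowed motion mode (SIMPLE_TRANSLATION, plus OBMC_CAUSAL
// and/or WARPED_CAUSAL when enabled) for the current inter mode, running the
// luma/chroma transform RD search for each candidate, and keeps the stats of
// the best one in rd_stats / rd_stats_y / rd_stats_uv.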
8732 static int64_t motion_mode_rd(
8733     const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
8734     RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
8735     int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
8736     int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd,
8737     const int *refs, int rate_mv,
8738 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
    // Possibly only used when CONFIG_WARPED_MOTION is on.
8740     int_mv *const single_newmv, int rate2_bmc_nocoeff,
8741     MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc,
8742 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8743     int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
8744   const AV1_COMMON *const cm = &cpi->common;
8745   MACROBLOCKD *xd = &x->e_mbd;
8746   MODE_INFO *mi = xd->mi[0];
8747   MB_MODE_INFO *mbmi = &mi->mbmi;
8748   const int is_comp_pred = has_second_ref(mbmi);
8749   const PREDICTION_MODE this_mode = mbmi->mode;
8750 
8751   (void)mode_mv;
8752   (void)mi_row;
8753   (void)mi_col;
8754   (void)args;
8755   (void)refs;
8756   (void)rate_mv;
8757   (void)is_comp_pred;
8758   (void)this_mode;
8759 #if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
8760   (void)single_newmv;
8761 #endif
8762 
8763 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8764   MOTION_MODE motion_mode, last_motion_mode_allowed;
8765   int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
8766   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
8767   MB_MODE_INFO base_mbmi, best_mbmi;
8768 #if CONFIG_VAR_TX
8769   uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
8770 #endif  // CONFIG_VAR_TX
8771 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8772 
8773 #if CONFIG_WARPED_MOTION
8774 #if WARPED_MOTION_SORT_SAMPLES
8775   int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
8776   int pts_mv0[SAMPLES_ARRAY_SIZE];
8777   int total_samples;
8778 #else
8779   int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
8780 #endif  // WARPED_MOTION_SORT_SAMPLES
8781 #endif  // CONFIG_WARPED_MOTION
8782 
8783 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8784   av1_invalid_rd_stats(&best_rd_stats);
8785 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8786 
8787   if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs;
8788 #if CONFIG_WARPED_MOTION
8789   aom_clear_system_state();
8790 #if WARPED_MOTION_SORT_SAMPLES
8791   mbmi->num_proj_ref[0] =
8792       findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0);
8793   total_samples = mbmi->num_proj_ref[0];
8794 #else
8795   mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
8796 #endif  // WARPED_MOTION_SORT_SAMPLES
8797   best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
8798 #endif  // CONFIG_WARPED_MOTION
8799 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8800   rate2_nocoeff = rd_stats->rate;
8801   last_motion_mode_allowed = motion_mode_allowed(
8802 #if CONFIG_GLOBAL_MOTION
8803       0, xd->global_motion,
8804 #endif  // CONFIG_GLOBAL_MOTION
8805 #if CONFIG_WARPED_MOTION
8806       xd,
8807 #endif
8808       mi);
8809   base_mbmi = *mbmi;
8810 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8811 
8812 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8813   int64_t best_rd = INT64_MAX;
8814   for (motion_mode = SIMPLE_TRANSLATION;
8815        motion_mode <= last_motion_mode_allowed; motion_mode++) {
8816     int64_t tmp_rd = INT64_MAX;
8817     int tmp_rate;
8818     int64_t tmp_dist;
8819     int tmp_rate2 =
8820         motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
8821 
8822 #if CONFIG_NCOBMC_ADAPT_WEIGHT
8823     // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT
8824     // right now since it requires mvs from all neighboring blocks. We will
    // check if this mode is beneficial after all the mvs in the current
8826     // superblock are selected.
8827     if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue;
8828 #endif
8829 
8830     *mbmi = base_mbmi;
8831     mbmi->motion_mode = motion_mode;
8832 #if CONFIG_MOTION_VAR
8833     if (mbmi->motion_mode == OBMC_CAUSAL) {
8834       *mbmi = *best_bmc_mbmi;
8835       mbmi->motion_mode = OBMC_CAUSAL;
8836       if (!is_comp_pred &&
8837 #if CONFIG_COMPOUND_SINGLEREF
8838           !is_inter_singleref_comp_mode(this_mode) &&
8839 #endif  // CONFIG_COMPOUND_SINGLEREF
8840           have_newmv_in_inter_mode(this_mode)) {
8841         int tmp_rate_mv = 0;
8842 
8843         single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
8844         mbmi->mv[0].as_int = x->best_mv.as_int;
8845         if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
8846                                 refs[0])) {
8847           tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
8848         }
8849         tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
8850 #if CONFIG_DUAL_FILTER
8851         mbmi->interp_filters =
8852             condition_interp_filters_on_mv(mbmi->interp_filters, xd);
8853 #endif  // CONFIG_DUAL_FILTER
8854         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
8855       } else {
8856         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
8857       }
8858       av1_build_obmc_inter_prediction(
8859           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
8860           args->left_pred_buf, args->left_pred_stride);
8861       model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
8862                       &tmp_dist, skip_txfm_sb, skip_sse_sb);
8863     }
8864 #endif  // CONFIG_MOTION_VAR
8865 
8866 #if CONFIG_WARPED_MOTION
8867     if (mbmi->motion_mode == WARPED_CAUSAL) {
8868 #if WARPED_MOTION_SORT_SAMPLES
8869       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
8870 #endif  // WARPED_MOTION_SORT_SAMPLES
8871       *mbmi = *best_bmc_mbmi;
8872       mbmi->motion_mode = WARPED_CAUSAL;
8873       mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
8874       mbmi->interp_filters = av1_broadcast_interp_filter(
8875           av1_unswitchable_filter(cm->interp_filter));
8876 
8877 #if WARPED_MOTION_SORT_SAMPLES
8878       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
8879       memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
8880       // Rank the samples by motion vector difference
8881       if (mbmi->num_proj_ref[0] > 1) {
8882         mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts,
8883                                             pts_inref, mbmi->num_proj_ref[0]);
8884         best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
8885       }
8886 #endif  // WARPED_MOTION_SORT_SAMPLES
8887 
8888       if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
8889                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
8890                            &mbmi->wm_params[0], mi_row, mi_col)) {
8891         // Refine MV for NEWMV mode
8892         if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
8893           int tmp_rate_mv = 0;
8894           const int_mv mv0 = mbmi->mv[0];
8895           WarpedMotionParams wm_params0 = mbmi->wm_params[0];
8896 #if WARPED_MOTION_SORT_SAMPLES
8897           int num_proj_ref0 = mbmi->num_proj_ref[0];
8898 
8899           // Refine MV in a small range.
8900           av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
8901                                pts_mv0, total_samples);
8902 #else
8903           // Refine MV in a small range.
8904           av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref);
8905 #endif  // WARPED_MOTION_SORT_SAMPLES
8906 
8907           // Keep the refined MV and WM parameters.
8908           if (mv0.as_int != mbmi->mv[0].as_int) {
8909             const int ref = refs[0];
8910             const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
8911 
8912             tmp_rate_mv =
8913                 av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost,
8914                                 x->mvcost, MV_COST_WEIGHT);
8915 
8916             if (cpi->sf.adaptive_motion_search)
8917               x->pred_mv[ref] = mbmi->mv[0].as_mv;
8918 
8919             single_newmv[ref] = mbmi->mv[0];
8920 
8921             if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
8922                                     refs[0])) {
8923               tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
8924             }
8925 #if WARPED_MOTION_SORT_SAMPLES
8926             best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
8927 #endif  // WARPED_MOTION_SORT_SAMPLES
8928             tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
8929 #if CONFIG_DUAL_FILTER
8930             mbmi->interp_filters =
8931                 condition_interp_filters_on_mv(mbmi->interp_filters, xd);
8932 #endif  // CONFIG_DUAL_FILTER
8933           } else {
8934             // Restore the old MV and WM parameters.
8935             mbmi->mv[0] = mv0;
8936             mbmi->wm_params[0] = wm_params0;
8937 #if WARPED_MOTION_SORT_SAMPLES
8938             mbmi->num_proj_ref[0] = num_proj_ref0;
8939 #endif  // WARPED_MOTION_SORT_SAMPLES
8940           }
8941         }
8942 
8943         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
8944         model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
8945                         &tmp_dist, skip_txfm_sb, skip_sse_sb);
8946       } else {
8947         continue;
8948       }
8949     }
8950 #endif  // CONFIG_WARPED_MOTION
8951     x->skip = 0;
8952 
8953     rd_stats->dist = 0;
8954     rd_stats->sse = 0;
8955     rd_stats->skip = 1;
8956     rd_stats->rate = tmp_rate2;
8957     if (last_motion_mode_allowed > SIMPLE_TRANSLATION) {
8958 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
8959       if (last_motion_mode_allowed == WARPED_CAUSAL)
8960 #endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
8961         rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
8962 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
8963       else
8964         rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
8965 #endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
8966     }
8967 #if CONFIG_WARPED_MOTION
8968     if (mbmi->motion_mode == WARPED_CAUSAL) {
8969       rd_stats->rate -= rs;
8970     }
8971 #endif  // CONFIG_WARPED_MOTION
8972 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8973     if (!*skip_txfm_sb) {
8974       int64_t rdcosty = INT64_MAX;
8975       int is_cost_valid_uv = 0;
8976 
8977       // cost and distortion
8978       av1_subtract_plane(x, bsize, 0);
8979 #if CONFIG_VAR_TX
8980       if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
8981         select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
8982       } else {
8983         int idx, idy;
8984         super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
8985         for (idy = 0; idy < xd->n8_h; ++idy)
8986           for (idx = 0; idx < xd->n8_w; ++idx)
8987             mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
8988         memset(x->blk_skip[0], rd_stats_y->skip,
8989                sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
8990       }
8991 #else
8992     /* clang-format off */
8993       super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
8994 /* clang-format on */
8995 #endif  // CONFIG_VAR_TX
8996 
8997       if (rd_stats_y->rate == INT_MAX) {
8998         av1_invalid_rd_stats(rd_stats);
8999 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9000         if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
9001           continue;
9002         } else {
9003 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9004           restore_dst_buf(xd, *orig_dst);
9005           return INT64_MAX;
9006 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9007         }
9008 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9009       }
9010 
9011       av1_merge_rd_stats(rd_stats, rd_stats_y);
9012 
9013       rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
9014       rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse));
9015 /* clang-format off */
9016 #if CONFIG_VAR_TX
9017       is_cost_valid_uv =
9018           inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
9019 #else
9020       is_cost_valid_uv =
9021           super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
9022 #endif  // CONFIG_VAR_TX
9023       if (!is_cost_valid_uv) {
9024 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9025         continue;
9026 #else
9027         restore_dst_buf(xd, *orig_dst);
9028         return INT64_MAX;
9029 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9030       }
9031       /* clang-format on */
9032       av1_merge_rd_stats(rd_stats, rd_stats_uv);
9033 #if CONFIG_RD_DEBUG
9034       // record transform block coefficient cost
9035       // TODO(angiebird): So far rd_debug tool only detects discrepancy of
9036       // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi
9037       // here because we already collect the coefficient cost. Move this part to
9038       // other place when we need to compare non-coefficient cost.
9039       mbmi->rd_stats = *rd_stats;
9040 #endif  // CONFIG_RD_DEBUG
9041 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9042       if (rd_stats->skip) {
9043         rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
9044         rd_stats_y->rate = 0;
9045         rd_stats_uv->rate = 0;
9046         rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9047         mbmi->skip = 0;
        // Here mbmi->skip temporarily plays the same role as this_skip2.
9049       } else if (!xd->lossless[mbmi->segment_id] &&
9050                  (RDCOST(x->rdmult,
9051                          rd_stats_y->rate + rd_stats_uv->rate +
9052                              av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
9053                          rd_stats->dist) >=
9054                   RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1),
9055                          rd_stats->sse))) {
9056         rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
9057         rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9058         rd_stats->dist = rd_stats->sse;
9059         rd_stats_y->rate = 0;
9060         rd_stats_uv->rate = 0;
9061         mbmi->skip = 1;
9062       } else {
9063         rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
9064         mbmi->skip = 0;
9065       }
9066       *disable_skip = 0;
9067 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9068     } else {
9069       x->skip = 1;
9070       *disable_skip = 1;
9071       mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
9072 
// The cost of the skip bit needs to be added.
9074 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9075       mbmi->skip = 0;
9076 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9077       rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9078 
9079       rd_stats->dist = *skip_sse_sb;
9080       rd_stats->sse = *skip_sse_sb;
9081       rd_stats_y->rate = 0;
9082       rd_stats_uv->rate = 0;
9083       rd_stats->skip = 1;
9084     }
9085 
9086 #if CONFIG_GLOBAL_MOTION
9087     if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
9088       if (is_nontrans_global_motion(xd)) {
9089         rd_stats->rate -= rs;
9090         mbmi->interp_filters = av1_broadcast_interp_filter(
9091             av1_unswitchable_filter(cm->interp_filter));
9092       }
9093     }
9094 #endif  // CONFIG_GLOBAL_MOTION
9095 
9096 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9097     tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
9098     if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
9099       best_mbmi = *mbmi;
9100       best_rd = tmp_rd;
9101       best_rd_stats = *rd_stats;
9102       best_rd_stats_y = *rd_stats_y;
9103       best_rd_stats_uv = *rd_stats_uv;
9104 #if CONFIG_VAR_TX
9105       for (int i = 0; i < MAX_MB_PLANE; ++i)
9106         memcpy(best_blk_skip[i], x->blk_skip[i],
9107                sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
9108 #endif  // CONFIG_VAR_TX
9109       best_xskip = x->skip;
9110       best_disable_skip = *disable_skip;
9111     }
9112   }
9113 
9114   if (best_rd == INT64_MAX) {
9115     av1_invalid_rd_stats(rd_stats);
9116     restore_dst_buf(xd, *orig_dst);
9117     return INT64_MAX;
9118   }
9119   *mbmi = best_mbmi;
9120   *rd_stats = best_rd_stats;
9121   *rd_stats_y = best_rd_stats_y;
9122   *rd_stats_uv = best_rd_stats_uv;
9123 #if CONFIG_VAR_TX
9124   for (int i = 0; i < MAX_MB_PLANE; ++i)
9125     memcpy(x->blk_skip[i], best_blk_skip[i],
9126            sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
9127 #endif  // CONFIG_VAR_TX
9128   x->skip = best_xskip;
9129   *disable_skip = best_disable_skip;
9130 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9131 
9132   restore_dst_buf(xd, *orig_dst);
9133   return 0;
9134 }
9135 
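// Full RD evaluation of a single inter mode: sets up the mode context and
// prediction buffers, searches the NEWMV(s) and interpolation filters, and
// evaluates the compound (wedge/segment) and interintra configurations
// before deferring to motion_mode_rd() above for the motion-mode and
// transform search.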
9136 static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
9137                                  BLOCK_SIZE bsize, RD_STATS *rd_stats,
9138                                  RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
9139                                  int *disable_skip,
9140                                  int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
9141 #if CONFIG_COMPOUND_SINGLEREF
9142                                  int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME],
9143 #endif  // CONFIG_COMPOUND_SINGLEREF
9144                                  int mi_row, int mi_col,
9145                                  HandleInterModeArgs *args,
9146                                  const int64_t ref_best_rd) {
9147   const AV1_COMMON *cm = &cpi->common;
9148   MACROBLOCKD *xd = &x->e_mbd;
9149   MODE_INFO *mi = xd->mi[0];
9150   MB_MODE_INFO *mbmi = &mi->mbmi;
9151   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
9152   const int is_comp_pred = has_second_ref(mbmi);
9153   const int this_mode = mbmi->mode;
9154 #if CONFIG_COMPOUND_SINGLEREF
9155   const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode);
9156 #endif  // CONFIG_COMPOUND_SINGLEREF
9157   int_mv *frame_mv = mode_mv[this_mode];
9158 #if CONFIG_COMPOUND_SINGLEREF
  // The compound MV for a compound mode with a single reference frame.
9160   int_mv *frame_comp_mv = mode_comp_mv[this_mode];
9161 #endif  // CONFIG_COMPOUND_SINGLEREF
9162   int i;
9163   int refs[2] = { mbmi->ref_frame[0],
9164                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
9165   int_mv cur_mv[2];
9166   int rate_mv = 0;
9167   int pred_exists = 1;
9168 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA
9169   const int bw = block_size_wide[bsize];
#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA
9171   int_mv single_newmv[TOTAL_REFS_PER_FRAME];
9172 #if CONFIG_INTERINTRA
9173   const int *const interintra_mode_cost =
9174       x->interintra_mode_cost[size_group_lookup[bsize]];
9175 #endif  // CONFIG_INTERINTRA
9176   const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
9177   uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
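  // When the source is high bit depth, samples are stored as 16-bit values,
  // so the temporary prediction buffer is allocated at twice the size.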
9178 #if CONFIG_HIGHBITDEPTH
9179   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
9180 #else
9181   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
9182 #endif  // CONFIG_HIGHBITDEPTH
9183   uint8_t *tmp_buf;
9184 
9185 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9186   int rate2_bmc_nocoeff;
9187   MB_MODE_INFO best_bmc_mbmi;
9188   int rate_mv_bmc;
9189 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9190   int64_t rd = INT64_MAX;
9191   BUFFER_SET orig_dst, tmp_dst;
9192   int rs = 0;
9193 
9194   int skip_txfm_sb = 0;
9195   int64_t skip_sse_sb = INT64_MAX;
9196   int16_t mode_ctx;
9197 #if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
  // Dummy fillers.
9199   mbmi->ncobmc_mode[0] = NO_OVERLAP;
9200   mbmi->ncobmc_mode[1] = NO_OVERLAP;
9201 #endif
9202 
9203 #if CONFIG_INTERINTRA
9204   int compmode_interintra_cost = 0;
9205   mbmi->use_wedge_interintra = 0;
9206 #endif
9207 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
9208   int compmode_interinter_cost = 0;
9209   mbmi->interinter_compound_type = COMPOUND_AVERAGE;
9210 #endif
9211 #if CONFIG_LGT_FROM_PRED
9212   mbmi->use_lgt = 0;
9213 #endif
9214 
9215 #if CONFIG_INTERINTRA
9216   if (!cm->allow_interintra_compound && is_comp_interintra_pred)
9217     return INT64_MAX;
9218 #endif  // CONFIG_INTERINTRA
9219 
9220   // is_comp_interintra_pred implies !is_comp_pred
9221   assert(!is_comp_interintra_pred || (!is_comp_pred));
9222   // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
9223   assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
9224 
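  // Compound modes read their context from compound_mode_context;
  // single-reference modes derive it via av1_mode_context_analyzer().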
9225 #if CONFIG_COMPOUND_SINGLEREF
9226   if (is_comp_pred || is_singleref_comp_mode)
9227 #else   // !CONFIG_COMPOUND_SINGLEREF
9228   if (is_comp_pred)
9229 #endif  // CONFIG_COMPOUND_SINGLEREF
9230     mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
9231   else
9232     mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
9233                                          mbmi->ref_frame, bsize, -1);
9234 
9235 #if CONFIG_HIGHBITDEPTH
9236   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
9237     tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
9238   else
9239 #endif  // CONFIG_HIGHBITDEPTH
9240     tmp_buf = tmp_buf_;
  // Make sure that we didn't leave the plane destination buffers set
  // to tmp_buf at the end of the last iteration.
9243   assert(xd->plane[0].dst.buf != tmp_buf);
9244 
9245 #if CONFIG_WARPED_MOTION
9246   mbmi->num_proj_ref[0] = 0;
9247   mbmi->num_proj_ref[1] = 0;
9248 #endif  // CONFIG_WARPED_MOTION
9249 
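  // A compound prediction is only viable when all of its MVs are valid.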
9250   if (is_comp_pred) {
9251     if (frame_mv[refs[0]].as_int == INVALID_MV ||
9252         frame_mv[refs[1]].as_int == INVALID_MV)
9253       return INT64_MAX;
9254 #if CONFIG_COMPOUND_SINGLEREF
9255   } else if (is_singleref_comp_mode) {
9256     if (frame_mv[refs[0]].as_int == INVALID_MV ||
9257         frame_comp_mv[refs[0]].as_int == INVALID_MV)
9258       return INT64_MAX;
9259 #endif  // CONFIG_COMPOUND_SINGLEREF
9260   }
9261 
9262   mbmi->motion_mode = SIMPLE_TRANSLATION;
9263   if (have_newmv_in_inter_mode(this_mode)) {
9264     const int64_t ret_val =
9265         handle_newmv(cpi, x, bsize, mode_mv,
9266 #if CONFIG_COMPOUND_SINGLEREF
9267                      mode_comp_mv,
9268 #endif  // CONFIG_COMPOUND_SINGLEREF
9269                      mi_row, mi_col, &rate_mv, single_newmv, args);
9270     if (ret_val != 0)
9271       return ret_val;
9272     else
9273       rd_stats->rate += rate_mv;
9274   }
9275   for (i = 0; i < is_comp_pred + 1; ++i) {
9276     cur_mv[i] = frame_mv[refs[i]];
    // Clip "next_nearest" so that it does not go too far outside the image.
9278     if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
9279     if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
9280     mbmi->mv[i].as_int = cur_mv[i].as_int;
9281   }
9282 
9283 #if CONFIG_COMPOUND_SINGLEREF
9284   if (!is_comp_pred && is_singleref_comp_mode) {
9285     cur_mv[1] = frame_comp_mv[refs[0]];
    // Clip "next_nearest" so that it does not go too far outside the image.
9287     if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd);
9288     if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
9289     mbmi->mv[1].as_int = cur_mv[1].as_int;
9290   }
9291 #endif  // CONFIG_COMPOUND_SINGLEREF
9292 
9293   if (this_mode == NEAREST_NEARESTMV) {
9294     if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
9295       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
9296       cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
9297 
9298       for (i = 0; i < 2; ++i) {
9299         clamp_mv2(&cur_mv[i].as_mv, xd);
9300         if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
9301         mbmi->mv[i].as_int = cur_mv[i].as_int;
9302       }
9303     }
9304   }
9305 
9306   if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
9307 #if CONFIG_COMPOUND_SINGLEREF
9308     if (this_mode == NEAREST_NEWMV ||  // this_mode == SR_NEAREST_NEWMV ||
9309         this_mode == SR_NEAREST_NEARMV)
9310 #else   // !CONFIG_COMPOUND_SINGLEREF
9311     if (this_mode == NEAREST_NEWMV)
9312 #endif  // CONFIG_COMPOUND_SINGLEREF
9313     {
9314       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
9315 
9316 #if CONFIG_AMVR
9317       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
9318                          cm->cur_frame_mv_precision_level);
9319 #else
9320       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
9321 #endif
9322       clamp_mv2(&cur_mv[0].as_mv, xd);
9323       if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
9324       mbmi->mv[0].as_int = cur_mv[0].as_int;
9325     }
9326 
9327     if (this_mode == NEW_NEARESTMV) {
9328       cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
9329 
9330 #if CONFIG_AMVR
9331       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
9332                          cm->cur_frame_mv_precision_level);
9333 #else
9334       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
9335 #endif
9336       clamp_mv2(&cur_mv[1].as_mv, xd);
9337       if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
9338       mbmi->mv[1].as_int = cur_mv[1].as_int;
9339     }
9340   }
9341 
9342   if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
9343     int ref_mv_idx = mbmi->ref_mv_idx + 1;
9344     if (this_mode == NEAR_NEWMV ||
9345 #if CONFIG_COMPOUND_SINGLEREF
9346         this_mode == SR_NEAR_NEWMV ||
9347 #endif  // CONFIG_COMPOUND_SINGLEREF
9348         this_mode == NEAR_NEARMV) {
9349       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
9350 
9351 #if CONFIG_AMVR
9352       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
9353                          cm->cur_frame_mv_precision_level);
9354 #else
9355       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
9356 #endif
9357       clamp_mv2(&cur_mv[0].as_mv, xd);
9358       if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
9359       mbmi->mv[0].as_int = cur_mv[0].as_int;
9360     }
9361 
9362     if (this_mode == NEW_NEARMV ||
9363 #if CONFIG_COMPOUND_SINGLEREF
9364         this_mode == SR_NEAREST_NEARMV ||
9365 #endif  // CONFIG_COMPOUND_SINGLEREF
9366         this_mode == NEAR_NEARMV) {
9367 #if CONFIG_COMPOUND_SINGLEREF
9368       if (this_mode == SR_NEAREST_NEARMV)
9369         cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
9370       else
9371 #endif  // CONFIG_COMPOUND_SINGLEREF
9372         cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
9373 
9374 #if CONFIG_AMVR
9375       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
9376                          cm->cur_frame_mv_precision_level);
9377 #else
9378       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
9379 #endif
9380       clamp_mv2(&cur_mv[1].as_mv, xd);
9381       if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
9382       mbmi->mv[1].as_int = cur_mv[1].as_int;
9383     }
9384   }
9385 
  // Do the first prediction into the destination buffer. Do the next
  // prediction into a temporary buffer. Then keep track of which one
  // of these currently holds the best predictor, and use the other
  // one for future predictions. In the end, copy from tmp_buf to
  // dst if necessary.
9391   for (i = 0; i < MAX_MB_PLANE; i++) {
9392     tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
9393     tmp_dst.stride[i] = MAX_SB_SIZE;
9394   }
9395   for (i = 0; i < MAX_MB_PLANE; i++) {
9396     orig_dst.plane[i] = xd->plane[i].dst.buf;
9397     orig_dst.stride[i] = xd->plane[i].dst.stride;
9398   }
9399 
  // We don't include the cost of the second reference here, because there
  // are only three options (Last/Golden, ARF/Last, or Golden/ARF): presented
  // in that order, the second reference is always known once the first is.
9404   //
9405   // Under some circumstances we discount the cost of new mv mode to encourage
9406   // initiation of a motion field.
9407   if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
9408                           refs[0])) {
9409     rd_stats->rate += AOMMIN(
9410         cost_mv_ref(x, this_mode, mode_ctx),
9411         cost_mv_ref(x, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, mode_ctx));
9412   } else {
9413     rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx);
9414   }
9415 
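  // Early out: if the rate cost alone already exceeds the best RD so far,
  // the total cost cannot improve on it. NEAREST* modes are exempt,
  // presumably so that a fallback mode is always fully evaluated.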
9416   if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
9417       mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV)
9418     return INT64_MAX;
9419 
9420   int64_t ret_val = interpolation_filter_search(
9421       x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
9422       &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
9423   if (ret_val != 0) return ret_val;
9424 
9425 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9426   best_bmc_mbmi = *mbmi;
9427   rate2_bmc_nocoeff = rd_stats->rate;
9428   if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
9429   rate_mv_bmc = rate_mv;
9430 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9431 
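  // Search the available inter-inter compound types (average, plus the
  // masked wedge/segment types where allowed) and keep the cheapest one in
  // RD terms.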
9432 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
9433 #if CONFIG_COMPOUND_SINGLEREF
9434   if (is_comp_pred || is_singleref_comp_mode)
9435 #else
9436   if (is_comp_pred)
9437 #endif  // CONFIG_COMPOUND_SINGLEREF
9438   {
9439     int rate_sum, rs2;
9440     int64_t dist_sum;
9441     int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
9442     INTERINTER_COMPOUND_DATA best_compound_data;
9443     int_mv best_mv[2];
9444     int best_tmp_rate_mv = rate_mv;
9445     int tmp_skip_txfm_sb;
9446     int64_t tmp_skip_sse_sb;
9447     DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
9448     DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
9449     uint8_t *preds0[1] = { pred0 };
9450     uint8_t *preds1[1] = { pred1 };
9451     int strides[1] = { bw };
9452     int tmp_rate_mv;
9453     int masked_compound_used = is_any_masked_compound_used(bsize);
9454 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
9455     masked_compound_used = masked_compound_used && cm->allow_masked_compound;
9456 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
9457     COMPOUND_TYPE cur_type;
9458     int best_compmode_interinter_cost = 0;
9459 
9460     best_mv[0].as_int = cur_mv[0].as_int;
9461     best_mv[1].as_int = cur_mv[1].as_int;
9462     memset(&best_compound_data, 0, sizeof(best_compound_data));
9463 #if CONFIG_COMPOUND_SEGMENT
9464     uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE];
9465     best_compound_data.seg_mask = tmp_mask_buf;
9466 #endif  // CONFIG_COMPOUND_SEGMENT
9467 
9468 #if CONFIG_COMPOUND_SINGLEREF
    // TODO(zoeliu): Further check whether the following setup is needed.
    // Single-ref compound mode: prepare the second ref frame predictor the
    // same way as the first one.
9472     if (!is_comp_pred && is_singleref_comp_mode) {
9473       xd->block_refs[1] = xd->block_refs[0];
9474       for (i = 0; i < MAX_MB_PLANE; i++)
9475         xd->plane[i].pre[1] = xd->plane[i].pre[0];
9476     }
9477 #endif  // CONFIG_COMPOUND_SINGLEREF
9478 
9479     if (masked_compound_used) {
9480       // get inter predictors to use for masked compound modes
9481       av1_build_inter_predictors_for_planes_single_buf(
9482           xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
9483       av1_build_inter_predictors_for_planes_single_buf(
9484           xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
9485     }
9486 
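    // Try each compound type in turn. Masked types are skipped when masked
    // compounds are disabled or unusable for this block size.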
9487     for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
9488       if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
9489       if (!is_interinter_compound_used(cur_type, bsize)) continue;
9490       tmp_rate_mv = rate_mv;
9491       best_rd_cur = INT64_MAX;
9492       mbmi->interinter_compound_type = cur_type;
9493       int masked_type_cost = 0;
9494       if (masked_compound_used) {
9495 #if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
9496         if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
9497           masked_type_cost += av1_cost_literal(1);
9498         else
9499 #endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
9500           masked_type_cost +=
9501               x->compound_type_cost[bsize][mbmi->interinter_compound_type];
9502       }
9503       rs2 = av1_cost_literal(get_interinter_compound_type_bits(
9504                 bsize, mbmi->interinter_compound_type)) +
9505             masked_type_cost;
9506 
9507       switch (cur_type) {
9508         case COMPOUND_AVERAGE:
9509           av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
9510                                          bsize);
9511           av1_subtract_plane(x, bsize, 0);
9512           rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
9513                                    &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
9514                                    INT64_MAX);
9515           if (rd != INT64_MAX)
9516             best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
9517           best_rd_compound = best_rd_cur;
9518           break;
9519 #if CONFIG_WEDGE
9520         case COMPOUND_WEDGE:
9521           if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
9522               best_rd_compound / 3 < ref_best_rd) {
9523             best_rd_cur = build_and_cost_compound_type(
9524                 cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
9525                 &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
9526           }
9527           break;
9528 #endif  // CONFIG_WEDGE
9529 #if CONFIG_COMPOUND_SEGMENT
9530         case COMPOUND_SEG:
9531           if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
9532               best_rd_compound / 3 < ref_best_rd) {
9533             best_rd_cur = build_and_cost_compound_type(
9534                 cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
9535                 &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
9536           }
9537           break;
9538 #endif  // CONFIG_COMPOUND_SEGMENT
9539         default: assert(0); return 0;
9540       }
9541 
9542       if (best_rd_cur < best_rd_compound) {
9543         best_rd_compound = best_rd_cur;
9544 #if CONFIG_WEDGE
9545         best_compound_data.wedge_index = mbmi->wedge_index;
9546         best_compound_data.wedge_sign = mbmi->wedge_sign;
9547 #endif  // CONFIG_WEDGE
9548 #if CONFIG_COMPOUND_SEGMENT
9549         best_compound_data.mask_type = mbmi->mask_type;
9550         memcpy(best_compound_data.seg_mask, xd->seg_mask,
9551                2 * MAX_SB_SQUARE * sizeof(uint8_t));
9552 #endif  // CONFIG_COMPOUND_SEGMENT
9553         best_compound_data.interinter_compound_type =
9554             mbmi->interinter_compound_type;
9555         best_compmode_interinter_cost = rs2;
9556         if (have_newmv_in_inter_mode(this_mode)) {
9557           if (use_masked_motion_search(cur_type)) {
9558             best_tmp_rate_mv = tmp_rate_mv;
9559             best_mv[0].as_int = mbmi->mv[0].as_int;
9560             best_mv[1].as_int = mbmi->mv[1].as_int;
9561           } else {
9562             best_mv[0].as_int = cur_mv[0].as_int;
9563             best_mv[1].as_int = cur_mv[1].as_int;
9564           }
9565         }
9566       }
9567       // reset to original mvs for next iteration
9568       mbmi->mv[0].as_int = cur_mv[0].as_int;
9569       mbmi->mv[1].as_int = cur_mv[1].as_int;
9570     }
9571 #if CONFIG_WEDGE
9572     mbmi->wedge_index = best_compound_data.wedge_index;
9573     mbmi->wedge_sign = best_compound_data.wedge_sign;
9574 #endif  // CONFIG_WEDGE
9575 #if CONFIG_COMPOUND_SEGMENT
9576     mbmi->mask_type = best_compound_data.mask_type;
9577     memcpy(xd->seg_mask, best_compound_data.seg_mask,
9578            2 * MAX_SB_SQUARE * sizeof(uint8_t));
9579 #endif  // CONFIG_COMPOUND_SEGMENT
9580     mbmi->interinter_compound_type =
9581         best_compound_data.interinter_compound_type;
9582     if (have_newmv_in_inter_mode(this_mode)) {
9583       mbmi->mv[0].as_int = best_mv[0].as_int;
9584       mbmi->mv[1].as_int = best_mv[1].as_int;
9585       xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
9586       xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
9587       if (use_masked_motion_search(mbmi->interinter_compound_type)) {
9588         rd_stats->rate += best_tmp_rate_mv - rate_mv;
9589         rate_mv = best_tmp_rate_mv;
9590       }
9591     }
9592 
9593     if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
9594       restore_dst_buf(xd, orig_dst);
9595       return INT64_MAX;
9596     }
9597 
9598     pred_exists = 0;
9599 
9600     compmode_interinter_cost = best_compmode_interinter_cost;
9601   }
9602 #endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
9603 
9604 #if CONFIG_INTERINTRA
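  // Inter-intra: blend the inter predictor with each intra mode's predictor
  // and keep the best combination, optionally refining it with a wedge mask.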
9605   if (is_comp_interintra_pred) {
9606     INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
9607     int64_t best_interintra_rd = INT64_MAX;
9608     int rmode, rate_sum;
9609     int64_t dist_sum;
9610     int j;
9611     int tmp_rate_mv = 0;
9612     int tmp_skip_txfm_sb;
9613     int64_t tmp_skip_sse_sb;
9614     DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
9615     uint8_t *intrapred;
9616 
9617 #if CONFIG_HIGHBITDEPTH
9618     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
9619       intrapred = CONVERT_TO_BYTEPTR(intrapred_);
9620     else
9621 #endif  // CONFIG_HIGHBITDEPTH
9622       intrapred = intrapred_;
9623 
9624     mbmi->ref_frame[1] = NONE_FRAME;
9625     for (j = 0; j < MAX_MB_PLANE; j++) {
9626       xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
9627       xd->plane[j].dst.stride = bw;
9628     }
9629     av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize);
9630     restore_dst_buf(xd, orig_dst);
9631     mbmi->ref_frame[1] = INTRA_FRAME;
9632     mbmi->use_wedge_interintra = 0;
9633 
9634     for (j = 0; j < INTERINTRA_MODES; ++j) {
9635       mbmi->interintra_mode = (INTERINTRA_MODE)j;
9636       rmode = interintra_mode_cost[mbmi->interintra_mode];
9637       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
9638                                                 intrapred, bw);
9639       av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
9640       model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
9641                       &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
9642       rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
9643       if (rd < best_interintra_rd) {
9644         best_interintra_rd = rd;
9645         best_interintra_mode = mbmi->interintra_mode;
9646       }
9647     }
9648     mbmi->interintra_mode = best_interintra_mode;
9649     rmode = interintra_mode_cost[mbmi->interintra_mode];
9650     av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
9651                                               intrapred, bw);
9652     av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
9653     av1_subtract_plane(x, bsize, 0);
9654     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
9655                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
9656     if (rd != INT64_MAX)
9657       rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
9658     best_interintra_rd = rd;
9659 
9660     if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
      // No need to call restore_dst_buf here: the dst buffers were already
      // restored to orig_dst above.
9662       return INT64_MAX;
9663     }
9664 #if CONFIG_WEDGE
9665     if (is_interintra_wedge_used(bsize)) {
9666       int64_t best_interintra_rd_nowedge = INT64_MAX;
9667       int64_t best_interintra_rd_wedge = INT64_MAX;
9668       int_mv tmp_mv;
9669       int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
9670       if (rd != INT64_MAX)
9671         rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum);
9672       best_interintra_rd_nowedge = best_interintra_rd;
9673 
9674       // Disable wedge search if source variance is small
9675       if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
9676         mbmi->use_wedge_interintra = 1;
9677 
9678         rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
9679                  av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
9680 
9681         best_interintra_rd_wedge =
9682             pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
9683 
9684         best_interintra_rd_wedge +=
9685             RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
9686         // Refine motion vector.
9687         if (have_newmv_in_inter_mode(this_mode)) {
          // Get the inverted (negative) mask.
9689           const uint8_t *mask = av1_get_contiguous_soft_mask(
9690               mbmi->interintra_wedge_index, 1, bsize);
9691           tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int;
9692           compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
9693                                         mi_col, intrapred, mask, bw,
9694                                         &tmp_rate_mv, 0);
9695           mbmi->mv[0].as_int = tmp_mv.as_int;
9696           av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
9697                                          bsize);
9698           model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
9699                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
9700           rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
9701                       dist_sum);
9702           if (rd >= best_interintra_rd_wedge) {
9703             tmp_mv.as_int = cur_mv[0].as_int;
9704             tmp_rate_mv = rate_mv;
9705           }
9706         } else {
9707           tmp_mv.as_int = cur_mv[0].as_int;
9708           tmp_rate_mv = rate_mv;
9709           av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
9710         }
        // Evaluate with a cost estimate closer to the true RD.
9712         av1_subtract_plane(x, bsize, 0);
9713         rd =
9714             estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
9715                                 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
9716         if (rd != INT64_MAX)
9717           rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
9718                       dist_sum);
9719         best_interintra_rd_wedge = rd;
9720         if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
9721           mbmi->use_wedge_interintra = 1;
9722           mbmi->mv[0].as_int = tmp_mv.as_int;
9723           rd_stats->rate += tmp_rate_mv - rate_mv;
9724           rate_mv = tmp_rate_mv;
9725         } else {
9726           mbmi->use_wedge_interintra = 0;
9727           mbmi->mv[0].as_int = cur_mv[0].as_int;
9728         }
9729       } else {
9730         mbmi->use_wedge_interintra = 0;
9731       }
9732     }
9733 #endif  // CONFIG_WEDGE
9734 
9735     pred_exists = 0;
9736     compmode_interintra_cost =
9737         av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) +
9738         interintra_mode_cost[mbmi->interintra_mode];
9739     if (is_interintra_wedge_used(bsize)) {
9740       compmode_interintra_cost += av1_cost_bit(
9741           cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
9742       if (mbmi->use_wedge_interintra) {
9743         compmode_interintra_cost +=
9744             av1_cost_literal(get_interintra_wedge_bits(bsize));
9745       }
9746     }
9747   } else if (is_interintra_allowed(mbmi)) {
9748     compmode_interintra_cost =
9749         av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
9750   }
9751 #endif  // CONFIG_INTERINTRA
9752 
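  // If the searches above invalidated the prediction (pred_exists == 0),
  // rebuild it and re-model the RD cost for the whole block.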
9753   if (pred_exists == 0) {
9754     int tmp_rate;
9755     int64_t tmp_dist;
9756     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
9757     model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
9758                     &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
9759     rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
9760   }
9761 
9762   if (!is_comp_pred)
9763     args->single_filter[this_mode][refs[0]] =
9764         av1_extract_interp_filter(mbmi->interp_filters, 0);
9765 
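  // For compound modes, prune using the modelled RD of the constituent
  // single-reference modes: if this mode's modelled RD is more than 4/3 of
  // the better component's, give up (e.g. rd = 4000, mrd = 2900:
  // 4000 / 4 * 3 = 3000 > 2900, so the mode is pruned).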
9766   if (args->modelled_rd != NULL) {
9767     if (is_comp_pred) {
9768       const int mode0 = compound_ref0_mode(this_mode);
9769       const int mode1 = compound_ref1_mode(this_mode);
9770       const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
9771                                  args->modelled_rd[mode1][refs[1]]);
9772       if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
9773         restore_dst_buf(xd, orig_dst);
9774         return INT64_MAX;
9775       }
9776     } else if (!is_comp_interintra_pred) {
9777       args->modelled_rd[this_mode][refs[0]] = rd;
9778     }
9779   }
9780 
9781   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
    // If the modeled RD based on prediction error is substantially higher
    // than the best so far, do not bother doing a full RD search.
9784     if (rd / 2 > ref_best_rd) {
9785       restore_dst_buf(xd, orig_dst);
9786       return INT64_MAX;
9787     }
9788   }
9789 
9790 #if CONFIG_INTERINTRA
9791   rd_stats->rate += compmode_interintra_cost;
9792 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9793   rate2_bmc_nocoeff += compmode_interintra_cost;
9794 #endif
9795 #endif
9796 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
9797   rd_stats->rate += compmode_interinter_cost;
9798 #endif
9799 
9800   ret_val = motion_mode_rd(
9801       cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv,
9802       mi_row, mi_col, args, ref_best_rd, refs, rate_mv,
9803 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9804       single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc,
9805 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9806       rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
9807   if (ret_val != 0) return ret_val;
9808 
  return 0;  // The rate-distortion cost will be re-calculated by the caller.
9810 }
9811 
9812 #if CONFIG_INTRABC
9813 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
9814                                        RD_STATS *rd_cost, BLOCK_SIZE bsize,
9815                                        int64_t best_rd) {
9816   const AV1_COMMON *const cm = &cpi->common;
9817   if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX;
9818 
9819   MACROBLOCKD *const xd = &x->e_mbd;
9820   const TileInfo *tile = &xd->tile;
9821   MODE_INFO *const mi = xd->mi[0];
9822   const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
9823   const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
9824   const int w = block_size_wide[bsize];
9825   const int h = block_size_high[bsize];
9826   const int sb_row = mi_row / MAX_MIB_SIZE;
9827   const int sb_col = mi_col / MAX_MIB_SIZE;
9828 
9829   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
9830   MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
9831   int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
9832   av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
9833                    mbmi_ext->ref_mv_stack[ref_frame],
9834                    mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
9835                    NULL, NULL, mbmi_ext->mode_context);
9836 
9837   int_mv nearestmv, nearmv;
9838   av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv);
9839 
9840   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
9841   if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col);
9842   mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref;
9843 
9844   struct buf_2d yv12_mb[MAX_MB_PLANE];
9845   av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL);
9846   for (int i = 0; i < MAX_MB_PLANE; ++i) {
9847     xd->plane[i].pre[0] = yv12_mb[i];
9848   }
9849 
9850   enum IntrabcMotionDirection {
9851     IBC_MOTION_ABOVE,
9852     IBC_MOTION_LEFT,
9853     IBC_MOTION_DIRECTIONS
9854   };
9855 
9856   MB_MODE_INFO *mbmi = &mi->mbmi;
9857   MB_MODE_INFO best_mbmi = *mbmi;
9858   RD_STATS best_rdcost = *rd_cost;
9859   int best_skip = x->skip;
9860 
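  // Search for a displacement vector (DV) over two regions of already-coded
  // pixels: the superblock rows above, then the already-coded area to the
  // left of the current superblock row.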
9861   for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
9862        dir < IBC_MOTION_DIRECTIONS; ++dir) {
9863     const MvLimits tmp_mv_limits = x->mv_limits;
9864     switch (dir) {
9865       case IBC_MOTION_ABOVE:
9866         x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
9867         x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
9868         x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
9869         x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h;
9870         break;
9871       case IBC_MOTION_LEFT:
9872         x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
9873         x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w;
9874         // TODO(aconverse@google.com): Minimize the overlap between above and
9875         // left areas.
9876         x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
9877         int bottom_coded_mi_edge =
9878             AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end);
9879         x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
9880         break;
9881       default: assert(0);
9882     }
9883     assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
9884     assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
9885     assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
9886     assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
9887     av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
9888 
9889     if (x->mv_limits.col_max < x->mv_limits.col_min ||
9890         x->mv_limits.row_max < x->mv_limits.row_min) {
9891       x->mv_limits = tmp_mv_limits;
9892       continue;
9893     }
9894 
9895     int step_param = cpi->mv_step_param;
9896     MV mvp_full = dv_ref.as_mv;
9897     mvp_full.col >>= 3;
9898     mvp_full.row >>= 3;
9899     int sadpb = x->sadperbit16;
9900     int cost_list[5];
9901 #if CONFIG_HASH_ME
9902     int bestsme = av1_full_pixel_search(
9903         cpi, x, bsize, &mvp_full, step_param, sadpb,
9904         cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
9905         (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
9906 #else
9907     int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
9908                                         sadpb, cond_cost_list(cpi, cost_list),
9909                                         &dv_ref.as_mv, INT_MAX, 1);
9910 #endif
9911 
9912     x->mv_limits = tmp_mv_limits;
9913     if (bestsme == INT_MAX) continue;
9914     mvp_full = x->best_mv.as_mv;
    MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
9916     if (mv_check_bounds(&x->mv_limits, &dv)) continue;
9917     if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue;
9918 
9919     memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
9920     mbmi->use_intrabc = 1;
9921     mbmi->mode = DC_PRED;
9922     mbmi->uv_mode = UV_DC_PRED;
9923     mbmi->mv[0].as_mv = dv;
9924     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
9925     mbmi->skip = 0;
9926     x->skip = 0;
9927     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
9928 
9929     assert(x->mvcost == x->mv_cost_stack[0]);
9930     // TODO(aconverse@google.com): The full motion field defining discount
9931     // in MV_COST_WEIGHT is too large. Explore other values.
9932     int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost,
9933                                   x->mvcost, MV_COST_WEIGHT_SUB);
9934     const int rate_mode = x->intrabc_cost[1];
9935     RD_STATS rd_stats, rd_stats_uv;
9936     av1_subtract_plane(x, bsize, 0);
9937     super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
9938     super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
9939     av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
9940 #if CONFIG_RD_DEBUG
9941     mbmi->rd_stats = rd_stats;
9942 #endif
9943 
9944 #if CONFIG_VAR_TX
9945     // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks
9946     const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
9947     const int height = block_size_high[bsize] >> tx_size_high_log2[0];
9948     int idx, idy;
9949     for (idy = 0; idy < height; ++idy)
9950       for (idx = 0; idx < width; ++idx)
9951         mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
9952     mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
9953 #endif  // CONFIG_VAR_TX
9954 
9955     const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
9956 
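    // Evaluate both the no-skip and skip variants of this DV and keep
    // whichever is cheaper in RD terms.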
9957     RD_STATS rdc_noskip;
9958     av1_init_rd_stats(&rdc_noskip);
9959     rdc_noskip.rate =
9960         rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0);
9961     rdc_noskip.dist = rd_stats.dist;
9962     rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
9963     if (rdc_noskip.rdcost < best_rd) {
9964       best_rd = rdc_noskip.rdcost;
9965       best_mbmi = *mbmi;
9966       best_skip = x->skip;
9967       best_rdcost = rdc_noskip;
9968     }
9969 
9970     x->skip = 1;
9971     mbmi->skip = 1;
9972     RD_STATS rdc_skip;
9973     av1_init_rd_stats(&rdc_skip);
9974     rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1);
9975     rdc_skip.dist = rd_stats.sse;
9976     rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
9977     if (rdc_skip.rdcost < best_rd) {
9978       best_rd = rdc_skip.rdcost;
9979       best_mbmi = *mbmi;
9980       best_skip = x->skip;
9981       best_rdcost = rdc_skip;
9982     }
9983   }
9984   *mbmi = best_mbmi;
9985   *rd_cost = best_rdcost;
9986   x->skip = best_skip;
9987   return best_rd;
9988 }
9989 #endif  // CONFIG_INTRABC
9990 
9991 void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
9992                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
9993                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
9994   const AV1_COMMON *const cm = &cpi->common;
9995   MACROBLOCKD *const xd = &x->e_mbd;
9996   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
9997   struct macroblockd_plane *const pd = xd->plane;
9998   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
9999   int y_skip = 0, uv_skip = 0;
10000   int64_t dist_y = 0, dist_uv = 0;
10001   TX_SIZE max_uv_tx_size;
10002   const int unify_bsize = CONFIG_CB4X4;
10003 
10004   ctx->skip = 0;
10005   mbmi->ref_frame[0] = INTRA_FRAME;
10006   mbmi->ref_frame[1] = NONE_FRAME;
10007 #if CONFIG_INTRABC
10008   mbmi->use_intrabc = 0;
10009   mbmi->mv[0].as_int = 0;
10010 #endif  // CONFIG_INTRABC
10011 #if CONFIG_LGT_FROM_PRED
10012   mbmi->use_lgt = 0;
10013 #endif
10014 
10015   const int64_t intra_yrd =
10016       (bsize >= BLOCK_8X8 || unify_bsize)
10017           ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
10018                                    &y_skip, bsize, best_rd)
10019           : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
10020                                          &dist_y, &y_skip, best_rd);
10021 
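  // Chroma (and any CfL bookkeeping) is only searched when the luma search
  // beat the best RD so far.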
10022   if (intra_yrd < best_rd) {
10023 #if CONFIG_CFL
10024 #if CONFIG_CB4X4
10025     // Only store reconstructed luma when there's chroma RDO. When there's no
10026     // chroma RDO, the reconstructed luma will be stored in encode_superblock().
10027     xd->cfl->store_y = !x->skip_chroma_rd;
10028 #else
10029     xd->cfl->store_y = 1;
10030 #endif  // CONFIG_CB4X4
10031     if (xd->cfl->store_y) {
10032       // Perform one extra call to txfm_rd_in_plane(), with the values chosen
10033       // during luma RDO, so we can store reconstructed luma values
10034       RD_STATS this_rd_stats;
10035       txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
10036                        mbmi->sb_type, mbmi->tx_size,
10037                        cpi->sf.use_fast_coef_costing);
10038       xd->cfl->store_y = 0;
10039     }
10040 #endif  // CONFIG_CFL
10041     max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x]
10042                                      [pd[1].subsampling_y];
10043     init_sbuv_mode(mbmi);
10044 #if CONFIG_CB4X4
10045     if (!x->skip_chroma_rd)
10046       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
10047                               &uv_skip, bsize, max_uv_tx_size);
10048 #else
10049     rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
10050                             &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
10051 #endif  // CONFIG_CB4X4
10052 
10053     if (y_skip && (uv_skip || x->skip_chroma_rd)) {
10054       rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
10055                       av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
10056       rd_cost->dist = dist_y + dist_uv;
10057     } else {
10058       rd_cost->rate =
10059           rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
10060       rd_cost->dist = dist_y + dist_uv;
10061     }
10062     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
10063   } else {
10064     rd_cost->rate = INT_MAX;
10065   }
10066 
10067 #if CONFIG_INTRABC
10068   if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
10069     best_rd = rd_cost->rdcost;
10070   if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
    ctx->skip = x->skip;  // FIXME: where is the proper place to set this?
10072     assert(rd_cost->rate != INT_MAX);
10073     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
10074   }
10075 #endif
10076   if (rd_cost->rate == INT_MAX) return;
10077 
10078   ctx->mic = *xd->mi[0];
10079   ctx->mbmi_ext = *x->mbmi_ext;
10080 }
10081 
// Checks to see if there is an internal image edge (e.g. formatting bars).
10083 int av1_internal_image_edge(const AV1_COMP *cpi) {
10084   return (cpi->oxcf.pass == 2) &&
10085          ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
10086           (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
10087 }
10088 
10089 // Checks to see if a super block is on a horizontal image edge.
10090 // In most cases this is the "real" edge unless there are formatting
10091 // bars embedded in the stream.
10092 int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
10093   int top_edge = 0;
10094   int bottom_edge = cpi->common.mi_rows;
10095   int is_active_h_edge = 0;
10096 
10097   // For two pass account for any formatting bars detected.
10098   if (cpi->oxcf.pass == 2) {
10099     const TWO_PASS *const twopass = &cpi->twopass;
10100 
    // The inactive region is specified in MBs, not mi units.
    // The image edge is in the following MB row.
10103     top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
10104 
10105     bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
10106     bottom_edge = AOMMAX(top_edge, bottom_edge);
10107   }
10108 
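  // The superblock straddles an active horizontal edge if either detected
  // edge row falls inside [mi_row, mi_row + mi_step).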
10109   if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
10110       ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
10111     is_active_h_edge = 1;
10112   }
10113   return is_active_h_edge;
10114 }
10115 
10116 // Checks to see if a super block is on a vertical image edge.
10117 // In most cases this is the "real" edge unless there are formatting
10118 // bars embedded in the stream.
10119 int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
10120   int left_edge = 0;
10121   int right_edge = cpi->common.mi_cols;
10122   int is_active_v_edge = 0;
10123 
10124   // For two pass account for any formatting bars detected.
10125   if (cpi->oxcf.pass == 2) {
10126     const TWO_PASS *const twopass = &cpi->twopass;
10127 
    // The inactive region is specified in MBs, not mi units.
    // The image edge is in the following MB column.
10130     left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
10131 
10132     right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
10133     right_edge = AOMMAX(left_edge, right_edge);
10134   }
10135 
10136   if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
10137       ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
10138     is_active_v_edge = 1;
10139   }
10140   return is_active_v_edge;
10141 }
10142 
10143 // Checks to see if a super block is at the edge of the active image.
10144 // In most cases this is the "real" edge unless there are formatting
10145 // bars embedded in the stream.
10146 int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
10147   return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
10148          av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
10149 }
10150 
10151 static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
10152   MACROBLOCKD *const xd = &x->e_mbd;
10153   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
10154   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
10155   const BLOCK_SIZE bsize = mbmi->sb_type;
10156   assert(bsize >= BLOCK_8X8);
10157   int src_stride = x->plane[1].src.stride;
10158   const uint8_t *const src_u = x->plane[1].src.buf;
10159   const uint8_t *const src_v = x->plane[2].src.buf;
10160   float *const data = x->palette_buffer->kmeans_data_buf;
10161   float centroids[2 * PALETTE_MAX_SIZE];
10162   uint8_t *const color_map = xd->plane[1].color_index_map;
10163   int r, c;
10164 #if CONFIG_HIGHBITDEPTH
10165   const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
10166   const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
10167 #endif  // CONFIG_HIGHBITDEPTH
10168   int plane_block_width, plane_block_height, rows, cols;
10169   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
10170                            &plane_block_height, &rows, &cols);
10171   (void)cpi;
10172 
10173   for (r = 0; r < rows; ++r) {
10174     for (c = 0; c < cols; ++c) {
10175 #if CONFIG_HIGHBITDEPTH
10176       if (cpi->common.use_highbitdepth) {
10177         data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
10178         data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
10179       } else {
10180 #endif  // CONFIG_HIGHBITDEPTH
10181         data[(r * cols + c) * 2] = src_u[r * src_stride + c];
10182         data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
10183 #if CONFIG_HIGHBITDEPTH
10184       }
10185 #endif  // CONFIG_HIGHBITDEPTH
10186     }
10187   }
10188 
10189   for (r = 1; r < 3; ++r) {
10190     for (c = 0; c < pmi->palette_size[1]; ++c) {
10191       centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
10192     }
10193   }
10194 
10195   av1_calc_indices(data, centroids, color_map, rows * cols,
10196                    pmi->palette_size[1], 2);
10197   extend_palette_color_map(color_map, cols, rows, plane_block_width,
10198                            plane_block_height);
10199 }
10200 
10201 #if CONFIG_FILTER_INTRA
10202 static void pick_filter_intra_interframe(
10203     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
10204     int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv,
10205     int *skip_uv, UV_PREDICTION_MODE *mode_uv,
10206     FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
10207 #if CONFIG_EXT_INTRA
10208     int8_t *uv_angle_delta,
10209 #endif  // CONFIG_EXT_INTRA
10210     PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask,
10211     unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd,
10212     PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2,
10213     int *best_mode_skippable,
10214 #if CONFIG_SUPERTX
10215     int *returnrate_nocoef,
10216 #endif  // CONFIG_SUPERTX
10217     int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) {
10218   const AV1_COMMON *const cm = &cpi->common;
10219   MACROBLOCKD *const xd = &x->e_mbd;
10220   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
10221   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
10222   const int try_palette =
10223       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
10224   int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
10225   int dc_mode_index;
10226   const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
10227   int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
10228   int64_t distortion_uv, model_rd = INT64_MAX;
10229   TX_SIZE uv_tx;
10230 
10231   for (i = 0; i < MAX_MODES; ++i)
10232     if (av1_mode_order[i].mode == DC_PRED &&
10233         av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
10234       break;
10235   dc_mode_index = i;
10236   assert(i < MAX_MODES);
10237 
10238   // TODO(huisu): use skip_mask for further speedup.
10239   (void)skip_mask;
10240   mbmi->mode = DC_PRED;
10241   mbmi->uv_mode = UV_DC_PRED;
10242   mbmi->ref_frame[0] = INTRA_FRAME;
10243   mbmi->ref_frame[1] = NONE_FRAME;
10244   if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
10245                                 &skippable, bsize, intra_mode_cost[mbmi->mode],
10246                                 &this_rd, &model_rd, 0)) {
10247     return;
10248   }
10249   if (rate_y == INT_MAX) return;
10250 
10251   uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
10252                           [xd->plane[1].subsampling_y];
10253   if (rate_uv_intra[uv_tx] == INT_MAX) {
10254     choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
10255                          &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
10256                          &skip_uv[uv_tx], &mode_uv[uv_tx]);
10257     if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
10258     filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
10259 #if CONFIG_EXT_INTRA
10260     uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
10261 #endif  // CONFIG_EXT_INTRA
10262   }
10263 
10264   rate_uv = rate_uv_tokenonly[uv_tx];
10265   distortion_uv = dist_uv[uv_tx];
10266   skippable = skippable && skip_uv[uv_tx];
10267   mbmi->uv_mode = mode_uv[uv_tx];
10268   if (cm->allow_screen_content_tools) {
10269     pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
10270     memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
10271            pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
10272            2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
10273   }
10274 #if CONFIG_EXT_INTRA
10275   mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
10276 #endif  // CONFIG_EXT_INTRA
10277   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
10278       filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
10279   if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
10280     mbmi->filter_intra_mode_info.filter_intra_mode[1] =
10281         filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
10282   }
10283 
10284   rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
10285           x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
10286   if (try_palette && mbmi->mode == DC_PRED)
10287     rate2 += av1_cost_bit(
10288         av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
10289 
10290   if (!xd->lossless[mbmi->segment_id]) {
10291     // super_block_yrd above includes the cost of the tx_size in the
10292     // tokenonly rate, but for intra blocks, tx_size is always coded
10293     // (prediction granularity), so we account for it in the full rate,
10294     // not the tokenonly rate.
10295     rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
10296   }
10297 
10298   rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
10299                         mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
10300   rate2 += write_uniform_cost(
10301       FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
10302 #if CONFIG_EXT_INTRA
10303   if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
10304       av1_use_angle_delta(bsize)) {
10305     rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
10306                                 MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
10307   }
10308 #endif  // CONFIG_EXT_INTRA
10309   if (mbmi->mode == DC_PRED) {
10310     rate2 +=
10311         av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
10312                      mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
10313     if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
10314       rate2 +=
10315           write_uniform_cost(FILTER_INTRA_MODES,
10316                              mbmi->filter_intra_mode_info.filter_intra_mode[1]);
10317   }
10318   distortion2 = distortion_y + distortion_uv;
10319   av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row,
10320                                mi_col);
10321 
10322   rate2 += ref_costs_single[INTRA_FRAME];
10323 
10324   if (skippable) {
10325     rate2 -= (rate_y + rate_uv);
10326     rate_y = 0;
10327     rate_uv = 0;
10328     rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
10329   } else {
10330     rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
10331   }
10332   this_rd = RDCOST(x->rdmult, rate2, distortion2);
10333 
10334   if (this_rd < *best_intra_rd) {
10335     *best_intra_rd = this_rd;
10336     *best_intra_mode = mbmi->mode;
10337   }
10338   for (i = 0; i < REFERENCE_MODES; ++i)
10339     best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
10340 
10341   if (this_rd < *best_rd) {
10342     *best_mode_index = dc_mode_index;
10343     mbmi->mv[0].as_int = 0;
10344     rd_cost->rate = rate2;
10345 #if CONFIG_SUPERTX
10346     if (x->skip)
10347       *returnrate_nocoef = rate2;
10348     else
10349       *returnrate_nocoef = rate2 - rate_y - rate_uv;
10350     *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
10351     *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
10352                                        mbmi->ref_frame[0] != INTRA_FRAME);
10353 #endif  // CONFIG_SUPERTX
10354     rd_cost->dist = distortion2;
10355     rd_cost->rdcost = this_rd;
10356     *best_rd = this_rd;
10357     *best_mbmode = *mbmi;
10358     *best_skip2 = 0;
10359     *best_mode_skippable = skippable;
10360   }
10361 }
10362 #endif  // CONFIG_FILTER_INTRA
10363 
10364 #if CONFIG_MOTION_VAR
10365 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
10366                                       const MACROBLOCKD *xd, int mi_row,
10367                                       int mi_col, const uint8_t *above,
10368                                       int above_stride, const uint8_t *left,
10369                                       int left_stride);
10370 #endif  // CONFIG_MOTION_VAR
10371 
10372 void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
10373                                MACROBLOCK *x, int mi_row, int mi_col,
10374                                RD_STATS *rd_cost,
10375 #if CONFIG_SUPERTX
10376                                int *returnrate_nocoef,
10377 #endif  // CONFIG_SUPERTX
10378                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
10379                                int64_t best_rd_so_far) {
10380   const AV1_COMMON *const cm = &cpi->common;
10381   const RD_OPT *const rd_opt = &cpi->rd;
10382   const SPEED_FEATURES *const sf = &cpi->sf;
10383   MACROBLOCKD *const xd = &x->e_mbd;
10384   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
10385   const int try_palette =
10386       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
10387   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
10388   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
10389   const struct segmentation *const seg = &cm->seg;
10390   PREDICTION_MODE this_mode;
10391   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
10392   unsigned char segment_id = mbmi->segment_id;
10393   int comp_pred, i, k;
10394   int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
10395 #if CONFIG_COMPOUND_SINGLEREF
10396   int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
10397 #endif  // CONFIG_COMPOUND_SINGLEREF
10398   struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
10399   int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
10400   int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
10401   int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
10402   static const int flag_list[TOTAL_REFS_PER_FRAME] = {
10403     0,
10404     AOM_LAST_FLAG,
10405 #if CONFIG_EXT_REFS
10406     AOM_LAST2_FLAG,
10407     AOM_LAST3_FLAG,
10408 #endif  // CONFIG_EXT_REFS
10409     AOM_GOLD_FLAG,
10410 #if CONFIG_EXT_REFS
10411     AOM_BWD_FLAG,
10412     AOM_ALT2_FLAG,
10413 #endif  // CONFIG_EXT_REFS
10414     AOM_ALT_FLAG
10415   };
10416   int64_t best_rd = best_rd_so_far;
10417   int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
10418   int64_t best_pred_diff[REFERENCE_MODES];
10419   int64_t best_pred_rd[REFERENCE_MODES];
10420   MB_MODE_INFO best_mbmode;
10421   int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
10422   int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
10423   int best_mode_skippable = 0;
10424   int midx, best_mode_index = -1;
10425   unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
10426 #if CONFIG_EXT_COMP_REFS
10427   unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME];
10428 #else
10429   unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
10430 #endif  // CONFIG_EXT_COMP_REFS
10431   aom_prob comp_mode_p;
10432   int64_t best_intra_rd = INT64_MAX;
10433   unsigned int best_pred_sse = UINT_MAX;
10434   PREDICTION_MODE best_intra_mode = DC_PRED;
10435   int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL];
10436   int64_t dist_uvs[TX_SIZES_ALL];
10437   int skip_uvs[TX_SIZES_ALL];
10438   UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
10439   PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
10440 #if CONFIG_EXT_INTRA
10441   int8_t uv_angle_delta[TX_SIZES_ALL];
10442   int is_directional_mode, angle_stats_ready = 0;
10443   uint8_t directional_mode_skip_mask[INTRA_MODES];
10444 #endif  // CONFIG_EXT_INTRA
10445 #if CONFIG_FILTER_INTRA
10446   int8_t dc_skipped = 1;
10447   FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL];
10448 #endif  // CONFIG_FILTER_INTRA
10449   const int intra_cost_penalty = av1_get_intra_cost_penalty(
10450       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
10451   const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
10452   int best_skip2 = 0;
10453   uint16_t ref_frame_skip_mask[2] = { 0 };
10454   uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
10455 #if CONFIG_INTERINTRA
10456   MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
10457   int64_t best_single_inter_rd = INT64_MAX;
10458 #endif  // CONFIG_INTERINTRA
10459   int mode_skip_start = sf->mode_skip_start + 1;
10460   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
10461   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
10462   int64_t mode_threshold[MAX_MODES];
10463   int *mode_map = tile_data->mode_map[bsize];
10464   const int mode_search_skip_flags = sf->mode_search_skip_flags;
10465 #if CONFIG_PVQ
10466   od_rollback_buffer pre_buf;
10467 #endif  // CONFIG_PVQ
10468 
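  // Scratch arguments shared by the inter mode handlers; the OBMC prediction
  // buffers and strides are wired up below when motion variation is enabled.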
10469   HandleInterModeArgs args = {
10470 #if CONFIG_MOTION_VAR
10471     { NULL },
10472     { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
10473     { NULL },
10474     { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
10475 #endif  // CONFIG_MOTION_VAR
10476     NULL,
10477     NULL,
10478     NULL,
10479     { { 0 } },
10480   };
10481 
10482   const int rows = block_size_high[bsize];
10483   const int cols = block_size_wide[bsize];
10484   int palette_ctx = 0;
10485   const MODE_INFO *above_mi = xd->above_mi;
10486   const MODE_INFO *left_mi = xd->left_mi;
10487 #if CONFIG_MOTION_VAR
10488   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
10489   int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
10490   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
10491   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
10492 
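  // In high bitdepth mode the prediction buffers hold uint16_t samples, so
  // the per-plane offsets are scaled by sizeof(uint16_t) and the pointers
  // wrapped with CONVERT_TO_BYTEPTR.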
10493 #if CONFIG_HIGHBITDEPTH
10494   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
10495     int len = sizeof(uint16_t);
10496     args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
10497     args.above_pred_buf[1] =
10498         CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
10499     args.above_pred_buf[2] =
10500         CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
10501     args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
10502     args.left_pred_buf[1] =
10503         CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
10504     args.left_pred_buf[2] =
10505         CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
10506   } else {
10507 #endif  // CONFIG_HIGHBITDEPTH
10508     args.above_pred_buf[0] = x->above_pred_buf;
10509     args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
10510     args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
10511     args.left_pred_buf[0] = x->left_pred_buf;
10512     args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
10513     args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
10514 #if CONFIG_HIGHBITDEPTH
10515   }
10516 #endif  // CONFIG_HIGHBITDEPTH
10517 #endif  // CONFIG_MOTION_VAR
10518 
10519   av1_zero(best_mbmode);
10520 
10521   av1_zero(pmi_uv);
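  // The palette mode context counts how many of the above/left neighbors
  // use a luma palette.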
10522   if (try_palette) {
10523     if (above_mi)
10524       palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
10525     if (left_mi)
10526       palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
10527   }
10528 
10529   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
10530                            &comp_mode_p);
10531 
10532   for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
10533   for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX;
10534   for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
10535   for (i = 0; i < MB_MODE_COUNT; ++i) {
10536     for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
10537       args.single_filter[i][k] = SWITCHABLE;
10538     }
10539   }
10540 
10541   rd_cost->rate = INT_MAX;
10542 #if CONFIG_SUPERTX
10543   *returnrate_nocoef = INT_MAX;
10544 #endif  // CONFIG_SUPERTX
10545 
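  // For each active single reference frame, set up the MV candidates and
  // prediction buffers, and seed the ZEROMV candidates (from the global
  // motion parameters when available).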
10546   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
10547     x->pred_mv_sad[ref_frame] = INT_MAX;
10548     x->mbmi_ext->mode_context[ref_frame] = 0;
10549     x->mbmi_ext->compound_mode_context[ref_frame] = 0;
10550     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
10551       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
10552       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
10553                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
10554     }
10555     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
10556 #if CONFIG_GLOBAL_MOTION
10557     frame_mv[ZEROMV][ref_frame].as_int =
10558         gm_get_motion_vector(&cm->global_motion[ref_frame],
10559                              cm->allow_high_precision_mv, bsize, mi_col, mi_row,
10560                              0
10561 #if CONFIG_AMVR
10562                              ,
10563                              cm->cur_frame_mv_precision_level
10564 #endif
10565                              )
10566             .as_int;
10567 #else   // CONFIG_GLOBAL_MOTION
10568     frame_mv[ZEROMV][ref_frame].as_int = 0;
10569 #endif  // CONFIG_GLOBAL_MOTION
10570     frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
10571 #if CONFIG_COMPOUND_SINGLEREF
10572     frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV;
10573     frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV;
10574 #endif  // CONFIG_COMPOUND_SINGLEREF
10575 #if CONFIG_GLOBAL_MOTION
10576     frame_mv[ZERO_ZEROMV][ref_frame].as_int =
10577         gm_get_motion_vector(&cm->global_motion[ref_frame],
10578                              cm->allow_high_precision_mv, bsize, mi_col, mi_row,
10579                              0
10580 #if CONFIG_AMVR
10581                              ,
10582                              cm->cur_frame_mv_precision_level
10583 #endif
10584                              )
10585             .as_int;
10586 #else   // CONFIG_GLOBAL_MOTION
10587     frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
10588 #endif  // CONFIG_GLOBAL_MOTION
10589   }
10590 
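  // For the compound reference types, gather the MV reference candidates;
  // when fewer than two are found, clear the all-zero context flag if any
  // candidate differs from the (global-motion) zero MV.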
10591   for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
10592     MODE_INFO *const mi = xd->mi[0];
10593     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
10594     x->mbmi_ext->mode_context[ref_frame] = 0;
10595     av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
10596                      mbmi_ext->ref_mv_stack[ref_frame],
10597                      mbmi_ext->compound_mode_context, candidates, mi_row,
10598                      mi_col, NULL, NULL, mbmi_ext->mode_context);
10599     if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
10600       MV_REFERENCE_FRAME rf[2];
10601       av1_set_ref_frame(rf, ref_frame);
10602       if (mbmi_ext->ref_mvs[rf[0]][0].as_int !=
10603               frame_mv[ZEROMV][rf[0]].as_int ||
10604           mbmi_ext->ref_mvs[rf[0]][1].as_int !=
10605               frame_mv[ZEROMV][rf[0]].as_int ||
10606           mbmi_ext->ref_mvs[rf[1]][0].as_int !=
10607               frame_mv[ZEROMV][rf[1]].as_int ||
10608           mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int)
10609         mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
10610     }
10611   }
10612 
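  // Precompute the above/left OBMC predictions and the target weighted
  // prediction once; they are reused by every motion-variation mode
  // evaluated below.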
10613 #if CONFIG_MOTION_VAR
10614   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
10615 
10616   if (check_num_overlappable_neighbors(mbmi) &&
10617       is_motion_variation_allowed_bsize(bsize)) {
10618     av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
10619                                         args.above_pred_buf, dst_width1,
10620                                         dst_height1, args.above_pred_stride);
10621     av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
10622                                        args.left_pred_buf, dst_width2,
10623                                        dst_height2, args.left_pred_stride);
10624     av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
10625                          mi_col);
10626     calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
10627                               args.above_pred_stride[0], args.left_pred_buf[0],
10628                               args.left_pred_stride[0]);
10629   }
10630 #endif  // CONFIG_MOTION_VAR
10631 
10632   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
10633     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
10634       // Skip checking missing references in both single and compound reference
10635       // modes. Note that a mode will be skipped iff both reference frames
10636       // are masked out.
10637       ref_frame_skip_mask[0] |= (1 << ref_frame);
10638       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10639     } else {
10640       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
10641         // Skip fixed mv modes for poor references
10642         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
10643           mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
10644           break;
10645         }
10646       }
10647     }
    // If the segment-level reference frame feature is enabled, mask out any
    // reference frame other than the one allowed for this segment.
10650     if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
10651         get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
10652       ref_frame_skip_mask[0] |= (1 << ref_frame);
10653       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10654     }
10655   }
10656 
  // Disable this drop-out case if the segment-level reference frame feature
  // is enabled for this segment. This prevents the possibility that we end
  // up unable to pick any mode.
10660   if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
10661     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
10662     // unless ARNR filtering is enabled in which case we want
10663     // an unfiltered alternative. We allow near/nearest as well
10664     // because they may result in zero-zero MVs but be cheaper.
10665     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
10666       int_mv zeromv;
10667       ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
10668 #if CONFIG_EXT_REFS
10669                                (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
10670                                (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) |
10671 #endif  // CONFIG_EXT_REFS
10672                                (1 << GOLDEN_FRAME);
10673       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
      // TODO(zoeliu): To further explore whether the following needs to be
      //               done for BWDREF_FRAME as well.
10676       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
10677 #if CONFIG_GLOBAL_MOTION
10678       zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
10679                                            cm->allow_high_precision_mv, bsize,
10680                                            mi_col, mi_row, 0
10681 #if CONFIG_AMVR
10682                                            ,
10683                                            cm->cur_frame_mv_precision_level
10684 #endif
10685                                            )
10686                           .as_int;
10687 #else
10688       zeromv.as_int = 0;
10689 #endif  // CONFIG_GLOBAL_MOTION
10690       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
10691         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
10692       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
10693         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
10694       if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
10695         mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
10696       if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
10697         mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
10698 #if CONFIG_COMPOUND_SINGLEREF
10699       if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int ||
10700           frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int !=
10701               zeromv.as_int)
10702         mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV);
10703 #endif  // CONFIG_COMPOUND_SINGLEREF
10704     }
10705   }
10706 
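  // With the fast alt-ref search, when the source frame is the ALTREF
  // itself, restrict the search to ALTREF_FRAME references only.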
10707   if (cpi->rc.is_src_frame_alt_ref) {
10708     if (sf->alt_ref_search_fp) {
10709       assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
10710       mode_skip_mask[ALTREF_FRAME] = 0;
10711       ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
10712       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
10713     }
10714   }
10715 
10716   if (sf->alt_ref_search_fp)
10717     if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
10718       if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
10719         mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
10720 
10721   if (sf->adaptive_mode_search) {
10722     if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
10723         cpi->rc.frames_since_golden >= 3)
10724       if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
10725         mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
10726   }
10727 
10728   if (bsize > sf->max_intra_bsize) {
10729     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
10730     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
10731   }
10732 
10733   mode_skip_mask[INTRA_FRAME] |=
10734       ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
10735 
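  // Seed the per-mode RD pruning thresholds: modes up to LAST_NEW_MV_INDEX
  // are never pruned, while the rest scale their static threshold by the
  // adaptive frequency factor for this block size.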
10736   for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
10737   for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
10738     mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
10739 
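  // When mode scheduling is enabled, bubble-sort the head of mode_map so
  // that modes with lower thresholds are visited first.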
10740   midx = sf->schedule_mode_search ? mode_skip_start : 0;
10741   while (midx > 4) {
    int end_pos = 0;
10743     for (i = 5; i < midx; ++i) {
10744       if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
        int tmp = mode_map[i];
10746         mode_map[i] = mode_map[i - 1];
10747         mode_map[i - 1] = tmp;
10748         end_pos = i;
10749       }
10750     }
10751     midx = end_pos;
10752   }
10753 
10754   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
10755     x->use_default_intra_tx_type = 1;
10756   else
10757     x->use_default_intra_tx_type = 0;
10758 
10759   if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
10760     x->use_default_inter_tx_type = 1;
10761   else
10762     x->use_default_inter_tx_type = 0;
10763 #if CONFIG_PVQ
10764   od_encode_checkpoint(&x->daala_enc, &pre_buf);
10765 #endif  // CONFIG_PVQ
10766   for (i = 0; i < MB_MODE_COUNT; ++i)
10767     for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
10768       modelled_rd[i][ref_frame] = INT64_MAX;
10769 
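  // Main mode-search loop: evaluate each (mode, reference frame) pair in
  // mode_map order, pruning with the reference and mode skip masks computed
  // above.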
10770   for (midx = 0; midx < MAX_MODES; ++midx) {
10771     int mode_index;
10772     int mode_excluded = 0;
10773     int64_t this_rd = INT64_MAX;
10774     int disable_skip = 0;
10775     int compmode_cost = 0;
10776     int rate2 = 0, rate_y = 0, rate_uv = 0;
10777     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
10778     int skippable = 0;
10779     int this_skip2 = 0;
10780     int64_t total_sse = INT64_MAX;
10781     uint8_t ref_frame_type;
10782 #if CONFIG_PVQ
10783     od_encode_rollback(&x->daala_enc, &pre_buf);
10784 #endif  // CONFIG_PVQ
10785     mode_index = mode_map[midx];
10786     this_mode = av1_mode_order[mode_index].mode;
10787     ref_frame = av1_mode_order[mode_index].ref_frame[0];
10788     second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
10789     mbmi->ref_mv_idx = 0;
10790 
10791     if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
      // Mode must be compatible
10793       if (!is_interintra_allowed_mode(this_mode)) continue;
10794       if (!is_interintra_allowed_bsize(bsize)) continue;
10795     }
10796 
10797     if (is_inter_compound_mode(this_mode)) {
10798       frame_mv[this_mode][ref_frame].as_int =
10799           frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
10800       frame_mv[this_mode][second_ref_frame].as_int =
10801           frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
10802 #if CONFIG_COMPOUND_SINGLEREF
10803     } else if (is_inter_singleref_comp_mode(this_mode)) {
10804       frame_mv[this_mode][ref_frame].as_int =
10805           frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
10806       frame_comp_mv[this_mode][ref_frame].as_int =
10807           frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int;
10808 #endif  // CONFIG_COMPOUND_SINGLEREF
10809     }
10810 
10811     // Look at the reference frame of the best mode so far and set the
10812     // skip mask to look at a subset of the remaining modes.
10813     if (midx == mode_skip_start && best_mode_index >= 0) {
10814       switch (best_mbmode.ref_frame[0]) {
10815         case INTRA_FRAME: break;
10816         case LAST_FRAME:
10817           ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
10818           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10819           break;
10820 #if CONFIG_EXT_REFS
10821         case LAST2_FRAME:
10822           ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
10823           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10824           break;
10825         case LAST3_FRAME:
10826           ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
10827           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10828           break;
10829 #endif  // CONFIG_EXT_REFS
10830         case GOLDEN_FRAME:
10831           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
10832           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10833           break;
10834 #if CONFIG_EXT_REFS
10835         case BWDREF_FRAME:
10836           ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
10837           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10838           break;
10839         case ALTREF2_FRAME:
10840           ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK;
10841           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10842           break;
10843 #endif  // CONFIG_EXT_REFS
10844         case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
10845 #if CONFIG_EXT_REFS
10846           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
10847 #endif  // CONFIG_EXT_REFS
10848           break;
10849         case NONE_FRAME:
10850         case TOTAL_REFS_PER_FRAME:
10851           assert(0 && "Invalid Reference frame");
10852           break;
10853       }
10854     }
10855 
10856     if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
10857         (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
10858       continue;
10859 
10860 #if CONFIG_EXT_COMP_REFS
// TODO(zoeliu): Toggle the following between #if 0 and #if 1 and the bug
// will manifest itself.
10863 #if 0
10864     if (!(cpi->ref_frame_flags & flag_list[ref_frame]) ||
10865         (second_ref_frame > INTRA_FRAME &&
10866          (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))))
10867       printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, "
10868              "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row,
10869              mi_col, ref_frame, second_ref_frame);
10870 
10871     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
10872     if (second_ref_frame > INTRA_FRAME &&
10873         (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))
10874       continue;
10875 #endif  // 0
10876 
10877 #if !USE_UNI_COMP_REFS
10878     // NOTE(zoeliu): Temporarily disable uni-directional comp refs
10879     if (second_ref_frame > INTRA_FRAME) {
10880       if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME)))
10881         continue;
10882     }
10883     assert(second_ref_frame <= INTRA_FRAME ||
10884            ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME)));
10885 #endif  // !USE_UNI_COMP_REFS
10886 #endif  // CONFIG_EXT_COMP_REFS
10887 
10888     if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
10889 
10890     // Test best rd so far against threshold for trying this mode.
10891     if (best_mode_skippable && sf->schedule_mode_search)
10892       mode_threshold[mode_index] <<= 1;
10893 
10894     if (best_rd < mode_threshold[mode_index]) continue;
10895 
    // This is only used in the motion vector unit test.
10897     if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
10898 
10899 #if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS  // Changes LL bitstream
10900 #if CONFIG_EXT_REFS
10901     if (cpi->oxcf.pass == 0) {
10902       // Complexity-compression trade-offs
10903       // if (ref_frame == ALTREF_FRAME) continue;
10904       // if (ref_frame == BWDREF_FRAME) continue;
10905       if (second_ref_frame == ALTREF_FRAME) continue;
10906       // if (second_ref_frame == BWDREF_FRAME) continue;
10907     }
10908 #endif  // CONFIG_EXT_REFS
10909 #endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
10910     comp_pred = second_ref_frame > INTRA_FRAME;
10911     if (comp_pred) {
10912       if (!cpi->allow_comp_inter_inter) continue;
10913 
10914       // Skip compound inter modes if ARF is not available.
10915       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
10916 
      // Do not allow compound prediction if the segment-level reference
      // frame feature is in use, since in that case only one reference frame
      // is allowed.
10919       if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
10920 
10921       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
10922           best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
10923         continue;
10924 
10925       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
10926     } else {
10927       if (ref_frame != INTRA_FRAME)
10928         mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
10929     }
10930 
10931     if (ref_frame == INTRA_FRAME) {
10932       if (sf->adaptive_mode_search)
10933         if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
10934           continue;
10935 
10936       if (this_mode != DC_PRED) {
10937         // Disable intra modes other than DC_PRED for blocks with low variance
10938         // Threshold for intra skipping based on source variance
10939         // TODO(debargha): Specialize the threshold for super block sizes
10940         const unsigned int skip_intra_var_thresh = 64;
10941         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
10942             x->source_variance < skip_intra_var_thresh)
10943           continue;
10944         // Only search the oblique modes if the best so far is
10945         // one of the neighboring directional modes
10946         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
10947             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
10948           if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
10949             continue;
10950         }
10951         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
10952           if (conditional_skipintra(this_mode, best_intra_mode)) continue;
10953         }
10954       }
10955 #if CONFIG_GLOBAL_MOTION
10956     } else if (cm->global_motion[ref_frame].wmtype == IDENTITY &&
10957                (!comp_pred ||
10958                 cm->global_motion[second_ref_frame].wmtype == IDENTITY)) {
10959 #else   // CONFIG_GLOBAL_MOTION
10960     } else {
10961 #endif  // CONFIG_GLOBAL_MOTION
10962       const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
10963       if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context,
10964                               mbmi_ext->compound_mode_context, frame_mv,
10965                               this_mode, ref_frames, bsize, -1, mi_row, mi_col))
10966         continue;
10967     }
10968 
10969     mbmi->mode = this_mode;
10970     mbmi->uv_mode = UV_DC_PRED;
10971     mbmi->ref_frame[0] = ref_frame;
10972     mbmi->ref_frame[1] = second_ref_frame;
10973     pmi->palette_size[0] = 0;
10974     pmi->palette_size[1] = 0;
10975 #if CONFIG_FILTER_INTRA
10976     mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
10977     mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
10978 #endif  // CONFIG_FILTER_INTRA
    // Evaluate all sub-pel filters irrespective of whether we can use them
    // for this frame.
10981 
10982     set_default_interp_filters(mbmi, cm->interp_filter);
10983 
10984     mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
10985     mbmi->motion_mode = SIMPLE_TRANSLATION;
10986 
10987     x->skip = 0;
10988     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
10989 
10990     // Select prediction reference frames.
10991     for (i = 0; i < MAX_MB_PLANE; i++) {
10992       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
10993       if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
10994     }
10995 
10996 #if CONFIG_COMPOUND_SINGLEREF
10997     // Single ref compound mode
10998     if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) {
10999       xd->block_refs[1] = xd->block_refs[0];
11000       for (i = 0; i < MAX_MB_PLANE; i++)
11001         xd->plane[i].pre[1] = xd->plane[i].pre[0];
11002     }
11003 #endif  // CONFIG_COMPOUND_SINGLEREF
11004 
11005 #if CONFIG_INTERINTRA
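    // Initialize to one below the first interintra mode; a real mode is
    // assigned only if interintra is evaluated for this block.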
11006     mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
11007 #endif  // CONFIG_INTERINTRA
11008 
11009     if (ref_frame == INTRA_FRAME) {
11010       RD_STATS rd_stats_y;
11011       TX_SIZE uv_tx;
11012       struct macroblockd_plane *const pd = &xd->plane[1];
11013 #if CONFIG_EXT_INTRA
11014       is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
11015       if (is_directional_mode && av1_use_angle_delta(bsize)) {
11016         int rate_dummy;
11017         int64_t model_rd = INT64_MAX;
11018         if (!angle_stats_ready) {
11019           const int src_stride = x->plane[0].src.stride;
11020           const uint8_t *src = x->plane[0].src.buf;
11021 #if CONFIG_HIGHBITDEPTH
11022           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
11023             highbd_angle_estimation(src, src_stride, rows, cols, bsize,
11024                                     directional_mode_skip_mask);
11025           else
11026 #endif  // CONFIG_HIGHBITDEPTH
11027             angle_estimation(src, src_stride, rows, cols, bsize,
11028                              directional_mode_skip_mask);
11029           angle_stats_ready = 1;
11030         }
11031         if (directional_mode_skip_mask[mbmi->mode]) continue;
11032         rd_stats_y.rate = INT_MAX;
11033         rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize,
11034                                 intra_mode_cost[mbmi->mode], best_rd,
11035                                 &model_rd);
11036       } else {
11037         mbmi->angle_delta[0] = 0;
11038         super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
11039       }
11040 #else
11041       super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
11042 #endif  // CONFIG_EXT_INTRA
11043       rate_y = rd_stats_y.rate;
11044       distortion_y = rd_stats_y.dist;
11045       skippable = rd_stats_y.skip;
11046 
11047       if (rate_y == INT_MAX) continue;
11048 
11049 #if CONFIG_FILTER_INTRA
11050       if (mbmi->mode == DC_PRED) dc_skipped = 0;
11051 #endif  // CONFIG_FILTER_INTRA
11052 
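      // The chroma intra mode decision depends only on the chroma tx size,
      // so search it once per uv_tx and cache the result for reuse.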
11053       uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
11054                               [pd->subsampling_y];
11055       if (rate_uv_intra[uv_tx] == INT_MAX) {
11056         choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
11057                              &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
11058                              &skip_uvs[uv_tx], &mode_uv[uv_tx]);
11059         if (try_palette) pmi_uv[uv_tx] = *pmi;
11060 
11061 #if CONFIG_EXT_INTRA
11062         uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
11063 #endif  // CONFIG_EXT_INTRA
11064 #if CONFIG_FILTER_INTRA
11065         filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
11066 #endif  // CONFIG_FILTER_INTRA
11067       }
11068 
11069       rate_uv = rate_uv_tokenonly[uv_tx];
11070       distortion_uv = dist_uvs[uv_tx];
11071       skippable = skippable && skip_uvs[uv_tx];
11072       mbmi->uv_mode = mode_uv[uv_tx];
11073       if (try_palette) {
11074         pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
11075         memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
11076                pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
11077                2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
11078       }
11079 
11080 #if CONFIG_EXT_INTRA
11081       mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
11082 #endif  // CONFIG_EXT_INTRA
11083 #if CONFIG_FILTER_INTRA
11084       mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
11085           filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
11086       if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
11087         mbmi->filter_intra_mode_info.filter_intra_mode[1] =
11088             filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
11089       }
11090 #endif  // CONFIG_FILTER_INTRA
11091 
11092 #if CONFIG_CB4X4
11093       rate2 = rate_y + intra_mode_cost[mbmi->mode];
11094       if (!x->skip_chroma_rd)
11095         rate2 += rate_uv + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
11096 #else
11097       rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
11098               x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
11099 #endif  // CONFIG_CB4X4
11100 
11101       if (try_palette && mbmi->mode == DC_PRED) {
11102         rate2 += av1_cost_bit(
11103             av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
11104       }
11105 
11106       if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
11107         // super_block_yrd above includes the cost of the tx_size in the
11108         // tokenonly rate, but for intra blocks, tx_size is always coded
11109         // (prediction granularity), so we account for it in the full rate,
11110         // not the tokenonly rate.
11111         rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
11112       }
11113 #if CONFIG_EXT_INTRA
11114       if (is_directional_mode) {
11115 #if CONFIG_INTRA_INTERP
11116         const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
11117         const int p_angle =
11118             mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
11119         if (av1_is_intra_filter_switchable(p_angle))
11120           rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
11121 #endif  // CONFIG_INTRA_INTERP
11122         if (av1_use_angle_delta(bsize)) {
11123           rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
11124                                       MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
11125         }
11126       }
11127       if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
11128           av1_use_angle_delta(bsize)) {
11129         rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
11130                                     MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
11131       }
11132 #endif  // CONFIG_EXT_INTRA
11133 #if CONFIG_FILTER_INTRA
11134       if (mbmi->mode == DC_PRED) {
11135         rate2 +=
11136             av1_cost_bit(cm->fc->filter_intra_probs[0],
11137                          mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
11138         if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
11139           rate2 += write_uniform_cost(
11140               FILTER_INTRA_MODES,
11141               mbmi->filter_intra_mode_info.filter_intra_mode[0]);
11142         }
11143       }
11144       if (mbmi->uv_mode == UV_DC_PRED) {
11145         rate2 +=
11146             av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
11147                          mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
11148         if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
11149           rate2 += write_uniform_cost(
11150               FILTER_INTRA_MODES,
11151               mbmi->filter_intra_mode_info.filter_intra_mode[1]);
11152       }
11153 #endif  // CONFIG_FILTER_INTRA
11154       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
11155         rate2 += intra_cost_penalty;
11156       distortion2 = distortion_y + distortion_uv;
11157     } else {
11158       int_mv backup_ref_mv[2];
11159 
11160       if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME)
11161         continue;
11162 
11163       backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
11164       if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
11165 #if CONFIG_INTERINTRA
11166       if (second_ref_frame == INTRA_FRAME) {
11167         if (best_single_inter_ref != ref_frame) continue;
11168         mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
11169 // TODO(debargha|geza.lore):
11170 // Should we use ext_intra modes for interintra?
11171 #if CONFIG_EXT_INTRA
11172         mbmi->angle_delta[0] = 0;
11173         mbmi->angle_delta[1] = 0;
11174 #if CONFIG_INTRA_INTERP
11175         mbmi->intra_filter = INTRA_FILTER_LINEAR;
11176 #endif  // CONFIG_INTRA_INTERP
11177 #endif  // CONFIG_EXT_INTRA
11178 #if CONFIG_FILTER_INTRA
11179         mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
11180         mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
11181 #endif  // CONFIG_FILTER_INTRA
11182       }
11183 #endif  // CONFIG_INTERINTRA
11184       mbmi->ref_mv_idx = 0;
11185       ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
11186 
11187       if (comp_pred) {
11188         if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
11189           int ref_mv_idx = 0;
11190           // Special case: NEAR_NEWMV and NEW_NEARMV modes use
11191           // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
11192           // mbmi->ref_mv_idx (like NEWMV)
11193           if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
11194             ref_mv_idx = 1;
11195 
11196           if (compound_ref0_mode(mbmi->mode) == NEWMV) {
11197             int_mv this_mv =
11198                 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
11199             clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11200                          xd->n8_h << MI_SIZE_LOG2, xd);
11201             mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11202           }
11203           if (compound_ref1_mode(mbmi->mode) == NEWMV) {
11204             int_mv this_mv =
11205                 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
11206             clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11207                          xd->n8_h << MI_SIZE_LOG2, xd);
11208             mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
11209           }
11210         }
11211 #if CONFIG_COMPOUND_SINGLEREF
11212       } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
11213         if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
          // TODO(zoeliu): To further investigate which ref_mv_idx should be
          //               chosen for the SR_NEAR_NEWMV mode.
11216           int ref_mv_idx = 0;
          // Special case: the SR_NEAR_NEWMV mode uses
          // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
          // mbmi->ref_mv_idx (like NEWMV).
11220           if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1;
11221 
11222           if (compound_ref0_mode(mbmi->mode) == NEWMV ||
11223               compound_ref1_mode(mbmi->mode) == NEWMV) {
11224             int_mv this_mv =
11225                 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
11226             clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11227                          xd->n8_h << MI_SIZE_LOG2, xd);
11228             mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11229           }
11230         }
11231 #endif  // CONFIG_COMPOUND_SINGLEREF
11232       } else {
11233         if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
11234           int ref;
11235           for (ref = 0; ref < 1 + comp_pred; ++ref) {
11236             int_mv this_mv =
11237                 (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
11238                            : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
11239             clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11240                          xd->n8_h << MI_SIZE_LOG2, xd);
11241             mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
11242           }
11243         }
11244       }
11245       {
11246         RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
11247         av1_init_rd_stats(&rd_stats);
11248         rd_stats.rate = rate2;
11249 
11250         // Point to variables that are maintained between loop iterations
11251         args.single_newmv = single_newmv;
11252         args.single_newmv_rate = single_newmv_rate;
11253         args.modelled_rd = modelled_rd;
11254         this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
11255                                     &rd_stats_uv, &disable_skip, frame_mv,
11256 #if CONFIG_COMPOUND_SINGLEREF
11257                                     frame_comp_mv,
11258 #endif  // CONFIG_COMPOUND_SINGLEREF
11259                                     mi_row, mi_col, &args, best_rd);
11260 
11261         rate2 = rd_stats.rate;
11262         skippable = rd_stats.skip;
11263         distortion2 = rd_stats.dist;
11264         total_sse = rd_stats.sse;
11265         rate_y = rd_stats_y.rate;
11266         rate_uv = rd_stats_uv.rate;
11267       }
11268 
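      // Dynamic reference list (DRL) refinement: when the MV stack holds
      // extra candidates, re-evaluate this mode with ref_mv_idx > 0 and keep
      // whichever choice gives the lower RD cost.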
11269 // TODO(jingning): This needs some refactoring to improve code quality
11270 // and reduce redundant steps.
11271 #if CONFIG_COMPOUND_SINGLEREF
11272       if ((have_nearmv_in_inter_mode(mbmi->mode) &&
11273            mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
11274           ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV ||
11275             mbmi->mode == NEW_NEWMV) &&
11276            mbmi_ext->ref_mv_count[ref_frame_type] > 1))
11277 #else   // !CONFIG_COMPOUND_SINGLEREF
11278       if ((have_nearmv_in_inter_mode(mbmi->mode) &&
11279            mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
11280           ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
11281            mbmi_ext->ref_mv_count[ref_frame_type] > 1))
11282 #endif  // CONFIG_COMPOUND_SINGLEREF
11283       {
11284         int_mv backup_mv = frame_mv[NEARMV][ref_frame];
11285         MB_MODE_INFO backup_mbmi = *mbmi;
11286         int backup_skip = x->skip;
11287         int64_t tmp_ref_rd = this_rd;
11288         int ref_idx;
11289 
11290         // TODO(jingning): This should be deprecated shortly.
11291         int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
11292         int ref_set =
11293             AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
11294 
11295         uint8_t drl_ctx =
11296             av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
        // Back up the NEWMV motion vectors so they can be restored after the
        // ref_mv_idx search below.
11298         int_mv backup_fmv[2];
11299         backup_fmv[0] = frame_mv[NEWMV][ref_frame];
11300         if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
11301 
11302         rate2 += (rate2 < INT_MAX ? x->drl_mode_cost0[drl_ctx][0] : 0);
11303 
11304         if (this_rd < INT64_MAX) {
11305           if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) <
11306               RDCOST(x->rdmult, 0, total_sse))
11307             tmp_ref_rd = RDCOST(
11308                 x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
11309                 distortion2);
11310           else
11311             tmp_ref_rd =
11312                 RDCOST(x->rdmult,
11313                        rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
11314                            rate_y - rate_uv,
11315                        total_sse);
11316         }
11317 #if CONFIG_VAR_TX
11318         for (i = 0; i < MAX_MB_PLANE; ++i)
11319           memcpy(x->blk_skip_drl[i], x->blk_skip[i],
11320                  sizeof(uint8_t) * ctx->num_4x4_blk);
11321 #endif  // CONFIG_VAR_TX
11322 
11323         for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
11324           int64_t tmp_alt_rd = INT64_MAX;
11325           int dummy_disable_skip = 0;
11326           int ref;
11327           int_mv cur_mv;
11328           RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
11329 
11330           av1_invalid_rd_stats(&tmp_rd_stats);
11331 
11332           x->skip = 0;
11333 
11334           mbmi->ref_mv_idx = 1 + ref_idx;
11335 
11336           if (comp_pred) {
11337             int ref_mv_idx = mbmi->ref_mv_idx;
11338             // Special case: NEAR_NEWMV and NEW_NEARMV modes use
11339             // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
11340             // mbmi->ref_mv_idx (like NEWMV)
11341             if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
11342               ref_mv_idx = 1 + mbmi->ref_mv_idx;
11343 
11344             if (compound_ref0_mode(mbmi->mode) == NEWMV) {
11345               int_mv this_mv =
11346                   mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
11347               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11348                            xd->n8_h << MI_SIZE_LOG2, xd);
11349               mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11350             } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) {
11351               int_mv this_mv =
11352                   mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
11353               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11354                            xd->n8_h << MI_SIZE_LOG2, xd);
11355               mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11356             }
11357 
11358             if (compound_ref1_mode(mbmi->mode) == NEWMV) {
11359               int_mv this_mv =
11360                   mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
11361               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11362                            xd->n8_h << MI_SIZE_LOG2, xd);
11363               mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
11364             } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) {
11365               int_mv this_mv =
11366                   mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
11367               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11368                            xd->n8_h << MI_SIZE_LOG2, xd);
11369               mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
11370             }
11371 #if CONFIG_COMPOUND_SINGLEREF
11372           } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
11373             int ref_mv_idx = mbmi->ref_mv_idx;
            // Special case: the SR_NEAR_NEWMV mode uses
            // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
            // mbmi->ref_mv_idx (like NEWMV).
11377             if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx;
11378 
            // TODO(zoeliu): For SR_NEAREST_NEWMV, only the "if" branch below
            //               (not the "else if") runs, so
            //               mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the
            //               value for NEWMV instead of NEARESTMV.
11383             if (compound_ref0_mode(mbmi->mode) == NEWMV ||
11384                 compound_ref1_mode(mbmi->mode) == NEWMV) {
11385               int_mv this_mv =
11386                   mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
11387               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11388                            xd->n8_h << MI_SIZE_LOG2, xd);
11389               mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11390             } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV ||
11391                        compound_ref1_mode(mbmi->mode) == NEARESTMV) {
11392               int_mv this_mv =
11393                   mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
11394               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11395                            xd->n8_h << MI_SIZE_LOG2, xd);
11396               mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
11397             }
11398 #endif  // CONFIG_COMPOUND_SINGLEREF
11399           } else {
11400             for (ref = 0; ref < 1 + comp_pred; ++ref) {
11401               int_mv this_mv =
11402                   (ref == 0)
11403                       ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
11404                             .this_mv
11405                       : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
11406                             .comp_mv;
11407               clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
11408                            xd->n8_h << MI_SIZE_LOG2, xd);
11409               mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
11410             }
11411           }
11412 
11413           cur_mv =
11414               mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
11415                   .this_mv;
11416           clamp_mv2(&cur_mv.as_mv, xd);
11417 
11418           if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
11419             int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
11420             int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
11421 
11422             frame_mv[NEARMV][ref_frame] = cur_mv;
11423             av1_init_rd_stats(&tmp_rd_stats);
11424 
11425             // Point to variables that are not maintained between iterations
11426             args.single_newmv = dummy_single_newmv;
11427             args.single_newmv_rate = dummy_single_newmv_rate;
11428             args.modelled_rd = NULL;
11429             tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats,
11430                                            &tmp_rd_stats_y, &tmp_rd_stats_uv,
11431                                            &dummy_disable_skip, frame_mv,
11432 #if CONFIG_COMPOUND_SINGLEREF
11433                                            frame_comp_mv,
11434 #endif  // CONFIG_COMPOUND_SINGLEREF
11435                                            mi_row, mi_col, &args, best_rd);
11436             // Prevent pointers from escaping local scope
11437             args.single_newmv = NULL;
11438             args.single_newmv_rate = NULL;
11439           }
11440 
11441           for (i = 0; i < mbmi->ref_mv_idx; ++i) {
11442             uint8_t drl1_ctx = 0;
11443             drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
11444                                    i + idx_offset);
11445             tmp_rd_stats.rate +=
11446                 (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][1]
11447                                              : 0);
11448           }
11449 
11450           if (mbmi_ext->ref_mv_count[ref_frame_type] >
11451                   mbmi->ref_mv_idx + idx_offset + 1 &&
11452               ref_idx < ref_set - 1) {
11453             uint8_t drl1_ctx =
11454                 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
11455                             mbmi->ref_mv_idx + idx_offset);
11456             tmp_rd_stats.rate +=
11457                 (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][0]
11458                                              : 0);
11459           }
11460 
11461           if (tmp_alt_rd < INT64_MAX) {
11462 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11463             tmp_alt_rd =
11464                 RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist);
11465 #else
11466             if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
11467                        tmp_rd_stats.dist) <
11468                 RDCOST(x->rdmult, 0, tmp_rd_stats.sse))
11469               tmp_alt_rd =
11470                   RDCOST(x->rdmult,
11471                          tmp_rd_stats.rate +
11472                              av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
11473                          tmp_rd_stats.dist);
11474             else
11475               tmp_alt_rd =
11476                   RDCOST(x->rdmult,
11477                          tmp_rd_stats.rate +
11478                              av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
11479                              tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
11480                          tmp_rd_stats.sse);
11481 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11482           }
11483 
11484           if (tmp_ref_rd > tmp_alt_rd) {
11485             rate2 = tmp_rd_stats.rate;
11486             disable_skip = dummy_disable_skip;
11487             distortion2 = tmp_rd_stats.dist;
11488             skippable = tmp_rd_stats.skip;
11489             rate_y = tmp_rd_stats_y.rate;
11490             rate_uv = tmp_rd_stats_uv.rate;
11491             total_sse = tmp_rd_stats.sse;
11492             this_rd = tmp_alt_rd;
11493             tmp_ref_rd = tmp_alt_rd;
11494             backup_mbmi = *mbmi;
11495             backup_skip = x->skip;
11496 #if CONFIG_VAR_TX
11497             for (i = 0; i < MAX_MB_PLANE; ++i)
11498               memcpy(x->blk_skip_drl[i], x->blk_skip[i],
11499                      sizeof(uint8_t) * ctx->num_4x4_blk);
11500 #endif  // CONFIG_VAR_TX
11501           } else {
11502             *mbmi = backup_mbmi;
11503             x->skip = backup_skip;
11504           }
11505         }
11506 
11507         frame_mv[NEARMV][ref_frame] = backup_mv;
11508         frame_mv[NEWMV][ref_frame] = backup_fmv[0];
11509         if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
11510 #if CONFIG_VAR_TX
11511         for (i = 0; i < MAX_MB_PLANE; ++i)
11512           memcpy(x->blk_skip[i], x->blk_skip_drl[i],
11513                  sizeof(uint8_t) * ctx->num_4x4_blk);
11514 #endif  // CONFIG_VAR_TX
11515       }
11516       mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
11517       if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
11518 
11519       if (this_rd == INT64_MAX) continue;
11520 
11521       if (is_comp_ref_allowed(mbmi->sb_type))
11522         compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
11523 
11524       if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
11525     }
11526 
11527     // Estimate the reference frame signaling cost and add it
11528     // to the rolling cost variable.
11529     if (comp_pred) {
11530 #if CONFIG_EXT_COMP_REFS
11531       rate2 += ref_costs_comp[ref_frame][second_ref_frame];
11532 #else  // !CONFIG_EXT_COMP_REFS
11533       rate2 += ref_costs_comp[ref_frame];
11534 #if CONFIG_EXT_REFS
11535       rate2 += ref_costs_comp[second_ref_frame];
11536 #endif  // CONFIG_EXT_REFS
11537 #endif  // CONFIG_EXT_COMP_REFS
11538     } else {
11539       rate2 += ref_costs_single[ref_frame];
11540     }
11541 
11542 #if CONFIG_COMPOUND_SINGLEREF
11543     // Add the cost to signal single/comp mode in single ref.
11544     if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) {
11545       aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd);
11546       rate2 += av1_cost_bit(singleref_comp_mode_p,
11547                             is_inter_singleref_comp_mode(mbmi->mode));
11548     }
11549 #endif  // CONFIG_COMPOUND_SINGLEREF
11550 
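    // Fold the skip-flag signaling into the rate and, where skipping the
    // residual is cheaper, zero out the coefficient rates.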
11551 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11552     if (ref_frame == INTRA_FRAME)
11553 #else
11554     if (!disable_skip)
11555 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11556     {
11557       if (skippable) {
11558         // Back out the coefficient coding costs
11559         rate2 -= (rate_y + rate_uv);
11560         rate_y = 0;
11561         rate_uv = 0;
11562         // Cost the skip mb case
11563         rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
11564       } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
11565         if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) <
11566             RDCOST(x->rdmult, rate_skip1, total_sse)) {
11567           // Add in the cost of the no skip flag.
11568           rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
11569         } else {
11570           // FIXME(rbultje) make this work for splitmv also
11571           rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
11572           distortion2 = total_sse;
11573           assert(total_sse >= 0);
11574           rate2 -= (rate_y + rate_uv);
11575           this_skip2 = 1;
11576           rate_y = 0;
11577           rate_uv = 0;
11578         }
11579       } else {
11580         // Add in the cost of the no skip flag.
11581         rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
11582       }
11583 
11584       // Calculate the final RD estimate for this mode.
11585       this_rd = RDCOST(x->rdmult, rate2, distortion2);
11586 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11587     } else {
11588       this_skip2 = mbmi->skip;
11589       this_rd = RDCOST(x->rdmult, rate2, distortion2);
11590       if (this_skip2) {
11591         rate_y = 0;
11592         rate_uv = 0;
11593       }
11594 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11595     }
11596 
11597     if (ref_frame == INTRA_FRAME) {
11598       // Keep record of best intra rd
11599       if (this_rd < best_intra_rd) {
11600         best_intra_rd = this_rd;
11601         best_intra_mode = mbmi->mode;
11602       }
11603 #if CONFIG_INTERINTRA
11604     } else if (second_ref_frame == NONE_FRAME) {
11605       if (this_rd < best_single_inter_rd) {
11606         best_single_inter_rd = this_rd;
11607         best_single_inter_ref = mbmi->ref_frame[0];
11608       }
11609 #endif  // CONFIG_INTERINTRA
11610     }
11611 
11612     if (!disable_skip && ref_frame == INTRA_FRAME) {
11613       for (i = 0; i < REFERENCE_MODES; ++i)
11614         best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
11615     }
11616 
    // Did this mode help, i.e. is it the new best mode so far?
11618     if (this_rd < best_rd || x->skip) {
11619       if (!mode_excluded) {
11620         // Note index of best mode so far
11621         best_mode_index = mode_index;
11622 
11623         if (ref_frame == INTRA_FRAME) {
11624           /* required for left and above block mv */
11625           mbmi->mv[0].as_int = 0;
11626         } else {
11627           best_pred_sse = x->pred_sse[ref_frame];
11628         }
11629 
11630         rd_cost->rate = rate2;
11631 #if CONFIG_SUPERTX
11632         if (x->skip)
11633           *returnrate_nocoef = rate2;
11634         else
11635           *returnrate_nocoef = rate2 - rate_y - rate_uv;
11636         *returnrate_nocoef -= av1_cost_bit(
11637             av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
11638         *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
11639                                            mbmi->ref_frame[0] != INTRA_FRAME);
11640 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11641 #if CONFIG_WARPED_MOTION
11642         set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
11643 #endif
11644 #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
11645         MODE_INFO *const mi = xd->mi[0];
11646         const MOTION_MODE motion_allowed = motion_mode_allowed(
11647 #if CONFIG_GLOBAL_MOTION
11648             0, xd->global_motion,
11649 #endif  // CONFIG_GLOBAL_MOTION
11650 #if CONFIG_WARPED_MOTION
11651             xd,
11652 #endif
11653             mi);
11654         if (motion_allowed == WARPED_CAUSAL)
11655           *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
11656         else if (motion_allowed == OBMC_CAUSAL)
11657           *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode];
11658 #else
11659         *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
11660 #endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
11661 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
11662 #endif  // CONFIG_SUPERTX
11663         rd_cost->dist = distortion2;
11664         rd_cost->rdcost = this_rd;
11665         best_rd = this_rd;
11666         best_mbmode = *mbmi;
11667         best_skip2 = this_skip2;
11668         best_mode_skippable = skippable;
11669         best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
11670                                             this_skip2 || skippable);
11671         best_rate_uv = rate_uv;
11672 #if CONFIG_VAR_TX
11673         for (i = 0; i < MAX_MB_PLANE; ++i)
11674           memcpy(ctx->blk_skip[i], x->blk_skip[i],
11675                  sizeof(uint8_t) * ctx->num_4x4_blk);
11676 #endif  // CONFIG_VAR_TX
11677       }
11678     }
11679 
11680     /* keep record of best compound/single-only prediction */
11681     if (!disable_skip && ref_frame != INTRA_FRAME) {
11682       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
11683 
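      // single_rate excludes the cost of signaling the single/compound
      // selection bit while hybrid_rate includes it, so the two totals are
      // comparable whichever reference mode the frame ends up using.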
11684       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
11685         single_rate = rate2 - compmode_cost;
11686         hybrid_rate = rate2;
11687       } else {
11688         single_rate = rate2;
11689         hybrid_rate = rate2 + compmode_cost;
11690       }
11691 
11692       single_rd = RDCOST(x->rdmult, single_rate, distortion2);
11693       hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
11694 
11695       if (!comp_pred) {
11696         if (single_rd < best_pred_rd[SINGLE_REFERENCE])
11697           best_pred_rd[SINGLE_REFERENCE] = single_rd;
11698       } else {
11699         if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
11700           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
11701       }
11702       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
11703         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
11704     }
11705 
11706     if (x->skip && !comp_pred) break;
11707   }
11708 
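  // If a fast transform-type search was used in the mode loop, redo the
  // transform search for the winning mode over the full transform-type set
  // and keep the refined result if it lowers the overall RD cost.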
11709   if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
11710       ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
11711         is_inter_mode(best_mbmode.mode)) ||
11712        (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
11713         !is_inter_mode(best_mbmode.mode)))) {
11714     int skip_blk = 0;
11715     RD_STATS rd_stats_y, rd_stats_uv;
11716 
11717     x->use_default_inter_tx_type = 0;
11718     x->use_default_intra_tx_type = 0;
11719 
11720     *mbmi = best_mbmode;
11721 
11722     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
11723 
11724     // Select prediction reference frames.
11725     for (i = 0; i < MAX_MB_PLANE; i++) {
11726       xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
11727       if (has_second_ref(mbmi))
11728         xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
11729     }
11730 
11731 #if CONFIG_COMPOUND_SINGLEREF
11732     // Single ref compound mode
11733     if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) {
11734       xd->block_refs[1] = xd->block_refs[0];
11735       for (i = 0; i < MAX_MB_PLANE; i++)
11736         xd->plane[i].pre[1] = xd->plane[i].pre[0];
11737     }
11738 #endif  // CONFIG_COMPOUND_SINGLEREF
11739 
11740     if (is_inter_mode(mbmi->mode)) {
11741       av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
11742 #if CONFIG_MOTION_VAR
11743       if (mbmi->motion_mode == OBMC_CAUSAL) {
11744         av1_build_obmc_inter_prediction(
11745             cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
11746             args.left_pred_buf, args.left_pred_stride);
11747       }
11748 #endif  // CONFIG_MOTION_VAR
11749       av1_subtract_plane(x, bsize, 0);
11750 #if CONFIG_VAR_TX
11751       if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
11752         select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
11753         assert(rd_stats_y.rate != INT_MAX);
11754       } else {
11755         int idx, idy;
11756         super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
11757         for (idy = 0; idy < xd->n8_h; ++idy)
11758           for (idx = 0; idx < xd->n8_w; ++idx)
11759             mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
11760         memset(x->blk_skip[0], rd_stats_y.skip,
11761                sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
11762       }
11763 
11764       inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
11765 #else
11766       super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
11767       super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
11768 #endif  // CONFIG_VAR_TX
11769     } else {
11770       super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
11771       super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
11772     }
11773 
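    // Skip decision: compare the RD cost of coding the refined coefficients
    // against signaling skip and taking the full SSE as distortion.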
11774     if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
11775                (rd_stats_y.dist + rd_stats_uv.dist)) >
11776         RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
11777       skip_blk = 1;
11778       rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
11779       rd_stats_uv.rate = 0;
11780       rd_stats_y.dist = rd_stats_y.sse;
11781       rd_stats_uv.dist = rd_stats_uv.sse;
11782     } else {
11783       skip_blk = 0;
11784       rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
11785     }
11786 
11787     if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
11788         RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
11789                (rd_stats_y.dist + rd_stats_uv.dist))) {
11790 #if CONFIG_VAR_TX
11791       int idx, idy;
11792 #endif  // CONFIG_VAR_TX
11793       best_mbmode.tx_type = mbmi->tx_type;
11794       best_mbmode.tx_size = mbmi->tx_size;
11795 #if CONFIG_LGT_FROM_PRED
11796       best_mbmode.use_lgt = mbmi->use_lgt;
11797 #endif
11798 #if CONFIG_VAR_TX
11799       for (idy = 0; idy < xd->n8_h; ++idy)
11800         for (idx = 0; idx < xd->n8_w; ++idx)
11801           best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
11802 
11803       for (i = 0; i < MAX_MB_PLANE; ++i)
11804         memcpy(ctx->blk_skip[i], x->blk_skip[i],
11805                sizeof(uint8_t) * ctx->num_4x4_blk);
11806 
11807       best_mbmode.min_tx_size = mbmi->min_tx_size;
11808 #endif  // CONFIG_VAR_TX
11809       rd_cost->rate +=
11810           (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
11811       rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
11812       rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
11813       best_skip2 = skip_blk;
11814     }
11815   }
11816 
11817   // Only try palette mode when the best mode so far is an intra mode.
11818   if (try_palette && !is_inter_mode(best_mbmode.mode)) {
11819     int rate2 = 0;
11820 #if CONFIG_SUPERTX
11821     int best_rate_nocoef;
11822 #endif  // CONFIG_SUPERTX
11823     int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
11824             best_model_rd_palette = INT64_MAX;
11825     int skippable = 0, rate_overhead_palette = 0;
11826     RD_STATS rd_stats_y;
11827     TX_SIZE uv_tx;
11828     uint8_t *const best_palette_color_map =
11829         x->palette_buffer->best_palette_color_map;
11830     uint8_t *const color_map = xd->plane[0].color_index_map;
11831     MB_MODE_INFO best_mbmi_palette = best_mbmode;
11832 
11833     mbmi->mode = DC_PRED;
11834     mbmi->uv_mode = UV_DC_PRED;
11835     mbmi->ref_frame[0] = INTRA_FRAME;
11836     mbmi->ref_frame[1] = NONE_FRAME;
11837     rate_overhead_palette = rd_pick_palette_intra_sby(
11838         cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
11839         &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
11840         &best_model_rd_palette, NULL, NULL, NULL, NULL);
11841     if (pmi->palette_size[0] == 0) goto PALETTE_EXIT;
11842     memcpy(color_map, best_palette_color_map,
11843            rows * cols * sizeof(best_palette_color_map[0]));
11844     super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
11845     if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT;
11846     uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
11847                             [xd->plane[1].subsampling_y];
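    // The chroma intra mode search is cached per uv transform size; run it
    // only the first time this size is seen.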
11848     if (rate_uv_intra[uv_tx] == INT_MAX) {
11849       choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
11850                            &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
11851                            &skip_uvs[uv_tx], &mode_uv[uv_tx]);
11852       pmi_uv[uv_tx] = *pmi;
11853 #if CONFIG_EXT_INTRA
11854       uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
11855 #endif  // CONFIG_EXT_INTRA
11856 #if CONFIG_FILTER_INTRA
11857       filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
11858 #endif  // CONFIG_FILTER_INTRA
11859     }
11860     mbmi->uv_mode = mode_uv[uv_tx];
11861     pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
11862     if (pmi->palette_size[1] > 0) {
11863       memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
11864              pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
11865              2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
11866     }
11867 #if CONFIG_EXT_INTRA
11868     mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
11869 #endif  // CONFIG_EXT_INTRA
11870 #if CONFIG_FILTER_INTRA
11871     mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
11872         filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
11873     if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
11874       mbmi->filter_intra_mode_info.filter_intra_mode[1] =
11875           filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
11876     }
11877 #endif  // CONFIG_FILTER_INTRA
11878     skippable = rd_stats_y.skip && skip_uvs[uv_tx];
11879     distortion2 = rd_stats_y.dist + dist_uvs[uv_tx];
11880     rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx];
11881     rate2 += ref_costs_single[INTRA_FRAME];
11882 
11883     if (skippable) {
11884       rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
11885 #if CONFIG_SUPERTX
11886       best_rate_nocoef = rate2;
11887 #endif  // CONFIG_SUPERTX
11888       rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
11889     } else {
11890 #if CONFIG_SUPERTX
11891       best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
11892 #endif  // CONFIG_SUPERTX
11893       rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
11894     }
11895     this_rd = RDCOST(x->rdmult, rate2, distortion2);
11896     if (this_rd < best_rd) {
11897       best_mode_index = 3;
11898       mbmi->mv[0].as_int = 0;
11899       rd_cost->rate = rate2;
11900 #if CONFIG_SUPERTX
11901       *returnrate_nocoef = best_rate_nocoef;
11902 #endif  // CONFIG_SUPERTX
11903       rd_cost->dist = distortion2;
11904       rd_cost->rdcost = this_rd;
11905       best_rd = this_rd;
11906       best_mbmode = *mbmi;
11907       best_skip2 = 0;
11908       best_mode_skippable = skippable;
11909     }
11910   }
11911 PALETTE_EXIT:
11912 
11913 #if CONFIG_FILTER_INTRA
11914   // TODO(huisu): filter-intra is turned off in lossless mode for now to
11915   // avoid a unit test failure
11916   if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 &&
11917       !dc_skipped && best_mode_index >= 0 &&
11918       best_intra_rd < (best_rd + (best_rd >> 3))) {
11919     pick_filter_intra_interframe(
11920         cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
11921         dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
11922 #if CONFIG_EXT_INTRA
11923         uv_angle_delta,
11924 #endif  // CONFIG_EXT_INTRA
11925         pmi_uv, palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd,
11926         &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable,
11927 #if CONFIG_SUPERTX
11928         returnrate_nocoef,
11929 #endif  // CONFIG_SUPERTX
11930         best_pred_rd, &best_mbmode, rd_cost);
11931   }
11932 #endif  // CONFIG_FILTER_INTRA
11933 
// The inter modes' rate costs are not calculated precisely in some cases.
// As a result, NEWMV is sometimes chosen instead of NEARESTMV, NEARMV or
// ZEROMV even though its motion vector is identical to one of theirs. The
// checks below detect those cases and correct the mode decision.
11938 #if CONFIG_COMPOUND_SINGLEREF
// NOTE: For SR_NEW_NEWMV, no check is needed, as the two mvs from the same
//       ref are guaranteed to differ from each other.
11941 #endif  // CONFIG_COMPOUND_SINGLEREF
11942   if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) {
11943     const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
11944                                          best_mbmode.ref_frame[1] };
11945     int comp_pred_mode = refs[1] > INTRA_FRAME;
11946     int_mv zeromv[2];
11947     const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
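    // Under global motion, the "zero" motion vector is the one implied by
    // the reference frame's global motion model, not necessarily (0, 0).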
11948 #if CONFIG_GLOBAL_MOTION
11949     zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
11950                                             cm->allow_high_precision_mv, bsize,
11951                                             mi_col, mi_row, 0
11952 #if CONFIG_AMVR
11953                                             ,
11954                                             cm->cur_frame_mv_precision_level
11955 #endif
11956                                             )
11957                            .as_int;
11958     zeromv[1].as_int =
11959         comp_pred_mode
11960             ? gm_get_motion_vector(&cm->global_motion[refs[1]],
11961                                    cm->allow_high_precision_mv, bsize, mi_col,
11962                                    mi_row, 0
11963 #if CONFIG_AMVR
11964                                    ,
11965                                    cm->cur_frame_mv_precision_level
11966 #endif
11967                                    )
11968                   .as_int
11969             : 0;
11970 #else
11971     zeromv[0].as_int = 0;
11972     zeromv[1].as_int = 0;
11973 #endif  // CONFIG_GLOBAL_MOTION
11974     if (!comp_pred_mode) {
11975       int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
11976                         ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
11977                         : INT_MAX;
11978 
11979       for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
11980         int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
11981         if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
11982           best_mbmode.mode = NEARMV;
11983           best_mbmode.ref_mv_idx = i;
11984         }
11985       }
11986 
11987       if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
11988         best_mbmode.mode = NEARESTMV;
11989       else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
11990         best_mbmode.mode = ZEROMV;
11991     } else {
11992       int_mv nearestmv[2];
11993       int_mv nearmv[2];
11994 
11995       if (mbmi_ext->ref_mv_count[rf_type] > 1) {
11996         nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
11997         nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
11998       } else {
11999         nearmv[0] = frame_mv[NEARMV][refs[0]];
12000         nearmv[1] = frame_mv[NEARMV][refs[1]];
12001       }
12002       if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
12003         nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
12004         nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
12005       } else {
12006         nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
12007         nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
12008       }
12009 
12010       if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
12011           nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
12012         best_mbmode.mode = NEAREST_NEARESTMV;
12013       } else {
12014         int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
12015                           ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
12016                           : INT_MAX;
12017 
12018         for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
12019           nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
12020           nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
12021 
12022           // Try switching to the NEAR_NEARMV mode
12023           if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
12024               nearmv[1].as_int == best_mbmode.mv[1].as_int) {
12025             best_mbmode.mode = NEAR_NEARMV;
12026             best_mbmode.ref_mv_idx = i;
12027           }
12028         }
12029 
12030         if (best_mbmode.mode == NEW_NEWMV &&
12031             best_mbmode.mv[0].as_int == zeromv[0].as_int &&
12032             best_mbmode.mv[1].as_int == zeromv[1].as_int)
12033           best_mbmode.mode = ZERO_ZEROMV;
12034       }
12035     }
12036   }
12037 
12038   // Make sure that the ref_mv_idx is only nonzero when we're
12039   // using a mode which can support ref_mv_idx
12040   if (best_mbmode.ref_mv_idx != 0 &&
12041 #if CONFIG_COMPOUND_SINGLEREF
12042       !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV ||
12043         best_mbmode.mode == NEW_NEWMV ||
12044         have_nearmv_in_inter_mode(best_mbmode.mode)))
12045 #else   // !CONFIG_COMPOUND_SINGLEREF
12046       !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
12047         have_nearmv_in_inter_mode(best_mbmode.mode)))
12048 #endif  // CONFIG_COMPOUND_SINGLEREF
12049   {
12050     best_mbmode.ref_mv_idx = 0;
12051   }
12052 
12053   if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
12054       best_mbmode.ref_frame[1] <= INTRA_FRAME) {
12055     int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
12056     int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
12057     if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
12058       int_mv zeromv;
12059 #if CONFIG_GLOBAL_MOTION
12060       const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0];
12061       zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref],
12062                                            cm->allow_high_precision_mv, bsize,
12063                                            mi_col, mi_row, 0
12064 #if CONFIG_AMVR
12065                                            ,
12066                                            cm->cur_frame_mv_precision_level
12067 #endif
12068                                            )
12069                           .as_int;
12070 #else
12071       zeromv.as_int = 0;
12072 #endif  // CONFIG_GLOBAL_MOTION
12073       if (best_mbmode.mv[0].as_int == zeromv.as_int) {
12074         best_mbmode.mode = ZEROMV;
12075       }
12076     }
12077   }
12078 
12079   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
12080     rd_cost->rate = INT_MAX;
12081     rd_cost->rdcost = INT64_MAX;
12082     return;
12083   }
12084 
12085   assert((cm->interp_filter == SWITCHABLE) ||
12086          (cm->interp_filter ==
12087           av1_extract_interp_filter(best_mbmode.interp_filters, 0)) ||
12088          !is_inter_block(&best_mbmode));
12089 #if CONFIG_DUAL_FILTER
12090   assert((cm->interp_filter == SWITCHABLE) ||
12091          (cm->interp_filter ==
12092           av1_extract_interp_filter(best_mbmode.interp_filters, 1)) ||
12093          !is_inter_block(&best_mbmode));
12094 #endif  // CONFIG_DUAL_FILTER
12095 
12096   if (!cpi->rc.is_src_frame_alt_ref)
12097     av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
12098                               sf->adaptive_rd_thresh, bsize, best_mode_index);
12099 
  // Commit the winning mode to the macroblock.
12101   *mbmi = best_mbmode;
12102   x->skip |= best_skip2;
12103 
12104 // Note: this section is needed since the mode may have been forced to
12105 // ZEROMV by the all-zero mode handling of ref-mv.
12106 #if CONFIG_GLOBAL_MOTION
12107   if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) {
12108 #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
12109     // Correct the motion mode for ZEROMV
12110     const MOTION_MODE last_motion_mode_allowed =
12111         motion_mode_allowed(0, xd->global_motion,
12112 #if CONFIG_WARPED_MOTION
12113                             xd,
12114 #endif
12115                             xd->mi[0]);
12116     if (mbmi->motion_mode > last_motion_mode_allowed)
12117       mbmi->motion_mode = last_motion_mode_allowed;
12118 #endif  // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
12119 
12120     // Correct the interpolation filter for ZEROMV
12121     if (is_nontrans_global_motion(xd)) {
12122       mbmi->interp_filters = av1_broadcast_interp_filter(
12123           av1_unswitchable_filter(cm->interp_filter));
12124     }
12125   }
12126 #endif  // CONFIG_GLOBAL_MOTION
12127 
12128   for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
12129     if (mbmi->mode != NEWMV)
12130       mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
12131     else
12132       mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
12133   }
12134 
12135   for (i = 0; i < REFERENCE_MODES; ++i) {
12136     if (best_pred_rd[i] == INT64_MAX)
12137       best_pred_diff[i] = INT_MIN;
12138     else
12139       best_pred_diff[i] = best_rd - best_pred_rd[i];
12140   }
12141 
12142   x->skip |= best_mode_skippable;
12143 
12144   assert(best_mode_index >= 0);
12145 
12146   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
12147                        best_mode_skippable);
12148 
12149   if (pmi->palette_size[1] > 0) {
12150     assert(try_palette);
12151     restore_uv_color_map(cpi, x);
12152   }
12153 }
12154 
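// A minimal sketch (kept out of the build) of the single-reference mv-match
// rule used by the NEWMV correction above. The helper name
// example_correct_newmv and its flattened candidate list are hypothetical;
// the code above walks the ref-mv stack in place and additionally handles
// the compound modes.
#if 0
static PREDICTION_MODE example_correct_newmv(int32_t best_mv,
                                             int32_t nearest_mv,
                                             const int32_t *near_mvs,
                                             int num_near, int32_t zero_mv,
                                             int *ref_mv_idx) {
  *ref_mv_idx = 0;
  // A NEWMV that lands exactly on an existing candidate is re-signaled as
  // that candidate, since NEARESTMV/NEARMV/ZEROMV are cheaper to code.
  if (best_mv == nearest_mv) return NEARESTMV;
  for (int i = 0; i < num_near; ++i) {
    if (best_mv == near_mvs[i]) {
      *ref_mv_idx = i;
      return NEARMV;
    }
  }
  if (best_mv == zero_mv) return ZEROMV;
  return NEWMV;  // Genuinely new motion: keep NEWMV.
}
#endif
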
12155 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
12156                                         TileDataEnc *tile_data, MACROBLOCK *x,
12157                                         int mi_row, int mi_col,
12158                                         RD_STATS *rd_cost, BLOCK_SIZE bsize,
12159                                         PICK_MODE_CONTEXT *ctx,
12160                                         int64_t best_rd_so_far) {
12161   const AV1_COMMON *const cm = &cpi->common;
12162   MACROBLOCKD *const xd = &x->e_mbd;
12163   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
12164   unsigned char segment_id = mbmi->segment_id;
12165   const int comp_pred = 0;
12166   int i;
12167   int64_t best_pred_diff[REFERENCE_MODES];
12168   unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
12169 #if CONFIG_EXT_COMP_REFS
12170   unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME];
12171 #else
12172   unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
12173 #endif  // CONFIG_EXT_COMP_REFS
12174   aom_prob comp_mode_p;
12175   InterpFilter best_filter = SWITCHABLE;
12176   int64_t this_rd = INT64_MAX;
12177   int rate2 = 0;
12178   const int64_t distortion2 = 0;
12179   (void)mi_row;
12180   (void)mi_col;
12181 
12182   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
12183                            &comp_mode_p);
12184 
12185   for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
12186   for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
12187     x->pred_mv_sad[i] = INT_MAX;
12188 
12189   rd_cost->rate = INT_MAX;
12190 
12191   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
12192 
12193   mbmi->palette_mode_info.palette_size[0] = 0;
12194   mbmi->palette_mode_info.palette_size[1] = 0;
12195 
12196 #if CONFIG_FILTER_INTRA
12197   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
12198   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
12199 #endif  // CONFIG_FILTER_INTRA
12200   mbmi->mode = ZEROMV;
12201   mbmi->motion_mode = SIMPLE_TRANSLATION;
12202   mbmi->uv_mode = UV_DC_PRED;
12203   mbmi->ref_frame[0] = LAST_FRAME;
12204   mbmi->ref_frame[1] = NONE_FRAME;
12205 #if CONFIG_GLOBAL_MOTION
12206   mbmi->mv[0].as_int =
12207       gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
12208                            cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0
12209 #if CONFIG_AMVR
12210                            ,
12211                            cm->cur_frame_mv_precision_level
12212 #endif
12213                            )
12214           .as_int;
12215 #else   // CONFIG_GLOBAL_MOTION
12216   mbmi->mv[0].as_int = 0;
12217 #endif  // CONFIG_GLOBAL_MOTION
12218   mbmi->tx_size = max_txsize_lookup[bsize];
12219   x->skip = 1;
12220 
12221   mbmi->ref_mv_idx = 0;
12222   mbmi->pred_mv[0].as_int = 0;
12223 #if CONFIG_LGT_FROM_PRED
12224   mbmi->use_lgt = 0;
12225 #endif
12226 
12227   mbmi->motion_mode = SIMPLE_TRANSLATION;
12228 #if CONFIG_MOTION_VAR
12229   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
12230 #endif
12231 #if CONFIG_WARPED_MOTION
12232   if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
12233     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
12234 #if WARPED_MOTION_SORT_SAMPLES
12235     int pts_mv[SAMPLES_ARRAY_SIZE];
12236     mbmi->num_proj_ref[0] =
12237         findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv);
12238     // Rank the samples by motion vector difference
12239     if (mbmi->num_proj_ref[0] > 1)
12240       mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts,
12241                                           pts_inref, mbmi->num_proj_ref[0]);
12242 #else
12243     mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
12244 #endif  // WARPED_MOTION_SORT_SAMPLES
12245   }
12246 #endif
12247 
12248   set_default_interp_filters(mbmi, cm->interp_filter);
12249 
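  // There is no residue to trade off on this path (the block is always
  // coded as skip), so when the frame filter is switchable the best choice
  // is simply the filter that is cheapest to signal.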
12250   if (cm->interp_filter != SWITCHABLE) {
12251     best_filter = cm->interp_filter;
12252   } else {
12253     best_filter = EIGHTTAP_REGULAR;
12254     if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
12255         x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
12256       int rs;
12257       int best_rs = INT_MAX;
12258       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
12259         mbmi->interp_filters = av1_broadcast_interp_filter(i);
12260         rs = av1_get_switchable_rate(cm, x, xd);
12261         if (rs < best_rs) {
12262           best_rs = rs;
12263           best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
12264         }
12265       }
12266     }
12267   }
12268   // Set the appropriate filter
12269   mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
12270   rate2 += av1_get_switchable_rate(cm, x, xd);
12271 
12272   if (cm->reference_mode == REFERENCE_MODE_SELECT)
12273     rate2 += av1_cost_bit(comp_mode_p, comp_pred);
12274 
12275   // Estimate the reference frame signaling cost and add it
12276   // to the rolling cost variable.
12277   rate2 += ref_costs_single[LAST_FRAME];
12278   this_rd = RDCOST(x->rdmult, rate2, distortion2);
12279 
12280   rd_cost->rate = rate2;
12281   rd_cost->dist = distortion2;
12282   rd_cost->rdcost = this_rd;
12283 
12284   if (this_rd >= best_rd_so_far) {
12285     rd_cost->rate = INT_MAX;
12286     rd_cost->rdcost = INT64_MAX;
12287     return;
12288   }
12289 
12290   assert((cm->interp_filter == SWITCHABLE) ||
12291          (cm->interp_filter ==
12292           av1_extract_interp_filter(mbmi->interp_filters, 0)));
12293 
12294   av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
12295                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
12296 
12297   av1_zero(best_pred_diff);
12298 
12299   store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
12300 }
12301 
12302 #if CONFIG_MOTION_VAR
12303 
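// Context passed to the per-neighbor callbacks below: the macroblock being
// predicted, the neighbor's prediction buffer (tmp, tmp_stride), and the
// OBMC overlap length in pixels.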
12304 struct calc_target_weighted_pred_ctxt {
12305   const MACROBLOCK *x;
12306   const uint8_t *tmp;
12307   int tmp_stride;
12308   int overlap;
12309 };
12310 
12311 static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
12312                                                    int rel_mi_col,
12313                                                    uint8_t nb_mi_width,
12314                                                    MODE_INFO *nb_mi,
12315                                                    void *fun_ctxt) {
12316   (void)nb_mi;
12317 
12318   struct calc_target_weighted_pred_ctxt *ctxt =
12319       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
12320 
12321 #if CONFIG_HIGHBITDEPTH
12322   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
12323 #else
12324   const int is_hbd = 0;
12325 #endif  // CONFIG_HIGHBITDEPTH
12326 
12327   const int bw = xd->n8_w << MI_SIZE_LOG2;
12328   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
12329 
12330   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
12331   int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
12332   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
12333 
12334   if (!is_hbd) {
12335     for (int row = 0; row < ctxt->overlap; ++row) {
12336       const uint8_t m0 = mask1d[row];
12337       const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
12338       for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
12339         wsrc[col] = m1 * tmp[col];
12340         mask[col] = m0;
12341       }
12342       wsrc += bw;
12343       mask += bw;
12344       tmp += ctxt->tmp_stride;
12345     }
12346 #if CONFIG_HIGHBITDEPTH
12347   } else {
12348     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
12349 
12350     for (int row = 0; row < ctxt->overlap; ++row) {
12351       const uint8_t m0 = mask1d[row];
12352       const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
12353       for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
12354         wsrc[col] = m1 * tmp16[col];
12355         mask[col] = m0;
12356       }
12357       wsrc += bw;
12358       mask += bw;
12359       tmp16 += ctxt->tmp_stride;
12360     }
12361 #endif  // CONFIG_HIGHBITDEPTH
12362   }
12363 }
12364 
12365 static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
12366                                                   int rel_mi_row,
12367                                                   uint8_t nb_mi_height,
12368                                                   MODE_INFO *nb_mi,
12369                                                   void *fun_ctxt) {
12370   (void)nb_mi;
12371 
12372   struct calc_target_weighted_pred_ctxt *ctxt =
12373       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
12374 
12375 #if CONFIG_HIGHBITDEPTH
12376   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
12377 #else
12378   const int is_hbd = 0;
12379 #endif  // CONFIG_HIGHBITDEPTH
12380 
12381   const int bw = xd->n8_w << MI_SIZE_LOG2;
12382   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
12383 
12384   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
12385   int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
12386   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
12387 
12388   if (!is_hbd) {
12389     for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
12390       for (int col = 0; col < ctxt->overlap; ++col) {
12391         const uint8_t m0 = mask1d[col];
12392         const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
12393         wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
12394                     (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
12395         mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
12396       }
12397       wsrc += bw;
12398       mask += bw;
12399       tmp += ctxt->tmp_stride;
12400     }
12401 #if CONFIG_HIGHBITDEPTH
12402   } else {
12403     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
12404 
12405     for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
12406       for (int col = 0; col < ctxt->overlap; ++col) {
12407         const uint8_t m0 = mask1d[col];
12408         const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
12409         wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
12410                     (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
12411         mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
12412       }
12413       wsrc += bw;
12414       mask += bw;
12415       tmp16 += ctxt->tmp_stride;
12416     }
12417 #endif  // CONFIG_HIGHBITDEPTH
12418   }
12419 }
12420 
12421 // This function has a structure similar to av1_build_obmc_inter_prediction
12422 //
// The OBMC predictor is computed as:
//
//  Pobmc(x,y) =
//    AOM_BLEND_A64(Mh(x),
//                  AOM_BLEND_A64(Mv(y), P(x,y), Pabove(x,y)),
//                  PLeft(x, y))
//
// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
// rounding, this can be written as:
//
//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
//    Mh(x) * Mv(y) * P(x,y) +
//      Mh(x) * Cv(y) * Pabove(x,y) +
//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
//
// where:
//
//  Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
//
// This function computes 'wsrc' and 'mask' as:
//
//  wsrc(x, y) =
//    AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
//      (Mh(x) * Cv(y) * Pabove(x,y) +
//       AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y))
//
//  mask(x, y) = Mh(x) * Mv(y)
//
// These can then be used to efficiently approximate the error for any
// predictor P in the context of the provided neighbouring predictors by
// computing:
//
//  error(x, y) =
//    (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
12458 //
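// As a worked instance (with AOM_BLEND_A64_MAX_ALPHA = 64, its value in
// aom_dsp/blend.h): take Mh(x) = 48 and Mv(y) = 40, so Ch(x) = 16 and
// Cv(y) = 24. The three blend weights are then 48 * 40 = 1920 (current),
// 48 * 24 = 1152 (above) and 64 * 16 = 1024 (left), which sum to
// 64 * 64 = 4096, and
//
//  wsrc(x, y) - mask(x, y) * P(x, y) =
//    4096 * src(x, y) - 1152 * Pabove(x, y) - 1024 * PLeft(x, y)
//      - 1920 * P(x, y)
//    = 4096 * (src(x, y) - Pobmc(x, y))
//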
12459 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
12460                                       const MACROBLOCKD *xd, int mi_row,
12461                                       int mi_col, const uint8_t *above,
12462                                       int above_stride, const uint8_t *left,
12463                                       int left_stride) {
12464   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
12465   const int bw = xd->n8_w << MI_SIZE_LOG2;
12466   const int bh = xd->n8_h << MI_SIZE_LOG2;
12467   int32_t *mask_buf = x->mask_buf;
12468   int32_t *wsrc_buf = x->wsrc_buf;
12469 
12470   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
12471 #if CONFIG_HIGHBITDEPTH
12472   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
12473 #else
12474   const int is_hbd = 0;
12475 #endif  // CONFIG_HIGHBITDEPTH
12476 
12477   // plane 0 should not be subsampled
12478   assert(xd->plane[0].subsampling_x == 0);
12479   assert(xd->plane[0].subsampling_y == 0);
12480 
12481   av1_zero_array(wsrc_buf, bw * bh);
12482   for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
12483 
12484   // handle above row
12485   if (xd->up_available) {
12486     const int overlap =
12487         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
12488     struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
12489                                                    overlap };
12490     foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
12491                                   max_neighbor_obmc[b_width_log2_lookup[bsize]],
12492                                   calc_target_weighted_pred_above, &ctxt);
12493   }
12494 
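  // After the above-row pass, wsrc holds Cv(y) * Pabove and mask holds
  // Mv(y); scaling both by AOM_BLEND_A64_MAX_ALPHA lets the left-column
  // pass fold in Mh(x) and Ch(x) to reach the expressions documented above.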
12495   for (int i = 0; i < bw * bh; ++i) {
12496     wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
12497     mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
12498   }
12499 
12500   // handle left column
12501   if (xd->left_available) {
12502     const int overlap =
12503         AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
12504     struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
12505                                                    overlap };
12506     foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
12507                                  max_neighbor_obmc[b_height_log2_lookup[bsize]],
12508                                  calc_target_weighted_pred_left, &ctxt);
12509   }
12510 
12511   if (!is_hbd) {
12512     const uint8_t *src = x->plane[0].src.buf;
12513 
12514     for (int row = 0; row < bh; ++row) {
12515       for (int col = 0; col < bw; ++col) {
12516         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
12517       }
12518       wsrc_buf += bw;
12519       src += x->plane[0].src.stride;
12520     }
12521 #if CONFIG_HIGHBITDEPTH
12522   } else {
12523     const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
12524 
12525     for (int row = 0; row < bh; ++row) {
12526       for (int col = 0; col < bw; ++col) {
12527         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
12528       }
12529       wsrc_buf += bw;
12530       src += x->plane[0].src.stride;
12531     }
12532 #endif  // CONFIG_HIGHBITDEPTH
12533   }
12534 }
12535 
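// A minimal sketch (kept out of the build) of how the wsrc/mask buffers
// filled above can be consumed. The helper name example_obmc_sse and its
// signature are hypothetical; it stands in for whatever optimized kernel
// evaluates a candidate predictor against these buffers.
#if 0
static int64_t example_obmc_sse(const int32_t *wsrc, const int32_t *mask,
                                const uint8_t *pred, int pred_stride, int bw,
                                int bh) {
  int64_t sse = 0;
  for (int row = 0; row < bh; ++row) {
    for (int col = 0; col < bw; ++col) {
      // Per pixel, wsrc - mask * pred is the prediction residual scaled by
      // AOM_BLEND_A64_MAX_ALPHA ** 2.
      const int64_t diff =
          (int64_t)wsrc[col] - (int64_t)mask[col] * pred[col];
      sse += diff * diff;
    }
    wsrc += bw;
    mask += bw;
    pred += pred_stride;
  }
  // Undo the AOM_BLEND_A64_MAX_ALPHA ** 4 factor carried by diff * diff.
  const int64_t scale = (int64_t)AOM_BLEND_A64_MAX_ALPHA *
                        AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA *
                        AOM_BLEND_A64_MAX_ALPHA;
  return sse / scale;
}
#endif
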
12536 #if CONFIG_NCOBMC
12537 void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
12538                          int mi_row, int mi_col) {
12539   const AV1_COMMON *const cm = &cpi->common;
12540   MACROBLOCKD *const xd = &x->e_mbd;
12541   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
12542   MB_MODE_INFO backup_mbmi;
12543   BLOCK_SIZE bsize = mbmi->sb_type;
12544   int ref, skip_blk, backup_skip = x->skip;
12545   int64_t rd_causal;
12546   RD_STATS rd_stats_y, rd_stats_uv;
12547   int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
12548   int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
12549 
12550   // Recompute the best causal predictor and rd
12551   mbmi->motion_mode = SIMPLE_TRANSLATION;
12552   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
12553   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
12554     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
12555     assert(cfg != NULL);
12556     av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
12557                          &xd->block_refs[ref]->sf);
12558   }
12559   av1_setup_dst_planes(x->e_mbd.plane, bsize,
12560                        get_frame_new_buffer(&cpi->common), mi_row, mi_col);
12561 
12562   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
12563 
12564   av1_subtract_plane(x, bsize, 0);
12565 #if CONFIG_VAR_TX
12566   if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
12567     select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12568   } else {
12569     int idx, idy;
12570     super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12571     for (idy = 0; idy < xd->n8_h; ++idy)
12572       for (idx = 0; idx < xd->n8_w; ++idx)
12573         mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
12574     memset(x->blk_skip[0], rd_stats_y.skip,
12575            sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
12576   }
12577   inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12578 #else
12579   super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12580   super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12581 #endif
12582   assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
12583   if (rd_stats_y.skip && rd_stats_uv.skip) {
12584     rd_stats_y.rate = rate_skip1;
12585     rd_stats_uv.rate = 0;
12586     rd_stats_y.dist = rd_stats_y.sse;
12587     rd_stats_uv.dist = rd_stats_uv.sse;
    // Both planes are skippable, so account for the block as skip.
    skip_blk = 1;
12589   } else if (RDCOST(x->rdmult,
12590                     (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
12591                     (rd_stats_y.dist + rd_stats_uv.dist)) >
12592              RDCOST(x->rdmult, rate_skip1,
12593                     (rd_stats_y.sse + rd_stats_uv.sse))) {
12594     rd_stats_y.rate = rate_skip1;
12595     rd_stats_uv.rate = 0;
12596     rd_stats_y.dist = rd_stats_y.sse;
12597     rd_stats_uv.dist = rd_stats_uv.sse;
12598     skip_blk = 1;
12599   } else {
12600     rd_stats_y.rate += rate_skip0;
12601     skip_blk = 0;
12602   }
12603   backup_skip = skip_blk;
12604   backup_mbmi = *mbmi;
12605   rd_causal = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate),
12606                      (rd_stats_y.dist + rd_stats_uv.dist));
12607   rd_causal +=
12608       RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0);
12609 
12610   // Check non-causal mode
12611   mbmi->motion_mode = OBMC_CAUSAL;
12612   av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
12613 
12614   av1_subtract_plane(x, bsize, 0);
12615 #if CONFIG_VAR_TX
12616   if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
12617     select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12618   } else {
12619     int idx, idy;
12620     super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12621     for (idy = 0; idy < xd->n8_h; ++idy)
12622       for (idx = 0; idx < xd->n8_w; ++idx)
12623         mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
12624     memset(x->blk_skip[0], rd_stats_y.skip,
12625            sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
12626   }
12627   inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12628 #else
12629   super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12630   super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12631 #endif
12632   assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
12633   if (rd_stats_y.skip && rd_stats_uv.skip) {
12634     rd_stats_y.rate = rate_skip1;
12635     rd_stats_uv.rate = 0;
12636     rd_stats_y.dist = rd_stats_y.sse;
12637     rd_stats_uv.dist = rd_stats_uv.sse;
    // Both planes are skippable, so account for the block as skip.
    skip_blk = 1;
12639   } else if (RDCOST(x->rdmult,
12640                     (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
12641                     (rd_stats_y.dist + rd_stats_uv.dist)) >
12642              RDCOST(x->rdmult, rate_skip1,
12643                     (rd_stats_y.sse + rd_stats_uv.sse))) {
12644     rd_stats_y.rate = rate_skip1;
12645     rd_stats_uv.rate = 0;
12646     rd_stats_y.dist = rd_stats_y.sse;
12647     rd_stats_uv.dist = rd_stats_uv.sse;
12648     skip_blk = 1;
12649   } else {
12650     rd_stats_y.rate += rate_skip0;
12651     skip_blk = 0;
12652   }
12653 
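  // Keep the OBMC prediction only if it beats the causal one once the cost
  // of signaling the motion mode is charged to each side.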
12654   if (rd_causal >
12655       RDCOST(x->rdmult,
12656              rd_stats_y.rate + rd_stats_uv.rate +
12657                  av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
12658              (rd_stats_y.dist + rd_stats_uv.dist))) {
12659     x->skip = skip_blk;
12660   } else {
12661     *mbmi = backup_mbmi;
12662     x->skip = backup_skip;
12663   }
12664 }
12665 #endif  // CONFIG_NCOBMC
12666 
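// Builds the prediction for the motion mode currently set in mbmi, measures
// its rate-distortion cost including the motion-mode signaling cost, and
// reports the resulting skip decision and (optionally) a snapshot of mbmi.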
12667 int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x,
12668                                int mi_row, int mi_col, int *skip_blk,
12669                                MB_MODE_INFO *backup_mbmi) {
12670   const AV1_COMMON *const cm = &cpi->common;
12671   MACROBLOCKD *const xd = &x->e_mbd;
12672   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
12673   BLOCK_SIZE bsize = mbmi->sb_type;
12674 #if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12675   const MOTION_MODE motion_allowed = motion_mode_allowed(
12676 #if CONFIG_GLOBAL_MOTION
12677       0, xd->global_motion,
12678 #endif  // CONFIG_GLOBAL_MOTION
12679 #if CONFIG_WARPED_MOTION
12680       xd,
12681 #endif
12682       xd->mi[0]);
12683 #endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12684   RD_STATS rd_stats_y, rd_stats_uv;
12685   int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
12686   int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
12687   int64_t this_rd;
12688   int ref;
12689 
12690 #if CONFIG_CB4X4
12691   x->skip_chroma_rd =
12692       !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
12693                            xd->plane[1].subsampling_y);
12694 #endif
12695 
12696   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
12697   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
12698     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
12699     assert(cfg != NULL);
12700     av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
12701                          &xd->block_refs[ref]->sf);
12702   }
12703   av1_setup_dst_planes(x->e_mbd.plane, bsize,
12704                        get_frame_new_buffer(&cpi->common), mi_row, mi_col);
12705 
12706 #if CONFIG_NCOBMC_ADAPT_WEIGHT
12707   if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT)
12708 #endif
12709     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
12710 
12711 #if CONFIG_MOTION_VAR
12712   if (mbmi->motion_mode == OBMC_CAUSAL) {
12713 #if CONFIG_NCOBMC
12714     av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
12715 #else
12716     av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
12717 #endif
12718   }
12719 #endif  // CONFIG_MOTION_VAR
12720 
12721 #if CONFIG_NCOBMC_ADAPT_WEIGHT
12722   if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT)
12723     for (int plane = 0; plane < MAX_MB_PLANE; ++plane)
12724       get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane);
12725 #endif
12726   av1_subtract_plane(x, bsize, 0);
12727 
12728 #if CONFIG_VAR_TX
12729   if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
12730     select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12731   } else {
12732     int idx, idy;
12733     super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12734     for (idy = 0; idy < xd->n8_h; ++idy)
12735       for (idx = 0; idx < xd->n8_w; ++idx)
12736         mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
12737     memset(x->blk_skip[0], rd_stats_y.skip,
12738            sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
12739   }
12740   inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12741 #else
12742   super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
12743   super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
12744 #endif
12745   assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
12746 
12747   if (rd_stats_y.skip && rd_stats_uv.skip) {
12748     rd_stats_y.rate = rate_skip1;
12749     rd_stats_uv.rate = 0;
12750     rd_stats_y.dist = rd_stats_y.sse;
12751     rd_stats_uv.dist = rd_stats_uv.sse;
12752     *skip_blk = 1;
12753   } else if (RDCOST(x->rdmult,
12754                     (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
12755                     (rd_stats_y.dist + rd_stats_uv.dist)) >
12756              RDCOST(x->rdmult, rate_skip1,
12757                     (rd_stats_y.sse + rd_stats_uv.sse))) {
12758     rd_stats_y.rate = rate_skip1;
12759     rd_stats_uv.rate = 0;
12760     rd_stats_y.dist = rd_stats_y.sse;
12761     rd_stats_uv.dist = rd_stats_uv.sse;
12762     *skip_blk = 1;
12763   } else {
12764     rd_stats_y.rate += rate_skip0;
12765     *skip_blk = 0;
12766   }
12767 
12768   if (backup_mbmi) *backup_mbmi = *mbmi;
12769 
12770   this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate),
12771                    (rd_stats_y.dist + rd_stats_uv.dist));
12772 #if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12773   if (motion_allowed == NCOBMC_ADAPT_WEIGHT) {
12774     assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT);
12775     this_rd +=
12776         RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0);
12777   } else if (motion_allowed == OBMC_CAUSAL) {
12778     assert(mbmi->motion_mode <= OBMC_CAUSAL);
12779     this_rd +=
12780         RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0);
12781   } else {
12782 #endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12783     this_rd +=
12784         RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0);
12785 #if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12786   }
12787 #endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
12788   return this_rd;
12789 }
12790 
12791 #if CONFIG_NCOBMC_ADAPT_WEIGHT
12792 void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi,
12793                                       struct macroblock *x, int mi_row,
12794                                       int mi_col) {
12795   MACROBLOCKD *const xd = &x->e_mbd;
12796   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
12797   BLOCK_SIZE bsize = mbmi->sb_type;
12798 #if CONFIG_VAR_TX
12799   const int n4 = bsize_to_num_blk(bsize);
12800   uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
12801   uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
12802   uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
12803 #endif
12804   MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi;
12805   int st_skip, obmc_skip, ncobmc_skip;
12806   int64_t st_rd, obmc_rd, ncobmc_rd;
12807 #if CONFIG_WARPED_MOTION
12808   const AV1_COMMON *const cm = &cpi->common;
12809   const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL;
12810   const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0);
12811   MB_MODE_INFO warp_mbmi;
12812   int64_t warp_rd;
12813   int warp_skip;
12814 #endif
12815 
  // Recompute the rd for the motion mode decided in the rd loop
12817   mbmi->motion_mode = SIMPLE_TRANSLATION;
12818   st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi);
12819 #if CONFIG_WARPED_MOTION
12820   st_rd += rs;
12821 #endif
12822 #if CONFIG_VAR_TX
12823   memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4);
12824 #endif
12825 
12826   mbmi->motion_mode = OBMC_CAUSAL;
12827   obmc_rd =
12828       get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi);
12829 #if CONFIG_WARPED_MOTION
12830   obmc_rd += rs;
12831 #endif
12832 #if CONFIG_VAR_TX
12833   memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4);
12834 #endif
12835 
12836   // Compute the rd cost for ncobmc adaptive weight
12837   mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT;
12838   ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip,
12839                                      &ncobmc_mbmi);
12840 #if CONFIG_WARPED_MOTION
12841   ncobmc_rd += rs;
12842 #endif
12843   // Calculate the ncobmc mode costs
12844   {
12845     ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize];
12846     ncobmc_rd +=
12847         RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0);
12848     if (mi_size_wide[bsize] != mi_size_high[bsize])
12849       ncobmc_rd +=
12850           RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0);
12851   }
12852 #if CONFIG_VAR_TX
12853   memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4);
12854 #endif
12855 
12856 #if CONFIG_WARPED_MOTION
12857   if (is_warp_motion) {
12858     mbmi->motion_mode = WARPED_CAUSAL;
12859     warp_rd =
12860         get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi);
12861   } else {
12862     warp_rd = INT64_MAX;
12863   }
12864 #endif
12865 
12866 #if CONFIG_WARPED_MOTION
12867   if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) {
12868     if (ncobmc_rd < warp_rd) {
12869       x->skip = ncobmc_skip;
12870       *mbmi = ncobmc_mbmi;
12871 #if CONFIG_VAR_TX
12872       memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
12873 #endif
12874     } else {
12875       x->skip = warp_skip;
12876       *mbmi = warp_mbmi;
12877     }
12878 #else
12879   if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) {
12880     x->skip = ncobmc_skip;
12881     *mbmi = ncobmc_mbmi;
12882 #if CONFIG_VAR_TX
12883     memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
12884 #endif
12885 #endif  // CONFIG_WARPED_MOTION
12886   } else {
12887     if (obmc_rd < st_rd) {
12888       *mbmi = obmc_mbmi;
12889       x->skip = obmc_skip;
12890 #if CONFIG_VAR_TX
12891       memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4);
12892 #endif
12893     } else {
12894       *mbmi = st_mbmi;
12895       x->skip = st_skip;
12896 #if CONFIG_VAR_TX
12897       memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4);
12898 #endif
12899     }
12900   }
12901 }
12902 
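// Returns the sum of squared differences between the interpolated NCOBMC
// prediction held in xd->ncobmc_pred_buf and the source block, clipped to
// the superblock boundary.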
12903 int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col,
12904                          BLOCK_SIZE bsize, int plane, struct buf_2d *src) {
12905   const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE,
12906                           (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col);
12907   const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE,
12908                           (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row);
12909   const int ss_x = xd->plane[plane].subsampling_x;
12910   const int ss_y = xd->plane[plane].subsampling_y;
12911   int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y;
12912   int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x;
12913   int dst_stride = xd->ncobmc_pred_buf_stride[plane];
12914   int dst_offset = row_offset * dst_stride + col_offset;
12915   int src_stride = src->stride;
12916 
12917   int r, c;
12918   int64_t tmp, error = 0;
12919 
12920   for (r = 0; r < (high >> ss_y); ++r) {
12921     for (c = 0; c < (wide >> ss_x); ++c) {
12922       tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] -
12923             src->buf[r * src_stride + c];
12924       error += tmp * tmp;
12925     }
12926   }
12927   return error;
12928 }
12929 
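// Evaluates every NCOBMC interpolation mode against the source, returns the
// one with the smallest prediction error, and leaves that mode's prediction
// in the interpolation buffer.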
12930 int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
12931                     MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) {
12932   const AV1_COMMON *const cm = &cpi->common;
12933   uint8_t *pred_buf[4][MAX_MB_PLANE];
12934 
12935   // TODO(weitinglin): stride size needs to be fixed for high-bit depth
12936   int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
12937 
  // Top-left corner of the target block, in pixels
12939   int pxl_row = mi_row << MI_SIZE_LOG2;
12940   int pxl_col = mi_col << MI_SIZE_LOG2;
12941   int64_t error, best_error = INT64_MAX;
12942   int plane, tmp_mode, best_mode = 0;
12943 #if CONFIG_HIGHBITDEPTH
12944   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
12945     int len = sizeof(uint16_t);
12946     ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE,
12947                             len);
12948     ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE,
12949                             len);
12950     ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE,
12951                             len);
12952     ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE,
12953                             len);
12954   } else {
12955 #endif  // CONFIG_HIGHBITDEPTH
12956     ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE);
12957     ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE);
12958     ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE);
12959     ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE);
12960 #if CONFIG_HIGHBITDEPTH
12961   }
12962 #endif
12963 
12964   av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride);
12965   av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride);
12966 
12967   for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) {
12968     error = 0;
12969     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
12970       build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
12971                                pred_stride, tmp_mode);
12972       error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane,
12973                                 &x->plane[plane].src);
12974     }
12975     if (error < best_error) {
12976       best_mode = tmp_mode;
12977       best_error = error;
12978     }
12979   }
12980 
12981   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
12982     build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
12983                              pred_stride, best_mode);
12984   }
12985 
12986   return best_mode;
12987 }
12988 
12989 #endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
12990 #endif  // CONFIG_MOTION_VAR
12991