1 /*
2 * VP9 compatible video decoder
3 *
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "avcodec.h"
25 #include "get_bits.h"
26
27 #include "internal.h"
28
29 #include "thread.h"
30 #include "videodsp.h"
31 #include "vp56.h"
32 #include "vp9.h"
33 #include "vp9data.h"
34 #include "vp9dsp.h"
35 #include "libavutil/avassert.h"
36
// 24-bit sync code found in keyframe and intra-only frame headers
#define VP9_SYNCCODE 0x498342
38
// frame-level compound prediction mode, signaled in the frame header
enum CompPredMode {
    PRED_SINGLEREF,   // all inter blocks use a single reference
    PRED_COMPREF,     // all inter blocks use compound (two-reference) prediction
    PRED_SWITCHABLE,  // per-block choice, coded with s->prob.p.comp[]
};
44
// partition recursion depth, from the 64x64 superblock down to 8x8
enum BlockLevel {
    BL_64X64,
    BL_32X32,
    BL_16X16,
    BL_8X8,
};
51
// coded block dimensions (width x height), ordered largest to smallest;
// used as index into bwh_tab[] and mv_ref_blk_off[]
enum BlockSize {
    BS_64x64,
    BS_64x32,
    BS_32x64,
    BS_32x32,
    BS_32x16,
    BS_16x32,
    BS_16x16,
    BS_16x8,
    BS_8x16,
    BS_8x8,
    BS_8x4,
    BS_4x8,
    BS_4x4,
    N_BS_SIZES,
};
68
// per-block motion vectors and reference indices; two slots because a
// block may use compound (two-reference) prediction
struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};
73
// a decoded frame plus its side data; segmentation_map and mv both point
// into the single extradata buffer (see vp9_alloc_frame), with one entry
// per 8x8 block (64 entries per 64x64 superblock)
typedef struct VP9Frame {
    ThreadFrame tf;            // the pixel buffer (frame-threading aware)
    AVBufferRef *extradata;    // backing store for the two pointers below
    uint8_t *segmentation_map;
    struct VP9mvrefPair *mv;
} VP9Frame;
80
// per-superblock loop filter state: an 8x8 grid of filter levels (one per
// 8x8 block) and edge masks for the different transform sizes
struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};
86
// decode state for the block currently being processed: coding modes,
// reference selection, motion vectors and partition info
typedef struct VP9Block {
    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
    enum FilterMode filter;
    VP56mv mv[4 /* b_idx */][2 /* ref */];
    enum BlockSize bs;
    enum TxfmMode tx, uvtx;
    enum BlockLevel bl;
    enum BlockPartition bp;
} VP9Block;
96
/**
 * Persistent VP9 decoder state; one instance per AVCodecContext,
 * stored in ctx->priv_data.
 */
typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;      // reader for the uncompressed frame header
    VP56RangeCoder c;      // range coder for the compressed header
    VP56RangeCoder *c_b;   // one range coder per tile column
    unsigned c_b_size;     // allocated size of c_b, for av_fast_realloc()
    VP9Block *b_base, *b;  // block array base and current block pointer
    int pass, uses_2pass, last_uses_2pass;
    // current block position; row7/col7 presumably the sub-superblock
    // (low 3 bit) coordinates — TODO confirm against the block decode code
    int row, row7, col, col7;
    uint8_t *dst[3];       // output pointers for the current block (Y, U, V)
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t profile;
    uint8_t keyframe, last_keyframe;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;          // error-resilient mode flag
    uint8_t colorspace;
    uint8_t fullrange;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;    // bitmask of the 8 reference slots to refresh
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;    // set when reference sign biases differ
    uint8_t fixcompref;        // fixed reference for compound prediction
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;
    uint8_t refidx[3];         // active reference slots (last/golden/altref order — TODO confirm)
    uint8_t signbias[3];
    uint8_t varcompref[2];     // the two variable references for compound prediction
    ThreadFrame refs[8], next_refs[8];
#define CUR_FRAME 0
#define LAST_FRAME 1
    VP9Frame frames[2];

    struct {
        uint8_t level;
        int8_t sharpness;
        // cached limit LUTs; zeroed (invalidated) whenever sharpness changes
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    uint8_t lossless;          // set when yac_qi and all qdeltas are 0
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            // derived per-segment values, filled in while parsing the header
            int16_t qmul[2][2];
            uint8_t lflvl[4][2];
        } feat[8];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    // frame size in 64x64 superblocks (sb_*) and in 8x8 blocks (rows/cols)
    unsigned sb_cols, sb_rows, rows, cols;
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];             // the four saved probability contexts
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;                    // probabilities for the current frame
    // symbol counts, presumably accumulated for backward probability
    // adaptation — TODO confirm. NOTE: decode_frame_header() memsets
    // coef and eob together in one call, relying on eob directly
    // following coef in this layout; do not reorder these fields.
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // the "above" arrays below are carved out of one allocation in
    // update_size(); sizes per superblock column are listed there
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];

    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
} VP9Context;
254
// per-BlockSize { width, height } lookup:
// bwh_tab[0] is in 4-pixel luma units (BS_64x64 -> {16,16}),
// bwh_tab[1] is in 8x8-block units, with sub-8x8 sizes rounded up to 1
static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};
264
/**
 * Allocate a frame buffer plus the side-data buffer holding the
 * segmentation map and mv/ref pairs (one entry per 8x8 block).
 *
 * Returns 0 on success, a negative AVERROR on failure.
 */
static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
{
    VP9Context *s = ctx->priv_data;
    int ret, sz;

    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;
    sz = 64 * s->sb_cols * s->sb_rows; // one entry per 8x8 block
    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
        ff_thread_release_buffer(ctx, &f->tf);
        return AVERROR(ENOMEM);
    }

    f->segmentation_map = f->extradata->data;
    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);

    // retain segmentation map if it doesn't update; guard against the last
    // frame never having been decoded (corrupt streams), in which case its
    // map pointer is still NULL and the map stays zeroed (av_buffer_allocz)
    if (s->segmentation.enabled && !s->segmentation.update_map &&
        !s->intraonly && !s->keyframe && !s->errorres &&
        s->frames[LAST_FRAME].segmentation_map) {
        memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
    }

    return 0;
}
289
// Release a VP9Frame: drop the pixel buffer and the extradata buffer
// that backs segmentation_map/mv (pointers inside f become invalid).
static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
{
    ff_thread_release_buffer(ctx, &f->tf);
    av_buffer_unref(&f->extradata);
}
295
/**
 * Make dst a new reference to src: both the pixel buffer and the
 * extradata (segmentation map / mv) buffer gain a reference.
 *
 * Returns 0 on success, a negative AVERROR on failure (dst is left
 * unreferenced on failure).
 */
static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
{
    int ret = ff_thread_ref_frame(&dst->tf, &src->tf);

    if (ret < 0)
        return ret;

    dst->extradata = av_buffer_ref(src->extradata);
    if (!dst->extradata) {
        // undo the frame reference taken above
        vp9_unref_frame(ctx, dst);
        return AVERROR(ENOMEM);
    }

    dst->segmentation_map = src->segmentation_map;
    dst->mv = src->mv;

    return 0;
}
312
/**
 * (Re)allocate the per-frame-width "above" context buffers when the coded
 * frame size changes. All arrays are carved out of a single allocation.
 *
 * Returns 0 on success (or if the size is unchanged), AVERROR(ENOMEM)
 * on allocation failure.
 */
static int update_size(AVCodecContext *ctx, int w, int h)
{
    VP9Context *s = ctx->priv_data;
    uint8_t *p;

    av_assert0(w > 0 && h > 0);

    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
        return 0;

    ctx->width = w;
    ctx->height = h;
    s->sb_cols = (w + 63) >> 6; // size in 64x64 superblocks
    s->sb_rows = (h + 63) >> 6;
    s->cols = (w + 7) >> 3;     // size in 8x8 blocks
    s->rows = (h + 7) >> 3;

// slice (n) elements of var's type per superblock column off the buffer
#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
    av_freep(&s->intra_pred_data[0]);
    // 240 = 64 + 32 + 32 + 16 + 16 + 10*8, the summed per-sb-col sizes of
    // all the uint8_t arrays assigned below; lflvl and above_mv_ctx have
    // element sizes > 1 and are accounted separately
    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
    if (!p)
        return AVERROR(ENOMEM);
    assign(s->intra_pred_data[0], uint8_t *, 64);
    assign(s->intra_pred_data[1], uint8_t *, 32);
    assign(s->intra_pred_data[2], uint8_t *, 32);
    assign(s->above_y_nnz_ctx, uint8_t *, 16);
    assign(s->above_mode_ctx, uint8_t *, 16);
    assign(s->above_mv_ctx, VP56mv(*)[2], 16);
    assign(s->above_partition_ctx, uint8_t *, 8);
    assign(s->above_skip_ctx, uint8_t *, 8);
    assign(s->above_txfm_ctx, uint8_t *, 8);
    assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
    assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
    assign(s->above_segpred_ctx, uint8_t *, 8);
    assign(s->above_intra_ctx, uint8_t *, 8);
    assign(s->above_comp_ctx, uint8_t *, 8);
    assign(s->above_ref_ctx, uint8_t *, 8);
    assign(s->above_filter_ctx, uint8_t *, 8);
    assign(s->lflvl, struct VP9Filter *, 1);
#undef assign

    // these will be re-allocated a little later
    av_freep(&s->b_base);
    av_freep(&s->block_base);

    return 0;
}
360
/**
 * (Re)allocate the block array and coefficient/eob buffers. In 2-pass
 * (frame-threaded) mode the buffers cover the whole frame; otherwise a
 * single block's worth is enough since blocks are reconstructed in order.
 *
 * Per superblock the coefficient buffer needs (64*64 + 128)*3 bytes:
 * 64*64 int16 luma coefs (8192 B) + 2 * 32*32 int16 chroma coefs (4096 B)
 * + 256 B luma eob flags + 2 * 64 B chroma eob flags.
 *
 * Returns 0 on success, AVERROR(ENOMEM) on allocation failure.
 */
static int update_block_buffers(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;

    // nothing to do if already allocated for the current pass mode
    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
        return 0;

    av_free(s->b_base);
    av_free(s->block_base);
    if (s->uses_2pass) {
        int sbs = s->sb_cols * s->sb_rows;

        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
        s->uveob_base[0] = s->eob_base + 256 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
    } else {
        s->b_base = av_malloc(sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
        s->uveob_base[0] = s->eob_base + 256;
        s->uveob_base[1] = s->uveob_base[0] + 64;
    }
    s->block_alloc_using_2pass = s->uses_2pass;

    return 0;
}
397
398 // for some reason the sign bit is at the end, not the start, of a bit sequence
get_sbits_inv(GetBitContext * gb,int n)399 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
400 {
401 int v = get_bits(gb, n);
402 return get_bits1(gb) ? -v : v;
403 }
404
inv_recenter_nonneg(int v,int m)405 static av_always_inline int inv_recenter_nonneg(int v, int m)
406 {
407 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
408 }
409
410 // differential forward probability updates
update_prob(VP56RangeCoder * c,int p)411 static int update_prob(VP56RangeCoder *c, int p)
412 {
413 static const int inv_map_table[254] = {
414 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
415 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
416 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
417 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
418 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
419 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
420 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
421 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
422 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
423 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
424 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
425 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
426 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
427 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
428 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
429 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
430 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
431 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
432 252, 253,
433 };
434 int d;
435
436 /* This code is trying to do a differential probability update. For a
437 * current probability A in the range [1, 255], the difference to a new
438 * probability of any value can be expressed differentially as 1-A,255-A
439 * where some part of this (absolute range) exists both in positive as
440 * well as the negative part, whereas another part only exists in one
441 * half. We're trying to code this shared part differentially, i.e.
442 * times two where the value of the lowest bit specifies the sign, and
443 * the single part is then coded on top of this. This absolute difference
444 * then again has a value of [0,254], but a bigger value in this range
445 * indicates that we're further away from the original value A, so we
446 * can code this as a VLC code, since higher values are increasingly
447 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
448 * updates vs. the 'fine, exact' updates further down the range, which
449 * adds one extra dimension to this differential update model. */
450
451 if (!vp8_rac_get(c)) {
452 d = vp8_rac_get_uint(c, 4) + 0;
453 } else if (!vp8_rac_get(c)) {
454 d = vp8_rac_get_uint(c, 4) + 16;
455 } else if (!vp8_rac_get(c)) {
456 d = vp8_rac_get_uint(c, 5) + 32;
457 } else {
458 d = vp8_rac_get_uint(c, 7);
459 if (d >= 65)
460 d = (d << 1) - 65 + vp8_rac_get(c);
461 d += 64;
462 }
463
464 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
465 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
466 }
467
/**
 * Parse the uncompressed and compressed VP9 frame headers.
 *
 * For a "show existing frame" packet, stores the reference index to
 * display in *ref and returns 0. Otherwise returns the total header size
 * in bytes (i.e. the offset of the first tile data) on success, or a
 * negative AVERROR code on error.
 */
static int decode_frame_header(AVCodecContext *ctx,
                               const uint8_t *data, int size, int *ref)
{
    VP9Context *s = ctx->priv_data;
    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
    int last_invisible;
    const uint8_t *data2;

    /* general header */
    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
        return res;
    }
    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
        return AVERROR_INVALIDDATA;
    }
    s->profile = get_bits1(&s->gb);
    if (get_bits1(&s->gb)) { // reserved bit
        av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
        return AVERROR_INVALIDDATA;
    }
    if (get_bits1(&s->gb)) {
        // "show existing frame": no coded data, just display a reference
        *ref = get_bits(&s->gb, 3);
        return 0;
    }
    s->last_uses_2pass = s->uses_2pass;
    s->last_keyframe = s->keyframe;
    s->keyframe = !get_bits1(&s->gb);
    last_invisible = s->invisible;
    s->invisible = !get_bits1(&s->gb);
    s->errorres = get_bits1(&s->gb);
    s->use_last_frame_mvs = !s->errorres && !last_invisible;
    if (s->keyframe) {
        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
            return AVERROR_INVALIDDATA;
        }
        s->colorspace = get_bits(&s->gb, 3);
        if (s->colorspace == 7) { // RGB = profile 1
            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
            return AVERROR_INVALIDDATA;
        }
        s->fullrange = get_bits1(&s->gb);
        // for profile 1, here follows the subsampling bits
        s->refreshrefmask = 0xff; // keyframes refresh all reference slots
        w = get_bits(&s->gb, 16) + 1;
        h = get_bits(&s->gb, 16) + 1;
        if (get_bits1(&s->gb)) // display size
            skip_bits(&s->gb, 32);
    } else {
        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
        s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
        if (s->intraonly) {
            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                return AVERROR_INVALIDDATA;
            }
            s->refreshrefmask = get_bits(&s->gb, 8);
            w = get_bits(&s->gb, 16) + 1;
            h = get_bits(&s->gb, 16) + 1;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
        } else {
            // regular inter frame: read the three active references
            s->refreshrefmask = get_bits(&s->gb, 8);
            s->refidx[0] = get_bits(&s->gb, 3);
            s->signbias[0] = get_bits1(&s->gb);
            s->refidx[1] = get_bits(&s->gb, 3);
            s->signbias[1] = get_bits1(&s->gb);
            s->refidx[2] = get_bits(&s->gb, 3);
            s->signbias[2] = get_bits1(&s->gb);
            if (!s->refs[s->refidx[0]].f->data[0] ||
                !s->refs[s->refidx[1]].f->data[0] ||
                !s->refs[s->refidx[2]].f->data[0]) {
                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                return AVERROR_INVALIDDATA;
            }
            // frame size: either inherited from one of the references
            // (one flag per reference) or coded explicitly
            if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[0]].f->width;
                h = s->refs[s->refidx[0]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[1]].f->width;
                h = s->refs[s->refidx[1]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[2]].f->width;
                h = s->refs[s->refidx[2]].f->height;
            } else {
                w = get_bits(&s->gb, 16) + 1;
                h = get_bits(&s->gb, 16) + 1;
            }
            // Note that in this code, "CUR_FRAME" is actually before we
            // have formally allocated a frame, and thus actually represents
            // the _last_ frame
            s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
                                     s->frames[CUR_FRAME].tf.f->height == h;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            s->highprecisionmvs = get_bits1(&s->gb);
            s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
                                                get_bits(&s->gb, 2);
            // compound prediction needs at least one reference with a
            // different sign bias than the others
            s->allowcompinter = s->signbias[0] != s->signbias[1] ||
                                s->signbias[0] != s->signbias[2];
            if (s->allowcompinter) {
                // the odd-one-out reference becomes the fixed compound ref,
                // the two same-bias references the variable pair
                if (s->signbias[0] == s->signbias[1]) {
                    s->fixcompref = 2;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 1;
                } else if (s->signbias[0] == s->signbias[2]) {
                    s->fixcompref = 1;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 2;
                } else {
                    s->fixcompref = 0;
                    s->varcompref[0] = 1;
                    s->varcompref[1] = 2;
                }
            }
        }
    }
    s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
    s->framectxid = c = get_bits(&s->gb, 2);

    /* loopfilter header data */
    s->filter.level = get_bits(&s->gb, 6);
    sharp = get_bits(&s->gb, 3);
    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
    // the old cache values since they are still valid
    if (s->filter.sharpness != sharp)
        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
    s->filter.sharpness = sharp;
    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
        if (get_bits1(&s->gb)) {
            for (i = 0; i < 4; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
            for (i = 0; i < 2; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
        }
    } else {
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    /* quantization header data */
    s->yac_qi = get_bits(&s->gb, 8);
    s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
                  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;

    /* segmentation header info */
    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
            for (i = 0; i < 7; i++)
                s->prob.seg[i] = get_bits1(&s->gb) ?
                                 get_bits(&s->gb, 8) : 255;
            if ((s->segmentation.temporal = get_bits1(&s->gb))) {
                for (i = 0; i < 3; i++)
                    s->prob.segpred[i] = get_bits1(&s->gb) ?
                                         get_bits(&s->gb, 8) : 255;
            }
        }
        // reusing the previous frame's segmentation map requires
        // matching dimensions
        if ((!s->segmentation.update_map || s->segmentation.temporal) &&
            (w != s->frames[CUR_FRAME].tf.f->width ||
             h != s->frames[CUR_FRAME].tf.f->height)) {
            av_log(ctx, AV_LOG_ERROR,
                   "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
                   s->segmentation.temporal, s->segmentation.update_map);
            return AVERROR_INVALIDDATA;
        }

        if (get_bits1(&s->gb)) {
            s->segmentation.absolute_vals = get_bits1(&s->gb);
            for (i = 0; i < 8; i++) {
                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
            }
        }
    } else {
        // segmentation disabled: everything uses feat[0], so clear its flags
        s->segmentation.feat[0].q_enabled = 0;
        s->segmentation.feat[0].lf_enabled = 0;
        s->segmentation.feat[0].skip_enabled = 0;
        s->segmentation.feat[0].ref_enabled = 0;
    }

    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
        int qyac, qydc, quvac, quvdc, lflvl, sh;

        if (s->segmentation.feat[i].q_enabled) {
            if (s->segmentation.absolute_vals)
                qyac = s->segmentation.feat[i].q_val;
            else
                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
        } else {
            qyac = s->yac_qi;
        }
        qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
        qyac = av_clip_uintp2(qyac, 8);

        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];

        // lf deltas are applied doubled when the base filter level >= 32
        sh = s->filter.level >= 32;
        if (s->segmentation.feat[i].lf_enabled) {
            if (s->segmentation.absolute_vals)
                lflvl = s->segmentation.feat[i].lf_val;
            else
                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
        } else {
            lflvl = s->filter.level;
        }
        s->segmentation.feat[i].lflvl[0][0] =
        s->segmentation.feat[i].lflvl[0][1] =
            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
        for (j = 1; j < 4; j++) {
            s->segmentation.feat[i].lflvl[j][0] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[0]) << sh), 6);
            s->segmentation.feat[i].lflvl[j][1] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[1]) << sh), 6);
        }
    }

    /* tiling info */
    if ((res = update_size(ctx, w, h)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
        return res;
    }
    // minimum log2_tile_cols so that no tile exceeds 64 superblock columns
    for (s->tiling.log2_tile_cols = 0;
         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
         s->tiling.log2_tile_cols++) ;
    // maximum log2_tile_cols allowed for this frame width
    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
    max = FFMAX(0, max - 1);
    // one increment bit at a time, up to the maximum
    while (max > s->tiling.log2_tile_cols) {
        if (get_bits1(&s->gb))
            s->tiling.log2_tile_cols++;
        else
            break;
    }
    s->tiling.log2_tile_rows = decode012(&s->gb);
    s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
        s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
        // one range coder per tile column
        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
                                 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
        if (!s->c_b) {
            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
            return AVERROR(ENOMEM);
        }
    }

    // keyframes / error-resilient / intra-only frames reset all four
    // probability contexts to the defaults
    if (s->keyframe || s->errorres || s->intraonly) {
        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
                           s->prob_ctx[3].p = vp9_default_probs;
        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
    }

    // next 16 bits is size of the rest of the header (arith-coded)
    size2 = get_bits(&s->gb, 16);
    data2 = align_get_bits(&s->gb);
    if (size2 > size - (data2 - data)) {
        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
        return AVERROR_INVALIDDATA;
    }
    ff_vp56_init_range_decoder(&s->c, data2, size2);
    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe || s->intraonly) {
        // clear only coef+eob; this relies on eob directly following coef
        // in the counts struct layout
        memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
    } else {
        memset(&s->counts, 0, sizeof(s->counts));
    }
    // FIXME is it faster to not copy here, but do it down in the fw updates
    // as explicit copies if the fw update is missing (and skip the copy upon
    // fw update)?
    s->prob.p = s->prob_ctx[c].p;

    // txfm updates
    if (s->lossless) {
        s->txfmmode = TX_4X4;
    } else {
        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
        if (s->txfmmode == 3)
            s->txfmmode += vp8_rac_get(&s->c);

        if (s->txfmmode == TX_SWITCHABLE) {
            for (i = 0; i < 2; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx16p[i][j] =
                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 3; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx32p[i][j] =
                            update_prob(&s->c, s->prob.p.tx32p[i][j]);
        }
    }

    // coef updates
    for (i = 0; i < 4; i++) {
        uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
        if (vp8_rac_get(&s->c)) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m >= 3 && l == 0) // dc only has 3 pt
                                break;
                            for (n = 0; n < 3; n++) {
                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                    p[n] = update_prob(&s->c, r[n]);
                                } else {
                                    p[n] = r[n];
                                }
                            }
                            p[3] = 0;
                        }
        } else {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            // NOTE(review): the update branch above breaks at
                            // m >= 3; using m > 3 here copies one extra entry,
                            // which is harmless but asymmetric — confirm intent
                            if (m > 3 && l == 0) // dc only has 3 pt
                                break;
                            memcpy(p, r, 3);
                            p[3] = 0;
                        }
        }
        if (s->txfmmode == i)
            break;
    }

    // mode updates
    for (i = 0; i < 3; i++)
        if (vp56_rac_get_prob_branchy(&s->c, 252))
            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    if (!s->keyframe && !s->intraonly) {
        for (i = 0; i < 7; i++)
            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_mode[i][j] =
                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);

        if (s->filtermode == FILTER_SWITCHABLE)
            for (i = 0; i < 4; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.filter[i][j] =
                            update_prob(&s->c, s->prob.p.filter[i][j]);

        for (i = 0; i < 4; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);

        if (s->allowcompinter) {
            s->comppredmode = vp8_rac_get(&s->c);
            if (s->comppredmode)
                s->comppredmode += vp8_rac_get(&s->c);
            if (s->comppredmode == PRED_SWITCHABLE)
                for (i = 0; i < 5; i++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.comp[i] =
                            update_prob(&s->c, s->prob.p.comp[i]);
        } else {
            s->comppredmode = PRED_SINGLEREF;
        }

        if (s->comppredmode != PRED_COMPREF) {
            for (i = 0; i < 5; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][0] =
                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][1] =
                        update_prob(&s->c, s->prob.p.single_ref[i][1]);
            }
        }

        if (s->comppredmode != PRED_SINGLEREF) {
            for (i = 0; i < 5; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.comp_ref[i] =
                        update_prob(&s->c, s->prob.p.comp_ref[i]);
        }

        for (i = 0; i < 4; i++)
            for (j = 0; j < 9; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.y_mode[i][j] =
                        update_prob(&s->c, s->prob.p.y_mode[i][j]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.partition[3 - i][j][k] =
                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);

        // mv fields don't use the update_prob subexp model for some reason
        for (i = 0; i < 3; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].classes[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].bits[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.mv_comp[i].class0_fp[j][k] =
                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].fp[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        if (s->highprecisionmvs) {
            for (i = 0; i < 2; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].class0_hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
            }
        }
    }

    return (data2 - data) + size2;
}
948
// Clamp a motion vector to the valid range for the current block
// (s->min_mv/s->max_mv); dst may alias src.
static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
                                      VP9Context *s)
{
    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
}
955
/* Find a motion-vector predictor for reference frame 'ref' for the current
 * block. Candidates are scanned in spec order: sub-block MVs already decoded
 * within this block (for sub-8x8 partitions), spatial neighbours using the
 * same reference, the co-located MV from the previous frame, then neighbour
 * and co-located MVs using a *different* reference (sign-flipped when the
 * two references have opposite sign bias). The result is written to *pmv.
 *
 * z   - sub-block index within the current block
 * idx - 0 to return the first candidate found, 1 to return the second
 *       distinct candidate (NEARMV)
 * sb  - sub-block position (0-3) for sub-8x8 partitions, -1 otherwise
 */
static void find_ref_mvs(VP9Context *s,
                         VP56mv *pmv, int ref, int z, int idx, int sb)
{
    // per-block-size list of (col, row) offsets of the neighbour blocks to
    // scan for MV candidates, in decreasing order of expected correlation
    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
        [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
                      { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
        [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
        [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
                      { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
        [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
                      { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
                      { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
        [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_16x8]  = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
                      { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
        [BS_8x16]  = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
                      { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
        [BS_8x8]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_8x4]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x4]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
    };
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
#define INVALID_MV 0x80008000U
    // 'mem' remembers the first candidate while we look for a second
    // distinct one (idx == 1); INVALID_MV marks "no candidate seen yet"
    uint32_t mem = INVALID_MV;
    int i;

// accept a candidate MV without clamping: return it immediately for idx==0,
// or for idx==1 once a second value differing from the first is found
#define RETURN_DIRECT_MV(mv) \
    do { \
        uint32_t m = AV_RN32A(&mv); \
        if (!idx) { \
            AV_WN32A(pmv, m); \
            return; \
        } else if (mem == INVALID_MV) { \
            mem = m; \
        } else if (m != mem) { \
            AV_WN32A(pmv, m); \
            return; \
        } \
    } while (0)

    if (sb >= 0) {
        // sub-8x8 partitions: earlier sub-block MVs of this block come first
        if (sb == 2 || sb == 1) {
            RETURN_DIRECT_MV(b->mv[0][z]);
        } else if (sb == 3) {
            RETURN_DIRECT_MV(b->mv[2][z]);
            RETURN_DIRECT_MV(b->mv[1][z]);
            RETURN_DIRECT_MV(b->mv[0][z]);
        }

// accept a candidate MV with clamping; note the subtle idx==0 difference
// between sb > 0 (compare the clamped value) and sb <= 0 (compare the
// unclamped value but return it clamped), mirroring libvpx behaviour
#define RETURN_MV(mv) \
    do { \
        if (sb > 0) { \
            VP56mv tmp; \
            uint32_t m; \
            clamp_mv(&tmp, &mv, s); \
            m = AV_RN32A(&tmp); \
            if (!idx) { \
                AV_WN32A(pmv, m); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                AV_WN32A(pmv, m); \
                return; \
            } \
        } else { \
            uint32_t m = AV_RN32A(&mv); \
            if (!idx) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } \
        } \
    } while (0)

        // immediate above/left neighbours, via the cached edge-context MVs
        if (row > 0) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
            }
        }
        if (col > s->tiling.tile_col_start) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
            }
        }
        i = 2; // first two table entries were already handled above
    } else {
        i = 0;
    }

    // previously coded MVs in this neighbourhood, using same reference frame
    for (; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] == ref) {
                RETURN_MV(mv->mv[0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(mv->mv[1]);
            }
        }
    }

    // MV at this position in previous frame, using same reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        if (!s->last_uses_2pass)
            ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
        if (mv->ref[0] == ref) {
            RETURN_MV(mv->mv[0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(mv->mv[1]);
        }
    }

// accept a candidate from a different reference frame, negating it first
// when the two references point in opposite temporal directions
#define RETURN_SCALE_MV(mv, scale) \
    do { \
        if (scale) { \
            VP56mv mv_temp = { -mv.x, -mv.y }; \
            RETURN_MV(mv_temp); \
        } else { \
            RETURN_MV(mv); \
        } \
    } while (0)

    // previously coded MVs in this neighbourhood, using different reference frame
    for (i = 0; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
                RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
            }
            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                // BUG - libvpx has this condition regardless of whether
                // we used the first ref MV and pre-scaling
                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
                RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
            }
        }
    }

    // MV at this position in previous frame, using different reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        // no need to await_progress, because we already did that above
        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
            RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
        }
        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
            // BUG - libvpx has this condition regardless of whether
            // we used the first ref MV and pre-scaling
            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
            RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
        }
    }

    // no usable candidate found: predict a zero MV
    AV_ZERO32(pmv);
#undef INVALID_MV
#undef RETURN_MV
#undef RETURN_SCALE_MV
}
1146
/* Decode one motion-vector component delta from the range coder, using the
 * per-component probability set mv_comp[idx] (idx: 0 = row, 1 = column) and
 * updating the corresponding adaptation counters. hp enables the extra
 * high-precision bit. Returns the signed component value; magnitude is
 * always >= 1. The order of range-coder reads is fixed by the bitstream
 * syntax and must not be changed. */
static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
{
    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
    int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
                                s->prob.p.mv_comp[idx].classes);

    s->counts.mv_comp[idx].sign[sign]++;
    s->counts.mv_comp[idx].classes[c]++;
    if (c) {
        int m;

        // classes >= 1: read c integer bits (LSB first), then the
        // 2-bit fractional part and optionally the high-precision bit
        for (n = 0, m = 0; m < c; m++) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
            n |= bit << m;
            s->counts.mv_comp[idx].bits[m][bit]++;
        }
        n <<= 3;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
        n |= bit << 1;
        s->counts.mv_comp[idx].fp[bit]++;
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
            s->counts.mv_comp[idx].hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].hp[1]++;
        }
        n += 8 << c; // add the base offset of this magnitude class
    } else {
        // class 0: single integer bit with its own fp/hp probability sets
        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
        s->counts.mv_comp[idx].class0[n]++;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
                               s->prob.p.mv_comp[idx].class0_fp[n]);
        s->counts.mv_comp[idx].class0_fp[n][bit]++;
        n = (n << 3) | (bit << 1);
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
            s->counts.mv_comp[idx].class0_hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].class0_hp[1]++;
        }
    }

    return sign ? -(n + 1) : (n + 1);
}
1199
/* Derive the motion vector(s) for one (sub-)block: zero for ZEROMV,
 * otherwise a predictor from find_ref_mvs() optionally refined by a coded
 * delta (NEWMV). mv[0] is for the first reference; mv[1] is also filled for
 * compound prediction. sb is the sub-block index (-1 for whole-block). */
static void fill_mv(VP9Context *s,
                    VP56mv *mv, int mode, int sb)
{
    VP9Block *b = s->b;

    if (mode == ZEROMV) {
        AV_ZERO64(mv); // clears both mv[0] and mv[1] at once
    } else {
        int hp = 0;

        // FIXME cache this value and reuse for other subblocks
        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
                     mode == NEWMV ? -1 : sb);
        // FIXME maybe move this code into find_ref_mvs()
        // when high precision is unusable (disabled, or MV too large),
        // round the predictor away from zero to an even value
        if ((mode == NEWMV || sb == -1) &&
            !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
            if (mv[0].y & 1) {
                if (mv[0].y < 0)
                    mv[0].y++;
                else
                    mv[0].y--;
            }
            if (mv[0].x & 1) {
                if (mv[0].x < 0)
                    mv[0].x++;
                else
                    mv[0].x--;
            }
        }
        if (mode == NEWMV) {
            // joint code tells which components carry a coded delta
            enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                              s->prob.p.mv_joint);

            s->counts.mv_joint[j]++;
            if (j >= MV_JOINT_V)
                mv[0].y += read_mv_component(s, 0, hp);
            if (j & 1)
                mv[0].x += read_mv_component(s, 1, hp);
        }

        if (b->comp) {
            // second reference: same procedure as above for mv[1]
            // FIXME cache this value and reuse for other subblocks
            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
                         mode == NEWMV ? -1 : sb);
            if ((mode == NEWMV || sb == -1) &&
                !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
                if (mv[1].y & 1) {
                    if (mv[1].y < 0)
                        mv[1].y++;
                    else
                        mv[1].y--;
                }
                if (mv[1].x & 1) {
                    if (mv[1].x < 0)
                        mv[1].x++;
                    else
                        mv[1].x--;
                }
            }
            if (mode == NEWMV) {
                enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                                  s->prob.p.mv_joint);

                s->counts.mv_joint[j]++;
                if (j >= MV_JOINT_V)
                    mv[1].y += read_mv_component(s, 0, hp);
                if (j & 1)
                    mv[1].x += read_mv_component(s, 1, hp);
            }
        }
    }
}
1272
setctx_2d(uint8_t * ptr,int w,int h,ptrdiff_t stride,int v)1273 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1274 ptrdiff_t stride, int v)
1275 {
1276 switch (w) {
1277 case 1:
1278 do {
1279 *ptr = v;
1280 ptr += stride;
1281 } while (--h);
1282 break;
1283 case 2: {
1284 int v16 = v * 0x0101;
1285 do {
1286 AV_WN16A(ptr, v16);
1287 ptr += stride;
1288 } while (--h);
1289 break;
1290 }
1291 case 4: {
1292 uint32_t v32 = v * 0x01010101;
1293 do {
1294 AV_WN32A(ptr, v32);
1295 ptr += stride;
1296 } while (--h);
1297 break;
1298 }
1299 case 8: {
1300 #if HAVE_FAST_64BIT
1301 uint64_t v64 = v * 0x0101010101010101ULL;
1302 do {
1303 AV_WN64A(ptr, v64);
1304 ptr += stride;
1305 } while (--h);
1306 #else
1307 uint32_t v32 = v * 0x01010101;
1308 do {
1309 AV_WN32A(ptr, v32);
1310 AV_WN32A(ptr + 4, v32);
1311 ptr += stride;
1312 } while (--h);
1313 #endif
1314 break;
1315 }
1316 }
1317 }
1318
/* Decode all per-block mode information for the current block: segment id,
 * skip flag, transform size, intra/inter prediction modes, reference frames,
 * interpolation filter and motion vectors. Updates the entropy-adaptation
 * counters (s->counts), the above/left prediction context arrays, and the
 * frame-wide segmentation-map and mv-reference buffers. The order of all
 * range-coder reads follows the bitstream syntax and must not be changed. */
static void decode_mode(AVCodecContext *ctx)
{
    // partition context bit patterns stored per block size for the
    // left/above context arrays (one bit per 8x8 unit)
    static const uint8_t left_ctx[N_BS_SIZES] = {
        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
    };
    static const uint8_t above_ctx[N_BS_SIZES] = {
        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
    };
    // largest transform size usable for each block size
    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
    };
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
    // block size in 8x8 units, clipped to the frame edge
    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
    int vref = 0;
    int filter_id = 0;

    // segment id: explicitly coded, temporally predicted from the previous
    // frame's map, or coded with the segmentation tree
    if (!s->segmentation.enabled) {
        b->seg_id = 0;
    } else if (s->keyframe || s->intraonly) {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
    } else if (!s->segmentation.update_map ||
               (s->segmentation.temporal &&
                vp56_rac_get_prob_branchy(&s->c,
                    s->prob.segpred[s->above_segpred_ctx[col] +
                                    s->left_segpred_ctx[row7]]))) {
        if (!s->errorres) {
            // inherit the minimum segment id covered by this block in the
            // previous frame's segmentation map
            int pred = 8, x;
            uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;

            if (!s->last_uses_2pass)
                ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
            for (y = 0; y < h4; y++)
                for (x = 0; x < w4; x++)
                    pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
            av_assert1(pred < 8);
            b->seg_id = pred;
        } else {
            b->seg_id = 0;
        }

        memset(&s->above_segpred_ctx[col], 1, w4);
        memset(&s->left_segpred_ctx[row7], 1, h4);
    } else {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
                                     s->prob.seg);

        memset(&s->above_segpred_ctx[col], 0, w4);
        memset(&s->left_segpred_ctx[row7], 0, h4);
    }
    if (s->segmentation.enabled &&
        (s->segmentation.update_map || s->keyframe || s->intraonly)) {
        setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
                  w4, h4, 8 * s->sb_cols, b->seg_id);
    }

    // skip flag: forced by segment feature, or coded with spatial context
    b->skip = s->segmentation.enabled &&
        s->segmentation.feat[b->seg_id].skip_enabled;
    if (!b->skip) {
        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
        s->counts.skip[c][b->skip]++;
    }

    // intra/inter decision
    if (s->keyframe || s->intraonly) {
        b->intra = 1;
    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
    } else {
        int c, bit;

        if (have_a && have_l) {
            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
            c += (c == 2);
        } else {
            c = have_a ? 2 * s->above_intra_ctx[col] :
                have_l ? 2 * s->left_intra_ctx[row7] : 0;
        }
        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
        s->counts.intra[c][bit]++;
        b->intra = !bit;
    }

    // transform size: coded when switchable (and the block carries
    // coefficients), otherwise the frame-level size capped by block size
    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
        int c;
        if (have_a) {
            if (have_l) {
                c = (s->above_skip_ctx[col] ? max_tx :
                     s->above_txfm_ctx[col]) +
                    (s->left_skip_ctx[row7] ? max_tx :
                     s->left_txfm_ctx[row7]) > max_tx;
            } else {
                c = s->above_skip_ctx[col] ? 1 :
                    (s->above_txfm_ctx[col] * 2 > max_tx);
            }
        } else if (have_l) {
            c = s->left_skip_ctx[row7] ? 1 :
                (s->left_txfm_ctx[row7] * 2 > max_tx);
        } else {
            c = 1;
        }
        switch (max_tx) {
        case TX_32X32:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
            if (b->tx) {
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
                if (b->tx == 2)
                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
            }
            s->counts.tx32p[c][b->tx]++;
            break;
        case TX_16X16:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
            if (b->tx)
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
            s->counts.tx16p[c][b->tx]++;
            break;
        case TX_8X8:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
            s->counts.tx8p[c][b->tx]++;
            break;
        case TX_4X4:
            b->tx = TX_4X4;
            break;
        }
    } else {
        b->tx = FFMIN(max_tx, s->txfmmode);
    }

    if (s->keyframe || s->intraonly) {
        // keyframe intra modes: contexted on the above/left decoded modes
        uint8_t *a = &s->above_mode_ctx[col * 2];
        uint8_t *l = &s->left_mode_ctx[(row7) << 1];

        b->comp = 0;
        if (b->bs > BS_8x8) {
            // sub-8x8: up to four separately coded luma modes
            // FIXME the memory storage intermediates here aren't really
            // necessary, they're just there to make the code slightly
            // simpler for now
            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                    vp9_default_kf_ymode_probs[a[0]][l[0]]);
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                l[0] = a[1] = b->mode[1];
            } else {
                l[0] = a[1] = b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                        vp9_default_kf_ymode_probs[a[0]][l[1]]);
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                    l[1] = a[1] = b->mode[3];
                } else {
                    l[1] = a[1] = b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                l[1] = a[1] = b->mode[3] = b->mode[1];
            }
        } else {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          vp9_default_kf_ymode_probs[*a][*l]);
            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
            // FIXME this can probably be optimized
            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_uvmode_probs[b->mode[3]]);
    } else if (b->intra) {
        // inter-frame intra modes: use adaptive y_mode/uv_mode probabilities
        b->comp = 0;
        if (b->bs > BS_8x8) {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[0]);
            s->counts.y_mode[0][b->mode[0]]++;
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[1]]++;
            } else {
                b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[2]]++;
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                  s->prob.p.y_mode[0]);
                    s->counts.y_mode[0][b->mode[3]]++;
                } else {
                    b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                b->mode[3] = b->mode[1];
            }
        } else {
            // probability set chosen by block-size group
            static const uint8_t size_group[10] = {
                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
            };
            int sz = size_group[b->bs];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[sz]);
            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
            s->counts.y_mode[sz][b->mode[3]]++;
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     s->prob.p.uv_mode[b->mode[3]]);
        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
    } else {
        // inter block: decode references, filter and motion vectors.
        // Maps (above_mode, left_mode) -> inter-mode probability context.
        static const uint8_t inter_mode_ctx_lut[14][14] = {
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
        };

        if (s->segmentation.feat[b->seg_id].ref_enabled) {
            // segment feature fixes the reference frame
            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
            b->comp = 0;
            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
        } else {
            // read comp_pred flag
            if (s->comppredmode != PRED_SWITCHABLE) {
                b->comp = s->comppredmode == PRED_COMPREF;
            } else {
                int c;

                // FIXME add intra as ref=0xff (or -1) to make these easier?
                if (have_a) {
                    if (have_l) {
                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
                            c = 4;
                        } else if (s->above_comp_ctx[col]) {
                            c = 2 + (s->left_intra_ctx[row7] ||
                                     s->left_ref_ctx[row7] == s->fixcompref);
                        } else if (s->left_comp_ctx[row7]) {
                            c = 2 + (s->above_intra_ctx[col] ||
                                     s->above_ref_ctx[col] == s->fixcompref);
                        } else {
                            c = (!s->above_intra_ctx[col] &&
                                 s->above_ref_ctx[col] == s->fixcompref) ^
                                (!s->left_intra_ctx[row7] &&
                                 s->left_ref_ctx[row & 7] == s->fixcompref);
                        }
                    } else {
                        c = s->above_comp_ctx[col] ? 3 :
                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
                    }
                } else if (have_l) {
                    c = s->left_comp_ctx[row7] ? 3 :
                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
                } else {
                    c = 1;
                }
                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
                s->counts.comp[c][b->comp]++;
            }

            // read actual references
            // FIXME probably cache a few variables here to prevent repetitive
            // memory accesses below
            if (b->comp) /* two references */ {
                // one reference is fixed (fixcompref); only the variable one
                // is coded, selected from varcompref[]
                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;

                b->ref[fix_idx] = s->fixcompref;
                // FIXME can this codeblob be replaced by some sort of LUT?
                if (have_a) {
                    if (have_l) {
                        if (s->above_intra_ctx[col]) {
                            if (s->left_intra_ctx[row7]) {
                                c = 2;
                            } else {
                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                            }
                        } else if (s->left_intra_ctx[row7]) {
                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];

                            if (refl == refa && refa == s->varcompref[1]) {
                                c = 0;
                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
                                    c = 4;
                                } else {
                                    c = (refa == refl) ? 3 : 1;
                                }
                            } else if (!s->left_comp_ctx[row7]) {
                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refl == s->varcompref[1] &&
                                         refa != s->varcompref[1]) ? 2 : 4;
                                }
                            } else if (!s->above_comp_ctx[col]) {
                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refa == s->varcompref[1] &&
                                         refl != s->varcompref[1]) ? 2 : 4;
                                }
                            } else {
                                c = (refl == refa) ? 4 : 2;
                            }
                        }
                    } else {
                        if (s->above_intra_ctx[col]) {
                            c = 2;
                        } else if (s->above_comp_ctx[col]) {
                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        }
                    }
                } else if (have_l) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    } else {
                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
                b->ref[var_idx] = s->varcompref[bit];
                s->counts.comp_ref[c][bit]++;
            } else /* single reference */ {
                int bit, c;

                // first bit: LAST (0) vs GOLDEN/ALTREF (1)
                if (have_a && !s->above_intra_ctx[col]) {
                    if (have_l && !s->left_intra_ctx[row7]) {
                        if (s->left_comp_ctx[row7]) {
                            if (s->above_comp_ctx[col]) {
                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
                                         !s->above_ref_ctx[col]);
                            } else {
                                c = (3 * !s->above_ref_ctx[col]) +
                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
                            }
                        } else if (s->above_comp_ctx[col]) {
                            c = (3 * !s->left_ref_ctx[row7]) +
                                (!s->fixcompref || !s->above_ref_ctx[col]);
                        } else {
                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                        }
                    } else if (s->above_intra_ctx[col]) {
                        c = 2;
                    } else if (s->above_comp_ctx[col]) {
                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
                    } else {
                        c = 4 * (!s->above_ref_ctx[col]);
                    }
                } else if (have_l && !s->left_intra_ctx[row7]) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
                    } else {
                        c = 4 * (!s->left_ref_ctx[row7]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
                s->counts.single_ref[c][0][bit]++;
                if (!bit) {
                    b->ref[0] = 0;
                } else {
                    // second bit: GOLDEN (0) vs ALTREF (1)
                    // FIXME can this codeblob be replaced by some sort of LUT?
                    if (have_a) {
                        if (have_l) {
                            if (s->left_intra_ctx[row7]) {
                                if (s->above_intra_ctx[col]) {
                                    c = 2;
                                } else if (s->above_comp_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else if (!s->above_ref_ctx[col]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->above_intra_ctx[col]) {
                                if (s->left_intra_ctx[row7]) {
                                    c = 2;
                                } else if (s->left_comp_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (s->above_comp_ctx[col]) {
                                if (s->left_comp_ctx[row7]) {
                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
                                        c = 3 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                    } else {
                                        c = 2;
                                    }
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else {
                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->left_comp_ctx[row7]) {
                                if (!s->above_ref_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else {
                                    c = 3 * (s->above_ref_ctx[col] == 1) +
                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->above_ref_ctx[col]) {
                                if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->left_ref_ctx[row7]) {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            } else {
                                c = 2 * (s->left_ref_ctx[row7] == 1) +
                                    2 * (s->above_ref_ctx[col] == 1);
                            }
                        } else {
                            if (s->above_intra_ctx[col] ||
                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                                c = 2;
                            } else if (s->above_comp_ctx[col]) {
                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                            } else {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            }
                        }
                    } else if (have_l) {
                        if (s->left_intra_ctx[row7] ||
                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
                            c = 2;
                        } else if (s->left_comp_ctx[row7]) {
                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                        } else {
                            c = 4 * (s->left_ref_ctx[row7] == 1);
                        }
                    } else {
                        c = 2;
                    }
                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
                    s->counts.single_ref[c][1][bit]++;
                    b->ref[0] = 1 + bit;
                }
            }
        }

        // inter mode for 8x8-or-smaller blocks (one mode for the block)
        if (b->bs <= BS_8x8) {
            if (s->segmentation.feat[b->seg_id].skip_enabled) {
                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
            } else {
                static const uint8_t off[10] = {
                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
                };

                // FIXME this needs to use the LUT tables from find_ref_mvs
                // because not all are -1,0/0,-1
                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
                                          [s->left_mode_ctx[row7 + off[b->bs]]];

                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
                // mv_mode counts are indexed relative to the first inter mode
                s->counts.mv_mode[c][b->mode[0] - 10]++;
            }
        }

        // interpolation filter: coded when switchable, with spatial context
        if (s->filtermode == FILTER_SWITCHABLE) {
            int c;

            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
                        s->left_filter_ctx[row7] : 3;
                } else {
                    c = s->above_filter_ctx[col];
                }
            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                c = s->left_filter_ctx[row7];
            } else {
                c = 3;
            }

            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
                                         s->prob.p.filter[c]);
            s->counts.filter[c][filter_id]++;
            b->filter = vp9_filter_lut[filter_id];
        } else {
            b->filter = s->filtermode;
        }

        // motion vectors: sub-8x8 blocks code a mode + MV per sub-block,
        // larger blocks code one mode + MV for the whole block
        if (b->bs > BS_8x8) {
            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                          s->prob.p.mv_mode[c]);
            s->counts.mv_mode[c][b->mode[0] - 10]++;
            fill_mv(s, b->mv[0], b->mode[0], 0);

            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[1] - 10]++;
                fill_mv(s, b->mv[1], b->mode[1], 1);
            } else {
                b->mode[1] = b->mode[0];
                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            }

            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[2] - 10]++;
                fill_mv(s, b->mv[2], b->mode[2], 2);

                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                                  s->prob.p.mv_mode[c]);
                    s->counts.mv_mode[c][b->mode[3] - 10]++;
                    fill_mv(s, b->mv[3], b->mode[3], 3);
                } else {
                    b->mode[3] = b->mode[2];
                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
                }
            } else {
                b->mode[2] = b->mode[0];
                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
                b->mode[3] = b->mode[1];
                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
            }
        } else {
            fill_mv(s, b->mv[0], b->mode[0], -1);
            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
        }

        // reference stored into the ref context for neighbouring blocks
        vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
    }

// splat 'val' over n bytes of a context array entry (fast 64-bit variant)
#if HAVE_FAST_64BIT
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                                    break; \
    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
    case 8:  AV_WN64A(&var, val * ULLN(0x0101010101010101)); break; \
    case 16: { \
        uint64_t v64 = val * ULLN(0x0101010101010101); \
        AV_WN64A(              &var,     v64); \
        AV_WN64A(&((uint8_t *) &var)[8], v64); \
        break; \
    } \
    }
#else
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                         break; \
    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
    case 8: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,     v32); \
        AV_WN32A(&((uint8_t *) &var)[4], v32); \
        break; \
    } \
    case 16: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,      v32); \
        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
        AV_WN32A(&((uint8_t *) &var)[12], v32); \
        break; \
    } \
    }
#endif

    // propagate the decoded block state into the above/left context arrays
    // over the block's width (above) and height (left) in 8x8 units
    switch (bwh_tab[1][b->bs][0]) {
#define SET_CTXS(dir, off, n) \
    do { \
        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
        if (!s->keyframe && !s->intraonly) { \
            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
            if (!b->intra) { \
                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
                if (s->filtermode == FILTER_SWITCHABLE) { \
                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
                } \
            } \
        } \
    } while (0)
    case 1: SET_CTXS(above, col, 1); break;
    case 2: SET_CTXS(above, col, 2); break;
    case 4: SET_CTXS(above, col, 4); break;
    case 8: SET_CTXS(above, col, 8); break;
    }
    switch (bwh_tab[1][b->bs][1]) {
    case 1: SET_CTXS(left, row7, 1); break;
    case 2: SET_CTXS(left, row7, 2); break;
    case 4: SET_CTXS(left, row7, 4); break;
    case 8: SET_CTXS(left, row7, 8); break;
    }
#undef SPLAT_CTX
#undef SET_CTXS

    // store the edge MVs used by find_ref_mvs() for neighbouring blocks
    if (!s->keyframe && !s->intraonly) {
        if (b->bs > BS_8x8) {
            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
        } else {
            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            for (n = 0; n < w4 * 2; n++) {
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
            }
            for (n = 0; n < h4 * 2; n++) {
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
            }
        }
    }

    // write refs and MVs into the per-frame mv buffer (one entry per 8x8
    // unit), consumed by find_ref_mvs() of later blocks and the next frame
    // FIXME kinda ugly
    for (y = 0; y < h4; y++) {
        int x, o = (row + y) * s->sb_cols * 8 + col;
        struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];

        if (b->intra) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] =
                mv[x].ref[1] = -1;
            }
        } else if (b->comp) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = b->ref[1];
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
            }
        } else {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = -1;
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
            }
        }
    }
}
2020
2021 // FIXME merge cnt/eob arguments?
/**
 * Decode all coefficients of one transform block from the range coder.
 *
 * @param c           range coder positioned at this block's token stream
 * @param coef        output coefficient array, written at scan positions
 * @param n_coeffs    total coefficients in the transform (16, 64, 256 or 1024)
 * @param is_tx32x32  nonzero for 32x32 transforms; their dequantized values
 *                    are halved (the 32x32 qmul table is doubled upstream)
 * @param cnt         per-[band][ctx] token counters for backward adaptation
 * @param eob         per-[band][ctx] EOB-bit counters for backward adaptation
 * @param p           token probabilities indexed [band][nonzero-context];
 *                    entries [3..10] are lazily filled from vp9_model_pareto8
 * @param nnz         initial nonzero context derived from above/left blocks
 * @param scan        coefficient scan order for this transform type
 * @param nb          per-position pair of already-decoded neighbour scan
 *                    positions used to derive the next nonzero context
 * @param band_counts number of coefficients in each probability band
 * @param qmul        dequant factors: qmul[0] for DC (i == 0), qmul[1] for AC
 * @return the end-of-block position (number of coefficients consumed),
 *         or n_coeffs if the stream never signalled EOB (invalid input)
 */
static av_always_inline int
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                        int is_tx32x32, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, const int16_t *qmul)
{
    int i = 0, band = 0, band_left = band_counts[band];
    uint8_t *tp = p[0][nnz];
    // per-position token magnitude cache (clamped to 5) feeding the
    // neighbour-context derivation below; only positions reachable through
    // scan[]/nb[] are ever read, and nb[] only references already-written ones
    uint8_t cache[1024];

    do {
        int val, rc;

        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
        eob[band][nnz][val]++;
        if (!val)
            break;

    skip_eob:
        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
            cnt[band][nnz][0]++;
            if (!--band_left)
                band_left = band_counts[++band];
            cache[scan[i]] = 0;
            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
            tp = p[band][nnz];
            if (++i == n_coeffs)
                break; //invalid input; blocks should end with EOB
            // after a zero token no EOB bit is coded for the next position
            goto skip_eob;
        }

        rc = scan[i];
        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
            cnt[band][nnz][1]++;
            val = 1;
            cache[rc] = 1;
        } else {
            // fill in p[3-10] (model fill) - only once per frame for each pos
            if (!tp[3])
                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);

            cnt[band][nnz][2]++;
            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
                    cache[rc] = val = 2;
                } else {
                    val = 3 + vp56_rac_get_prob(c, tp[5]);
                    cache[rc] = 3;
                }
            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                cache[rc] = 4;
                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                    // cat1: base 5, 1 extra bit with fixed probability
                    val = 5 + vp56_rac_get_prob(c, 159);
                } else {
                    // cat2: base 7, 2 extra bits
                    val = 7 + (vp56_rac_get_prob(c, 165) << 1);
                    val += vp56_rac_get_prob(c, 145);
                }
            } else { // cat 3-6
                cache[rc] = 5;
                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                        // cat3: base 11, 3 extra bits
                        val = 11 + (vp56_rac_get_prob(c, 173) << 2);
                        val += (vp56_rac_get_prob(c, 148) << 1);
                        val += vp56_rac_get_prob(c, 140);
                    } else {
                        // cat4: base 19, 4 extra bits
                        val = 19 + (vp56_rac_get_prob(c, 176) << 3);
                        val += (vp56_rac_get_prob(c, 155) << 2);
                        val += (vp56_rac_get_prob(c, 140) << 1);
                        val += vp56_rac_get_prob(c, 135);
                    }
                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                    // cat5: base 35, 5 extra bits
                    val = 35 + (vp56_rac_get_prob(c, 180) << 4);
                    val += (vp56_rac_get_prob(c, 157) << 3);
                    val += (vp56_rac_get_prob(c, 141) << 2);
                    val += (vp56_rac_get_prob(c, 134) << 1);
                    val += vp56_rac_get_prob(c, 130);
                } else {
                    // cat6: base 67, 14 extra bits
                    val = 67 + (vp56_rac_get_prob(c, 254) << 13);
                    val += (vp56_rac_get_prob(c, 254) << 12);
                    val += (vp56_rac_get_prob(c, 254) << 11);
                    val += (vp56_rac_get_prob(c, 252) << 10);
                    val += (vp56_rac_get_prob(c, 249) << 9);
                    val += (vp56_rac_get_prob(c, 243) << 8);
                    val += (vp56_rac_get_prob(c, 230) << 7);
                    val += (vp56_rac_get_prob(c, 196) << 6);
                    val += (vp56_rac_get_prob(c, 177) << 5);
                    val += (vp56_rac_get_prob(c, 153) << 4);
                    val += (vp56_rac_get_prob(c, 140) << 3);
                    val += (vp56_rac_get_prob(c, 133) << 2);
                    val += (vp56_rac_get_prob(c, 130) << 1);
                    val += vp56_rac_get_prob(c, 129);
                }
            }
        }
        if (!--band_left)
            band_left = band_counts[++band];
        // sign bit is coded last; !!i selects DC vs AC dequant factor
        if (is_tx32x32)
            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
        else
            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
        tp = p[band][nnz];
    } while (++i < n_coeffs);

    return i;
}
2129
/* Thin wrapper around decode_coeffs_b_generic() for 4x4/8x8/16x16
 * transforms: no coefficient halving (is_tx32x32 = 0). */
static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                           unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                           uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                           const int16_t (*nb)[2], const int16_t *band_counts,
                           const int16_t *qmul)
{
    return decode_coeffs_b_generic(c, coef, n_coeffs, 0 /* !tx32x32 */,
                                   cnt, eob, p, nnz, scan, nb,
                                   band_counts, qmul);
}
2139
/* Thin wrapper around decode_coeffs_b_generic() for 32x32 transforms:
 * decoded values are halved to undo the doubled 32x32 qmul (is_tx32x32 = 1). */
static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                             unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                             uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                             const int16_t (*nb)[2], const int16_t *band_counts,
                             const int16_t *qmul)
{
    return decode_coeffs_b_generic(c, coef, n_coeffs, 1 /* tx32x32 */,
                                   cnt, eob, p, nnz, scan, nb,
                                   band_counts, qmul);
}
2149
/**
 * Decode all luma and chroma coefficients of the current block (s->b at
 * s->row/s->col), updating the above/left nonzero-context lines and the
 * per-sub-block EOB arrays (s->eob, s->uveob) used later by reconstruction.
 */
static void decode_coeffs(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
    // block dimensions in 4x4 units
    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
    // clip to the visible frame area (blocks may overhang the right/bottom edge)
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int n, pl, x, y, res;
    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
    // lossless uses the WHT tables located 4 entries past the DCT ones
    int tx = 4 * s->lossless + b->tx;
    const int16_t * const *yscans = vp9_scans[tx];
    const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
    const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
    const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
    // coefficients per probability band, indexed by tx size; the last band
    // holds the remainder of the transform's coefficients
    static const int16_t band_counts[4][8] = {
        { 1, 2, 3, 4, 3, 16 - 13 },
        { 1, 2, 3, 4, 11, 64 - 21 },
        { 1, 2, 3, 4, 11, 256 - 21 },
        { 1, 2, 3, 4, 11, 1024 - 21 },
    };
    const int16_t *y_band_counts = band_counts[b->tx];
    const int16_t *uv_band_counts = band_counts[b->uvtx];

// collapse per-4x4 nnz context entries into one per transform unit
#define MERGE(la, end, step, rd) \
    for (n = 0; n < end; n += step) \
        la[n] = !!rd(&la[n])
#define MERGE_CTX(step, rd) \
    do { \
        MERGE(l, end_y, step, rd); \
        MERGE(a, end_x, step, rd); \
    } while (0)

// v is the decode_coeffs_b suffix: empty for <=16x16, "32" for 32x32
#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
            res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
                                     c, e, p, a[x] + l[y], yscans[txtp], \
                                     ynbs[txtp], y_band_counts, qmul[0]); \
            a[x] = l[y] = !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->eob[n], res); \
            } else { \
                s->eob[n] = res; \
            } \
        } \
    }

// replicate the merged nnz flag back over the per-4x4 context entries;
// cond is true when the transform unit is fully inside the visible area,
// allowing an aligned full-width store
#define SPLAT(la, end, step, cond) \
    if (step == 2) { \
        for (n = 1; n < end; n += step) \
            la[n] = la[n - 1]; \
    } else if (step == 4) { \
        if (cond) { \
            for (n = 0; n < end; n += step) \
                AV_WN32A(&la[n], la[n] * 0x01010101); \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
        } \
    } else /* step == 8 */ { \
        if (cond) { \
            if (HAVE_FAST_64BIT) { \
                for (n = 0; n < end; n += step) \
                    AV_WN64A(&la[n], la[n] * ULLN(0x0101010101010101)); \
            } else { \
                for (n = 0; n < end; n += step) { \
                    uint32_t v32 = la[n] * 0x01010101; \
                    AV_WN32A(&la[n], v32); \
                    AV_WN32A(&la[n + 4], v32); \
                } \
            } \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
        } \
    }
#define SPLAT_CTX(step) \
    do { \
        SPLAT(a, end_x, step, end_x == w4); \
        SPLAT(l, end_y, step, end_y == h4); \
    } while (0)

    /* y tokens */
    switch (b->tx) {
    case TX_4X4:
        // sub-8x8 intra blocks carry one mode per 4x4, hence mode[n]
        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
        break;
    case TX_8X8:
        MERGE_CTX(2, AV_RN16A);
        DECODE_Y_COEF_LOOP(2, 0,);
        SPLAT_CTX(2);
        break;
    case TX_16X16:
        MERGE_CTX(4, AV_RN32A);
        DECODE_Y_COEF_LOOP(4, 0,);
        SPLAT_CTX(4);
        break;
    case TX_32X32:
        MERGE_CTX(8, AV_RN64A);
        DECODE_Y_COEF_LOOP(8, 0, 32);
        SPLAT_CTX(8);
        break;
    }

#define DECODE_UV_COEF_LOOP(step) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
                                  16 * step * step, c, e, p, a[x] + l[y], \
                                  uvscan, uvnb, uv_band_counts, qmul[1]); \
            a[x] = l[y] = !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->uveob[pl][n], res); \
            } else { \
                s->uveob[pl][n] = res; \
            } \
        } \
    }

    // switch to the chroma probability/counter tables and halve the
    // dimensions (4:2:0 subsampling)
    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
    w4 >>= 1;
    h4 >>= 1;
    end_x >>= 1;
    end_y >>= 1;
    for (pl = 0; pl < 2; pl++) {
        a = &s->above_uv_nnz_ctx[pl][col];
        l = &s->left_uv_nnz_ctx[pl][row & 7];
        switch (b->uvtx) {
        case TX_4X4:
            DECODE_UV_COEF_LOOP(1);
            break;
        case TX_8X8:
            MERGE_CTX(2, AV_RN16A);
            DECODE_UV_COEF_LOOP(2);
            SPLAT_CTX(2);
            break;
        case TX_16X16:
            MERGE_CTX(4, AV_RN32A);
            DECODE_UV_COEF_LOOP(4);
            SPLAT_CTX(4);
            break;
        case TX_32X32:
            MERGE_CTX(8, AV_RN64A);
            // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
            // so there is no need to loop
            res = decode_coeffs_b32(&s->c, s->uvblock[pl],
                                    1024, c, e, p, a[0] + l[0],
                                    uvscan, uvnb, uv_band_counts, qmul[1]);
            a[0] = l[0] = !!res;
            AV_WN16A(&s->uveob[pl][0], res);
            SPLAT_CTX(8);
            break;
        }
    }
}
2314
/**
 * Validate an intra prediction mode against the availability of its
 * neighbouring samples and build the top/left edge arrays for it.
 *
 * Substitutes a fallback mode (e.g. DC_127_PRED) when required neighbours
 * are missing, and fills *a (top edge, may be redirected to existing frame
 * data) and l (left edge, bottom-up order) with real, replicated or
 * constant samples as appropriate.
 *
 * @param s            decoder context
 * @param mode         requested intra mode (0..9)
 * @param a            in: pointer to a caller-provided top-edge buffer;
 *                     out: may be redirected to point directly at frame data
 * @param dst_edge     destination pixels in the frame buffer (block edge)
 * @param stride_edge  stride of dst_edge
 * @param dst_inner    destination pixels in the tile-internal buffer
 * @param stride_inner stride of dst_inner
 * @param l            left-edge buffer to fill (l[0] = bottom-most sample)
 * @param col, x       block column and 4x4 sub-position within the block
 * @param w            block width in transform units (for have_right)
 * @param row, y       block row and 4x4 sub-position within the block
 * @param tx           transform size (edge length is 4 << tx)
 * @param p            plane: 0 = luma, 1/2 = chroma (affects subsampling)
 * @return the possibly-substituted mode actually to be predicted
 */
static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
                                             uint8_t *l, int col, int x, int w,
                                             int row, int y, enum TxfmMode tx,
                                             int p)
{
    int have_top = row > 0 || y > 0;
    int have_left = col > s->tiling.tile_col_start || x > 0;
    int have_right = x < w - 1;
    // mode substitution table: replaces a mode whose required neighbours
    // are unavailable with a constant-DC variant
    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
        [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
                        { DC_127_PRED, VERT_PRED } },
        [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
                       { HOR_PRED, HOR_PRED } },
        [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
                      { LEFT_DC_PRED, DC_PRED } },
        [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
                                  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
        [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
                              { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
        [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
                            { HOR_DOWN_PRED, HOR_DOWN_PRED } },
        [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
                             { DC_127_PRED, VERT_LEFT_PRED } },
        [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
                          { HOR_UP_PRED, HOR_UP_PRED } },
        [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
                          { HOR_PRED, TM_VP8_PRED } },
    };
    // which neighbouring sample groups each (substituted) mode reads
    static const struct {
        uint8_t needs_left:1;
        uint8_t needs_top:1;
        uint8_t needs_topleft:1;
        uint8_t needs_topright:1;
    } edges[N_INTRA_PRED_MODES] = {
        [VERT_PRED] = { .needs_top = 1 },
        [HOR_PRED] = { .needs_left = 1 },
        [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
        [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
        [HOR_UP_PRED] = { .needs_left = 1 },
        [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [LEFT_DC_PRED] = { .needs_left = 1 },
        [TOP_DC_PRED] = { .needs_top = 1 },
        [DC_128_PRED] = { 0 },
        [DC_127_PRED] = { 0 },
        [DC_129_PRED] = { 0 }
    };

    av_assert2(mode >= 0 && mode < 10);
    mode = mode_conv[mode][have_left][have_top];
    if (edges[mode].needs_top) {
        uint8_t *top = NULL;
        uint8_t *topleft = NULL;
        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
        int n_px_need_tr = 0;

        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
            n_px_need_tr = 4;

        // if top of sb64-row, use s->intra_pred_data[] instead of
        // dst[-stride] for intra prediction (it contains pre- instead of
        // post-loopfilter data)
        if (have_top) {
            top = !(row & 7) && !y ?
                s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
            if (have_left)
                topleft = !(row & 7) && !y ?
                    s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
                    &dst_inner[-stride_inner];
        }

        // fast path: the existing row can be used in place when all needed
        // samples are present and topleft is contiguous with top
        if (have_top &&
            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
            n_px_need + n_px_need_tr <= n_px_have) {
            *a = top;
        } else {
            // slow path: assemble the edge in the caller's buffer,
            // replicating the last available sample or using constants
            if (have_top) {
                if (n_px_need <= n_px_have) {
                    memcpy(*a, top, n_px_need);
                } else {
                    memcpy(*a, top, n_px_have);
                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
                           n_px_need - n_px_have);
                }
            } else {
                memset(*a, 127, n_px_need);
            }
            if (edges[mode].needs_topleft) {
                if (have_left && have_top) {
                    (*a)[-1] = topleft[-1];
                } else {
                    (*a)[-1] = have_top ? 129 : 127;
                }
            }
            if (tx == TX_4X4 && edges[mode].needs_topright) {
                if (have_top && have_right &&
                    n_px_need + n_px_need_tr <= n_px_have) {
                    memcpy(&(*a)[4], &top[4], 4);
                } else {
                    memset(&(*a)[4], (*a)[3], 4);
                }
            }
        }
    }
    if (edges[mode].needs_left) {
        if (have_left) {
            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;

            // left edge is stored bottom-up (l[0] = lowest sample)
            if (n_px_need <= n_px_have) {
                for (i = 0; i < n_px_need; i++)
                    l[n_px_need - 1 - i] = dst[i * stride - 1];
            } else {
                for (i = 0; i < n_px_have; i++)
                    l[n_px_need - 1 - i] = dst[i * stride - 1];
                memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
            }
        } else {
            memset(l, 129, 4 << tx);
        }
    }

    return mode;
}
2450
/**
 * Intra-predict and reconstruct the current block (s->b at s->row/s->col):
 * per transform unit, build prediction edges via check_intra_mode(), run the
 * intra predictor and, when the unit has coefficients (eob != 0), add the
 * inverse transform. Handles luma, then both chroma planes.
 *
 * @param y_off  byte offset of the block into the frame's luma plane
 * @param uv_off byte offset of the block into the frame's chroma planes
 */
static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    // w4/h4: block size in 4x4 units; step1d: transform size in 4x4 units;
    // step: coefficients-index stride per transform unit (in 4x4 units^2)
    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
    int uvstep1d = 1 << b->uvtx, p;
    // dst: tile-internal buffer; dst_r: actual frame buffer (used as the
    // edge source when predicting from outside the current block row)
    uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;

    LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
    LOCAL_ALIGNED_32(uint8_t, l, [32]);

    for (n = 0, y = 0; y < end_y; y += step1d) {
        uint8_t *ptr = dst, *ptr_r = dst_r;
        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
             ptr_r += 4 * step1d, n += step) {
            // sub-8x8 blocks carry one mode per 4x4 sub-unit
            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                               y * 2 + x : 0];
            uint8_t *a = &a_buf[32];
            enum TxfmType txtp = vp9_intra_txfm_type[mode];
            // tx > 8x8 stores 16-bit EOBs (see decode_coeffs)
            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

            mode = check_intra_mode(s, mode, &a, ptr_r,
                                    s->frames[CUR_FRAME].tf.f->linesize[0],
                                    ptr, s->y_stride, l,
                                    col, x, w4, row, y, b->tx, 0);
            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
            if (eob)
                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
                                           s->block + 16 * n, eob);
        }
        dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
        dst += 4 * step1d * s->y_stride;
    }

    // U/V (half resolution; chroma always uses DCT_DCT)
    w4 >>= 1;
    end_x >>= 1;
    end_y >>= 1;
    step = 1 << (b->uvtx * 2);
    for (p = 0; p < 2; p++) {
        dst = s->dst[1 + p];
        dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
            uint8_t *ptr = dst, *ptr_r = dst_r;
            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
                 ptr_r += 4 * uvstep1d, n += step) {
                int mode = b->uvmode;
                uint8_t *a = &a_buf[16];
                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                mode = check_intra_mode(s, mode, &a, ptr_r,
                                        s->frames[CUR_FRAME].tf.f->linesize[1],
                                        ptr, s->uv_stride, l,
                                        col, x, w4, row, y, b->uvtx, p + 1);
                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                if (eob)
                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                    s->uvblock[p] + 16 * n, eob);
            }
            dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
            dst += 4 * uvstep1d * s->uv_stride;
        }
    }
}
2520
mc_luma_dir(VP9Context * s,vp9_mc_func (* mc)[2],uint8_t * dst,ptrdiff_t dst_stride,const uint8_t * ref,ptrdiff_t ref_stride,ThreadFrame * ref_frame,ptrdiff_t y,ptrdiff_t x,const VP56mv * mv,int bw,int bh,int w,int h)2521 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2522 uint8_t *dst, ptrdiff_t dst_stride,
2523 const uint8_t *ref, ptrdiff_t ref_stride,
2524 ThreadFrame *ref_frame,
2525 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2526 int bw, int bh, int w, int h)
2527 {
2528 int mx = mv->x, my = mv->y, th;
2529
2530 y += my >> 3;
2531 x += mx >> 3;
2532 ref += y * ref_stride + x;
2533 mx &= 7;
2534 my &= 7;
2535 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2536 // we use +7 because the last 7 pixels of each sbrow can be changed in
2537 // the longest loopfilter of the next sbrow
2538 th = (y + bh + 4 * !!my + 7) >> 6;
2539 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2540 if (x < !!mx * 3 || y < !!my * 3 ||
2541 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2542 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2543 ref - !!my * 3 * ref_stride - !!mx * 3,
2544 80, ref_stride,
2545 bw + !!mx * 7, bh + !!my * 7,
2546 x - !!mx * 3, y - !!my * 3, w, h);
2547 ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2548 ref_stride = 80;
2549 }
2550 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2551 }
2552
mc_chroma_dir(VP9Context * s,vp9_mc_func (* mc)[2],uint8_t * dst_u,uint8_t * dst_v,ptrdiff_t dst_stride,const uint8_t * ref_u,ptrdiff_t src_stride_u,const uint8_t * ref_v,ptrdiff_t src_stride_v,ThreadFrame * ref_frame,ptrdiff_t y,ptrdiff_t x,const VP56mv * mv,int bw,int bh,int w,int h)2553 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2554 uint8_t *dst_u, uint8_t *dst_v,
2555 ptrdiff_t dst_stride,
2556 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2557 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2558 ThreadFrame *ref_frame,
2559 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2560 int bw, int bh, int w, int h)
2561 {
2562 int mx = mv->x, my = mv->y, th;
2563
2564 y += my >> 4;
2565 x += mx >> 4;
2566 ref_u += y * src_stride_u + x;
2567 ref_v += y * src_stride_v + x;
2568 mx &= 15;
2569 my &= 15;
2570 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2571 // we use +7 because the last 7 pixels of each sbrow can be changed in
2572 // the longest loopfilter of the next sbrow
2573 th = (y + bh + 4 * !!my + 7) >> 5;
2574 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2575 if (x < !!mx * 3 || y < !!my * 3 ||
2576 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2577 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2578 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2579 80, src_stride_u,
2580 bw + !!mx * 7, bh + !!my * 7,
2581 x - !!mx * 3, y - !!my * 3, w, h);
2582 ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2583 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2584
2585 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2586 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2587 80, src_stride_v,
2588 bw + !!mx * 7, bh + !!my * 7,
2589 x - !!mx * 3, y - !!my * 3, w, h);
2590 ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2591 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2592 } else {
2593 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2594 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2595 }
2596 }
2597
/**
 * Inter-predict and reconstruct the current block (s->b at s->row/s->col):
 * motion-compensate luma and chroma from one or (for compound prediction)
 * two reference frames, then add the residual inverse transforms unless
 * the block is coded as skipped.
 */
static void inter_recon(AVCodecContext *ctx)
{
    // log2 of mc[] function index per block size; [0] = luma, [1] = chroma
    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
    };
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
    ThreadFrame *tref2 = NULL;
    AVFrame *ref1 = tref1->f;
    AVFrame *ref2 = NULL;
    int w1 = ref1->width, h1 = ref1->height;
    int w2 = 0;
    int h2 = 0;
    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;

    // second reference only exists for compound prediction
    if (b->comp) {
        tref2 = &s->refs[s->refidx[b->ref[1]]];
        ref2 = tref2->f;
        w2 = ref2->width;
        h2 = ref2->height;
    }

    // y inter pred
    if (b->bs > BS_8x8) {
        // sub-8x8 blocks: one MV per sub-partition (b->mv[0..3])
        if (b->bs == BS_8x4) {
            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
                        s->dst[0] + 4 * ls_y, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
                            s->dst[0] + 4 * ls_y, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
            }
        } else if (b->bs == BS_4x8) {
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
            }
        } else {
            av_assert2(b->bs == BS_4x4);

            // FIXME if two horizontally adjacent blocks have the same MV,
            // do a w8 instead of a w4 call
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                        s->dst[0] + 4 * ls_y, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                        s->dst[0] + 4 * ls_y + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                            s->dst[0] + 4 * ls_y, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                            s->dst[0] + 4 * ls_y + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
            }
        }
    } else {
        // blocks >= 8x8: a single MV covers the whole block
        int bwl = bwlog_tab[0][b->bs];
        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;

        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
                    ref1->data[0], ref1->linesize[0], tref1,
                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);

        if (b->comp)
            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
                        ref2->data[0], ref2->linesize[0], tref2,
                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
    }

    // uv inter pred
    {
        int bwl = bwlog_tab[1][b->bs];
        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
        VP56mv mvuv;

        // chroma planes are half size (4:2:0)
        w1 = (w1 + 1) >> 1;
        h1 = (h1 + 1) >> 1;
        if (b->comp) {
            w2 = (w2 + 1) >> 1;
            h2 = (h2 + 1) >> 1;
        }
        if (b->bs > BS_8x8) {
            // sub-8x8: chroma uses the rounded average of the 4 luma MVs
            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
        } else {
            mvuv = b->mv[0][0];
        }

        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
                      s->dst[1], s->dst[2], ls_uv,
                      ref1->data[1], ref1->linesize[1],
                      ref1->data[2], ref1->linesize[2], tref1,
                      row << 2, col << 2, &mvuv, bw, bh, w1, h1);

        if (b->comp) {
            if (b->bs > BS_8x8) {
                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
            } else {
                mvuv = b->mv[0][1];
            }
            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
                          s->dst[1], s->dst[2], ls_uv,
                          ref2->data[1], ref2->linesize[1],
                          ref2->data[2], ref2->linesize[2], tref2,
                          row << 2, col << 2, &mvuv, bw, bh, w2, h2);
        }
    }

    if (!b->skip) {
        /* residual add loops, mostly copied from intra_recon() */

        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
        int end_x = FFMIN(2 * (s->cols - col), w4);
        int end_y = FFMIN(2 * (s->rows - row), h4);
        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
        int uvstep1d = 1 << b->uvtx, p;
        uint8_t *dst = s->dst[0];

        // y itxfm add (inter residuals always use DCT_DCT)
        for (n = 0, y = 0; y < end_y; y += step1d) {
            uint8_t *ptr = dst;
            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
                // tx > 8x8 stores 16-bit EOBs (see decode_coeffs)
                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

                if (eob)
                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
                                                  s->block + 16 * n, eob);
            }
            dst += 4 * s->y_stride * step1d;
        }

        // uv itxfm add
        end_x >>= 1;
        end_y >>= 1;
        step = 1 << (b->uvtx * 2);
        for (p = 0; p < 2; p++) {
            dst = s->dst[p + 1];
            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
                uint8_t *ptr = dst;
                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                    if (eob)
                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                        s->uvblock[p] + 16 * n, eob);
                }
                dst += 4 * uvstep1d * s->uv_stride;
            }
        }
    }
}
2794
/*
 * Accumulate loopfilter edge masks for one block into the superblock-level
 * VP9Filter structure. For each 8px row y, bits in
 * lflvl->mask[is_uv][0=row edges, 1=col edges][y][id] mark columns that need
 * filtering, where id selects the filter width (0=16px, 1=8px, 2=4px,
 * 3=inner 4px edge). row_and_7/col_and_7 are the block position within the
 * 64x64 superblock; w/h the block size in 8px units (presumably already
 * subsampled for UV — confirm against caller); col_end/row_end flag a
 * partial block at the visible frame edge.
 */
static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
                                        int row_and_7, int col_and_7,
                                        int w, int h, int col_end, int row_end,
                                        enum TxfmMode tx, int skip_inter)
{
    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)

    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
    // edges. This means that for UV, we work on two subsampled blocks at
    // a time, and we only use the topleft block's mode information to set
    // things like block strength. Thus, for any block size smaller than
    // 16x16, ignore the odd portion of the block.
    if (tx == TX_4X4 && is_uv) {
        if (h == 1) {
            if (row_and_7 & 1)
                return;
            if (!row_end)
                h += 1;
        }
        if (w == 1) {
            if (col_and_7 & 1)
                return;
            if (!col_end)
                w += 1;
        }
    }

    if (tx == TX_4X4 && !skip_inter) {
        // t = bit for this block's first column, m_col = bits for all w columns
        int t = 1 << col_and_7, m_col = (t << w) - t, y;
        int m_col_odd = (t << (w - 1)) - t;

        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
        if (is_uv) {
            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                int col_mask_id = 2 - !(y & 7);

                lflvl->mask[is_uv][0][y][1] |= m_row_8;
                lflvl->mask[is_uv][0][y][2] |= m_row_4;
                // for odd lines, if the odd col is not being filtered,
                // skip odd row also:
                // .---. <-- a
                // |   |
                // |___| <-- b
                // ^   ^
                // c   d
                //
                // if a/c are even row/col and b/d are odd, and d is skipped,
                // e.g. right edge of size-66x66.webm, then skip b also (bug)
                if ((col_end & 1) && (y & 1)) {
                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
                } else {
                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
                }
            }
        } else {
            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                int col_mask_id = 2 - !(y & 3);

                lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
                lflvl->mask[is_uv][0][y][2] |= m_row_4;
                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
                lflvl->mask[is_uv][0][y][3] |= m_col;           // inner 4px edges
                lflvl->mask[is_uv][1][y][3] |= m_col;
            }
        }
    } else {
        int y, t = 1 << col_and_7, m_col = (t << w) - t;

        if (!skip_inter) {
            int mask_id = (tx == TX_8X8);
            int l2 = tx + is_uv - 1, step1d = 1 << l2;
            // pick every step1d-th column bit out of m_col
            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
            int m_row = m_col & masks[l2];

            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
            // 8wd loopfilter to prevent going off the visible edge.
            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                int m_row_8 = m_row - m_row_16;

                for (y = row_and_7; y < h + row_and_7; y++) {
                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
                }
            } else {
                for (y = row_and_7; y < h + row_and_7; y++)
                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
            }

            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                // same as above, but for the bottom row edge
                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
                    lflvl->mask[is_uv][1][y][0] |= m_col;
                if (y - row_and_7 == h - 1)
                    lflvl->mask[is_uv][1][y][1] |= m_col;
            } else {
                for (y = row_and_7; y < h + row_and_7; y += step1d)
                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
            }
        } else if (tx != TX_4X4) {
            int mask_id;

            // skipped inter block: only filter the block's outer edges
            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
            for (y = row_and_7; y < h + row_and_7; y++)
                lflvl->mask[is_uv][0][y][mask_id] |= t;
        } else if (is_uv) {
            int t8 = t & 0x01, t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                lflvl->mask[is_uv][0][y][2] |= t4;
                lflvl->mask[is_uv][0][y][1] |= t8;
            }
            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
        } else {
            int t8 = t & 0x11, t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                lflvl->mask[is_uv][0][y][2] |= t4;
                lflvl->mask[is_uv][0][y][1] |= t8;
            }
            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
        }
    }
}
2927
/*
 * Decode one block: parse mode and coefficients (pass < 2), reconstruct
 * (intra or inter), handle edge emulation when the block overhangs the
 * visible frame, and record loopfilter levels/edge masks.
 * In 2-pass (frame-threaded) mode, pass 1 only parses and pass 2 only
 * reconstructs; both advance the shared per-block buffers.
 */
static void decode_b(AVCodecContext *ctx, int row, int col,
                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                     enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    enum BlockSize bs = bl * 3 + bp;
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->frames[CUR_FRAME].tf.f;

    s->row = row;
    s->row7 = row & 7;
    s->col = col;
    s->col7 = col & 7;
    // motion vector clamping range for this block position
    s->min_mv.x = -(128 + col * 64);
    s->min_mv.y = -(128 + row * 64);
    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(ctx);
        // chroma tx size: one step smaller when the luma tx spans the block
        b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));

        if (!b->skip) {
            decode_coeffs(ctx);
        } else {
            int row7 = s->row7;

            // skipped block: clear the non-zero-coefficient contexts that
            // decode_coeffs() would otherwise have updated
#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1:  v = 0;          break; \
    case 2:  AV_ZERO16(&v);  break; \
    case 4:  AV_ZERO32(&v);  break; \
    case 8:  AV_ZERO64(&v);  break; \
    case 16: AV_ZERO128(&v); break; \
    }
#define SPLAT_ZERO_YUV(dir, var, off, n) \
    do { \
        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
    } while (0)

            switch (w4) {
            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
            }
        }
        if (s->pass == 1) {
            // parse-only pass: advance per-block storage and stop here
            s->b++;
            s->block += w4 * h4 * 64;
            s->uvblock[0] += w4 * h4 * 16;
            s->uvblock[1] += w4 * h4 * 16;
            s->eob += 4 * w4 * h4;
            s->uveob[0] += w4 * h4;
            s->uveob[1] += w4 * h4;

            return;
        }
    }

    // emulated overhangs if the stride of the target buffer can't hold. This
    // allows to support emu-edge and so on even if we have large block
    // overhangs
    emu[0] = (col + w4) * 8 > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = (col + w4) * 4 > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
        s->y_stride = 64;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
        s->uv_stride = 32;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        intra_recon(ctx, yoff, uvoff);
    } else {
        inter_recon(ctx);
    }
    if (emu[0]) {
        // copy the visible part of the emulated luma block back to the frame,
        // in power-of-two-wide strips (mc[n] copies 64>>n pixels)
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
                                         s->tmp_y + o, 64, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        // same for the two chroma planes (half resolution, stride 32)
        int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;

        for (n = 1; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
                                         s->tmp_uv[0] + o, 32, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
                                         s->tmp_uv[1] + o, 32, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to
    if (s->filter.level &&
        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                    [b->mode[3] != ZEROMV]) > 0) {
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;

        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
        mask_edges(lflvl, 1, row7, col7, x_end, y_end,
                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                   b->uvtx, skip_inter);

        // lazily fill the E/I limit LUTs for this filter level
        if (!s->filter.lim_lut[lvl]) {
            int sharp = s->filter.sharpness;
            int limit = lvl;

            if (sharp > 0) {
                limit >>= (sharp + 3) >> 2;
                limit = FFMIN(limit, 9 - sharp);
            }
            limit = FFMAX(limit, 1);

            s->filter.lim_lut[lvl] = limit;
            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
        }
    }

    if (s->pass == 2) {
        // reconstruction pass: advance per-block storage in lockstep with pass 1
        s->b++;
        s->block += w4 * h4 * 64;
        s->uvblock[0] += w4 * h4 * 16;
        s->uvblock[1] += w4 * h4 * 16;
        s->eob += 4 * w4 * h4;
        s->uveob[0] += w4 * h4;
        s->uveob[1] += w4 * h4;
    }
}
3098
/*
 * Recursively decode one superblock level: read the partition symbol from
 * the range coder and descend into none/horizontal/vertical/split blocks.
 * Partial blocks at the right/bottom frame edge restrict which partitions
 * are codable, hence the reduced-probability branches below.
 */
static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    VP9Context *s = ctx->priv_data;
    // partition context from above/left neighbours selects the prob set
    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
    const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
                                     s->prob.p.partition[bl][c];
    enum BlockPartition bp;
    ptrdiff_t hbs = 4 >> bl; // half block size in 8px units
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];

    if (bl == BL_8X8) {
        bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
        decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
    } else if (col + hbs < s->cols) { // FIXME why not <=?
        if (row + hbs < s->rows) { // FIXME why not <=?
            bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
            switch (bp) {
            case PARTITION_NONE:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_H:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_V:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8;
                uvoff += hbs * 4;
                decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_SPLIT:
                decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row, col + hbs, lflvl,
                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row + hbs, col + hbs, lflvl,
                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
                break;
            default:
                av_assert0(0);
            }
        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
            // bottom half is off-frame: only SPLIT or H are possible
            bp = PARTITION_SPLIT;
            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
            decode_sb(ctx, row, col + hbs, lflvl,
                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
        } else {
            bp = PARTITION_H;
            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else if (row + hbs < s->rows) { // FIXME why not <=?
        // right half is off-frame: only SPLIT or V are possible
        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
            bp = PARTITION_SPLIT;
            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        } else {
            bp = PARTITION_V;
            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else {
        // both halves off-frame: SPLIT is implied, no symbol coded
        bp = PARTITION_SPLIT;
        decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
    }
    s->counts.partition[bl][c][bp]++;
}
3173
/*
 * Pass-2 counterpart of decode_sb(): replay the partition tree recorded in
 * the per-block array during pass 1 (b->bl/b->bp) instead of reading symbols
 * from the bitstream, and reconstruct the blocks.
 */
static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    ptrdiff_t hbs = 4 >> bl; // half block size in 8px units
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];

    if (bl == BL_8X8) {
        av_assert2(b->bl == BL_8X8);
        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
    } else if (s->b->bl == bl) {
        // recorded partition bottoms out at this level
        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
        if (b->bp == PARTITION_H && row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
        } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
            yoff  += hbs * 8;
            uvoff += hbs * 4;
            decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
        }
    } else {
        // recorded split: recurse into the quadrants that lie inside the frame
        decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
        if (col + hbs < s->cols) { // FIXME why not <=?
            if (row + hbs < s->rows) {
                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
                              uvoff + 4 * hbs, bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
                              yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
            } else {
                yoff  += hbs * 8;
                uvoff += hbs * 4;
                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
            }
        } else if (row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        }
    }
}
3220
/*
 * Apply the in-loop deblocking filter to one 64x64 superblock, driven by the
 * edge masks and per-8x8 filter levels collected in lflvl by mask_edges() /
 * decode_b(). Column (vertical) edges are filtered before row (horizontal)
 * edges, Y plane first, then both chroma planes. Mask bit x corresponds to
 * an 8px column within the superblock; mask index 0/1/2 selects 16/8/4px
 * filter width and index 3 the inner 4px edges.
 */
static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
{
    VP9Context *s = ctx->priv_data;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
    int y, x, p;

    // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
    // if you think of them as acting on a 8x8 block max, we can interleave
    // each v/h within the single x loop, but that only works if we work on
    // 8 pixel blocks, and we won't always do that (we want at least 16px
    // to use SSE2 optimizations, perhaps 32 for AVX2)

    // filter edges between columns, Y plane (e.g. block1 | block2)
    // processes two 8px rows per iteration so pairs of edges can be merged
    // into one mix2 call
    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;

        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
            if (hm1 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) { // skip the leftmost frame edge
                    if (hmask1[0] & x) {
                        if (hmask2[0] & x) {
                            av_assert2(l[8] == L);
                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
                        }
                    } else if (hm2 & x) {
                        // merge this row's edge with the one below into a
                        // single dual-edge (mix2) call
                        L = l[8];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                               [!!(hmask2[1] & x)]
                                               [0](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                            [0](ptr, ls_y, E, I, H);
                    }
                }
            } else if (hm2 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) {
                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
                }
            }
            // inner 4px column edges (mask index 3), offset by 4 pixels
            if (hm13 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (hm23 & x) {
                    L = l[8];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
                }
            } else if (hm23 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
            }
        }
    }

    //                                          block1
    // filter edges between rows, Y plane (e.g. ------)
    //                                          block2
    dst = f->data[0] + yoff;
    lvl = lflvl->level;
    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];

        // step two columns at a time so horizontally adjacent edges can be
        // merged into one mix2 call
        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
            if (row || y) { // skip the topmost frame edge
                if (vm & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    if (vmask[0] & x) {
                        if (vmask[0] & (x << 1)) {
                            av_assert2(l[1] == L);
                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
                        }
                    } else if (vm & (x << 1)) {
                        L = l[1];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                               [!!(vmask[1] & (x << 1))]
                                               [1](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                            [1](ptr, ls_y, E, I, H);
                    }
                } else if (vm & (x << 1)) {
                    int L = l[1], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
                                        [1](ptr + 8, ls_y, E, I, H);
                }
            }
            // inner 4px row edges (mask index 3), offset by 4 lines
            if (vm3 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (vm3 & (x << 1)) {
                    L = l[1];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
                }
            } else if (vm3 & (x << 1)) {
                int L = l[1], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
            }
        }
    }

    // same principle but for U/V planes
    for (p = 0; p < 2; p++) {
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        // chroma is subsampled: 4 mask rows per 16 chroma lines
        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;

            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
                if (col || x > 1) {
                    if (hm1 & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (hmask1[0] & x) {
                            if (hmask2[0] & x) {
                                av_assert2(l[16] == L);
                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
                            }
                        } else if (hm2 & x) {
                            L = l[16];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                   [!!(hmask2[1] & x)]
                                                   [0](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                                [0](ptr, ls_uv, E, I, H);
                        }
                    } else if (hm2 & x) {
                        int L = l[16], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
                    }
                }
                // two mask bits map onto one level entry for chroma
                if (x & 0xAA)
                    l += 2;
            }
        }
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
            unsigned vm = vmask[0] | vmask[1] | vmask[2];

            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
                if (row || y) {
                    if (vm & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (vmask[0] & x) {
                            if (vmask[0] & (x << 2)) {
                                av_assert2(l[2] == L);
                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
                            }
                        } else if (vm & (x << 2)) {
                            L = l[2];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                                   [!!(vmask[1] & (x << 2))]
                                                   [1](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                                [1](ptr, ls_uv, E, I, H);
                        }
                    } else if (vm & (x << 2)) {
                        int L = l[2], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
                                            [1](ptr + 8, ls_uv, E, I, H);
                    }
                }
            }
            // advance to the next 8-entry level row every other mask row
            if (y & 1)
                lvl += 16;
        }
    }
}
3457
/*
 * Compute the pixel range [*start, *end) covered by tile number idx out of
 * 1 << log2_n tiles, given n superblocks in that dimension. Superblock
 * indices are clamped to n and converted to pixels (1 sb = 8 px = << 3).
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_first = (idx * n) >> log2_n;
    int sb_last  = ((idx + 1) * n) >> log2_n;

    if (sb_first > n)
        sb_first = n;
    if (sb_last > n)
        sb_last = n;

    *start = sb_first << 3;
    *end   = sb_last << 3;
}
3465
adapt_prob(uint8_t * p,unsigned ct0,unsigned ct1,int max_count,int update_factor)3466 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3467 int max_count, int update_factor)
3468 {
3469 unsigned ct = ct0 + ct1, p2, p1;
3470
3471 if (!ct)
3472 return;
3473
3474 p1 = *p;
3475 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3476 p2 = av_clip(p2, 1, 255);
3477 ct = FFMIN(ct, max_count);
3478 update_factor = FASTDIV(update_factor * ct, max_count);
3479
3480 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3481 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3482 }
3483
/*
 * Backward probability adaptation: after decoding a frame, blend every
 * probability in the selected frame context towards the symbol counts
 * gathered in s->counts. Keyframes/intra-only frames only adapt the
 * coefficient probabilities; all other groups are restored from the parsed
 * header values in that case.
 */
static void adapt_probs(VP9Context *s)
{
    int i, j, k, l, m;
    prob_context *p = &s->prob_ctx[s->framectxid].p;
    // weaker update (112/256) right after a keyframe/intra/reset
    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;

    // coefficients
    for (i = 0; i < 4; i++)
        for (j = 0; j < 2; j++)
            for (k = 0; k < 2; k++)
                for (l = 0; l < 6; l++)
                    for (m = 0; m < 6; m++) {
                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
                        unsigned *e = s->counts.eob[i][j][k][l][m];
                        unsigned *c = s->counts.coef[i][j][k][l][m];

                        if (l == 0 && m >= 3) // dc only has 3 pt
                            break;

                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
                    }

    if (s->keyframe || s->intraonly) {
        // intra frames code no inter symbols; keep the header-parsed values
        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
        return;
    }

    // skip flag
    for (i = 0; i < 3; i++)
        adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);

    // intra/inter flag
    for (i = 0; i < 4; i++)
        adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);

    // comppred flag
    if (s->comppredmode == PRED_SWITCHABLE) {
      for (i = 0; i < 5; i++)
          adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
    }

    // reference frames
    if (s->comppredmode != PRED_SINGLEREF) {
      for (i = 0; i < 5; i++)
          adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
                     s->counts.comp_ref[i][1], 20, 128);
    }

    if (s->comppredmode != PRED_COMPREF) {
      for (i = 0; i < 5; i++) {
          uint8_t *pp = p->single_ref[i];
          unsigned (*c)[2] = s->counts.single_ref[i];

          adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
          adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
      }
    }

    // block partitioning
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++) {
            uint8_t *pp = p->partition[i][j];
            unsigned *c = s->counts.partition[i][j];

            // tree order: none / horizontal / vertical / split
            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
            adapt_prob(&pp[2], c[2], c[3], 20, 128);
        }

    // tx size
    if (s->txfmmode == TX_SWITCHABLE) {
      for (i = 0; i < 2; i++) {
          unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];

          adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
          adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
          adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
          adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
          adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
          adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
      }
    }

    // interpolation filter
    if (s->filtermode == FILTER_SWITCHABLE) {
        for (i = 0; i < 4; i++) {
            uint8_t *pp = p->filter[i];
            unsigned *c = s->counts.filter[i];

            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
            adapt_prob(&pp[1], c[1], c[2], 20, 128);
        }
    }

    // inter modes
    for (i = 0; i < 7; i++) {
        uint8_t *pp = p->mv_mode[i];
        unsigned *c = s->counts.mv_mode[i];

        // tree order: zeromv / nearestmv / nearmv / newmv
        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
        adapt_prob(&pp[2], c[1], c[3], 20, 128);
    }

    // mv joints
    {
        uint8_t *pp = p->mv_joint;
        unsigned *c = s->counts.mv_joint;

        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
        adapt_prob(&pp[2], c[2], c[3], 20, 128);
    }

    // mv components
    for (i = 0; i < 2; i++) {
        uint8_t *pp;
        unsigned *c, (*c2)[2], sum;

        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
                   s->counts.mv_comp[i].sign[1], 20, 128);

        // class tree: each node's "other" weight is the sum of the classes
        // still reachable below it, hence the running `sum` subtraction
        pp = p->mv_comp[i].classes;
        c = s->counts.mv_comp[i].classes;
        sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
        adapt_prob(&pp[0], c[0], sum, 20, 128);
        sum -= c[1];
        adapt_prob(&pp[1], c[1], sum, 20, 128);
        sum -= c[2] + c[3];
        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
        adapt_prob(&pp[3], c[2], c[3], 20, 128);
        sum -= c[4] + c[5];
        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
        adapt_prob(&pp[5], c[4], c[5], 20, 128);
        sum -= c[6];
        adapt_prob(&pp[6], c[6], sum, 20, 128);
        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
        adapt_prob(&pp[8], c[7], c[8], 20, 128);
        adapt_prob(&pp[9], c[9], c[10], 20, 128);

        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
                   s->counts.mv_comp[i].class0[1], 20, 128);
        pp = p->mv_comp[i].bits;
        c2 = s->counts.mv_comp[i].bits;
        for (j = 0; j < 10; j++)
            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);

        for (j = 0; j < 2; j++) {
            pp = p->mv_comp[i].class0_fp[j];
            c = s->counts.mv_comp[i].class0_fp[j];
            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
            adapt_prob(&pp[2], c[2], c[3], 20, 128);
        }
        pp = p->mv_comp[i].fp;
        c = s->counts.mv_comp[i].fp;
        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
        adapt_prob(&pp[2], c[2], c[3], 20, 128);

        if (s->highprecisionmvs) {
            adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
                       s->counts.mv_comp[i].hp[1], 20, 128);
        }
    }

    // y intra modes
    for (i = 0; i < 4; i++) {
        uint8_t *pp = p->y_mode[i];
        unsigned *c = s->counts.y_mode[i], sum, s2;

        // same running-sum scheme as above, walking the intra-mode tree
        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
        sum -= c[TM_VP8_PRED];
        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
        sum -= c[VERT_PRED];
        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
        sum -= s2;
        adapt_prob(&pp[3], s2, sum, 20, 128);
        s2 -= c[HOR_PRED];
        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
        sum -= c[DIAG_DOWN_LEFT_PRED];
        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
        sum -= c[VERT_LEFT_PRED];
        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
    }

    // uv intra modes
    for (i = 0; i < 10; i++) {
        uint8_t *pp = p->uv_mode[i];
        unsigned *c = s->counts.uv_mode[i], sum, s2;

        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
        sum -= c[TM_VP8_PRED];
        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
        sum -= c[VERT_PRED];
        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
        sum -= s2;
        adapt_prob(&pp[3], s2, sum, 20, 128);
        s2 -= c[HOR_PRED];
        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
        sum -= c[DIAG_DOWN_LEFT_PRED];
        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
        sum -= c[VERT_LEFT_PRED];
        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
    }
}
3705
/*
 * Release the decoder's per-frame scratch allocations. av_freep() also
 * resets each field to NULL, so calling this twice is safe.
 */
static void free_buffers(VP9Context *s)
{
    void *const fields[] = {
        &s->intra_pred_data[0],
        &s->b_base,
        &s->block_base,
    };
    size_t i;

    for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
        av_freep(fields[i]);
}
3712
/*
 * Codec close callback: release the current/last frame pair, all 8 reference
 * slots (both the active and the pending "next" set), the scratch buffers,
 * and the per-tile range coder array.
 */
static av_cold int vp9_decode_free(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int i;

    for (i = 0; i < 2; i++) {
        // only unref frames that actually hold a buffer
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(ctx, &s->frames[i]);
        av_frame_free(&s->frames[i].tf.f);
    }
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        av_frame_free(&s->refs[i].f);
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        av_frame_free(&s->next_refs[i].f);
    }
    free_buffers(s);
    av_freep(&s->c_b);
    s->c_b_size = 0;

    return 0;
}
3737
3738
/**
 * Decode one VP9 packet into (at most) one output frame.
 *
 * Flow: parse the frame header; if the packet merely signals "show an
 * existing frame", return a new reference to that frame and stop.
 * Otherwise allocate CUR_FRAME, prepare the eight next-reference slots,
 * decode all tiles (in two passes when frame-threading with backward
 * context updates is active), loop-filter each superblock row, adapt
 * probabilities, and finally rotate the reference slots.
 *
 * @param ctx       codec context (priv_data is VP9Context)
 * @param frame     output AVFrame (as void* per the decode callback ABI)
 * @param got_frame set to 1 when an output frame was produced
 * @param pkt       input packet
 * @return 0 on success, negative AVERROR on failure
 */
static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                            int *got_frame, AVPacket *pkt)
{
    const uint8_t *data = pkt->data;
    int size = pkt->size;
    VP9Context *s = ctx->priv_data;
    int res, tile_row, tile_col, i;
    int ref = 0;
    int row, col;
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
    AVFrame *f;

    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
        return res;
    } else if (res == 0) {
        // res == 0: "show existing frame" — output reference `ref` directly,
        // no bitstream decoding happens for this packet
        if (!s->refs[ref].f->data[0]) {
            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        }
        if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
            return res;
        *got_frame = 1;
        return 0;
    }
    // skip the header bytes consumed by decode_frame_header()
    data += res;
    size -= res;

    // keep the previous CUR_FRAME around as LAST_FRAME for inter prediction
    if (s->frames[LAST_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
    if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
        (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
        return res;
    if (s->frames[CUR_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
    if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
        return res;
    f = s->frames[CUR_FRAME].tf.f;
    f->key_frame = s->keyframe;
    f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    ls_y = f->linesize[0];
    ls_uv = f->linesize[1];

    // ref frame setup: each next_refs[i] points at CUR_FRAME where the
    // refresh mask says so, otherwise carries the old reference over
    for (i = 0; i < 8; i++) {
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        if (s->refreshrefmask & (1 << i)) {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
        } else {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
        }
        if (res < 0)
            return res;
    }

    // main tile decode loop: reset the above-row contexts for the whole frame
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->keyframe || s->intraonly) {
        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
    } else {
        memset(s->above_mode_ctx, NEARESTMV, s->cols);
    }
    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
    memset(s->above_segpred_ctx, 0, s->cols);
    // two-pass mode is used when frame-threading and the frame updates the
    // probability context from decoded symbols (i.e. not parallel mode)
    s->pass = s->uses_2pass =
        ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
    if ((res = update_block_buffers(ctx)) < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Failed to allocate block buffers\n");
        return res;
    }
    if (s->refreshctx && s->parallelmode) {
        // parallel mode: forward-update the stored probability context now,
        // before tile decoding, so other frame threads can proceed
        int j, k, l, m;

        for (i = 0; i < 4; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++)
                            memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
                                   s->prob.coef[i][j][k][l][m], 3);
            // only coef probs up to the signalled max txfm size are updated
            if (s->txfmmode == i)
                break;
        }
        s->prob_ctx[s->framectxid].p = s->prob.p;
        ff_thread_finish_setup(ctx);
    }

    do {
        // (re)start from the top-left for each pass
        yoff = uvoff = 0;
        s->b = s->b_base;
        s->block = s->block_base;
        s->uvblock[0] = s->uvblock_base[0];
        s->uvblock[1] = s->uvblock_base[1];
        s->eob = s->eob_base;
        s->uveob[0] = s->uveob_base[0];
        s->uveob[1] = s->uveob_base[1];

        for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
            set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
                            tile_row, s->tiling.log2_tile_rows, s->sb_rows);
            if (s->pass != 2) {
                // pass 0/1: parse the per-tile sizes and init one range
                // coder per tile column
                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                    unsigned tile_size;

                    if (tile_col == s->tiling.tile_cols - 1 &&
                        tile_row == s->tiling.tile_rows - 1) {
                        // last tile has no explicit size; it spans the rest
                        tile_size = size;
                    } else {
                        tile_size = AV_RB32(data);
                        data += 4;
                        size -= 4;
                    }
                    // NOTE(review): `size` is signed and may already have gone
                    // negative on a truncated packet, in which case this
                    // unsigned compare would not catch it — verify upstream
                    // guarantees enough bytes for the 4-byte size fields.
                    if (tile_size > size) {
                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    data += tile_size;
                    size -= tile_size;
                }
            }

            // decode one superblock (64x64) row at a time across all tile cols
            for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
                struct VP9Filter *lflvl_ptr = s->lflvl;
                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;

                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                    set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
                                    tile_col, s->tiling.log2_tile_cols, s->sb_cols);

                    if (s->pass != 2) {
                        // reset left-edge contexts at each tile column start
                        memset(s->left_partition_ctx, 0, 8);
                        memset(s->left_skip_ctx, 0, 8);
                        if (s->keyframe || s->intraonly) {
                            memset(s->left_mode_ctx, DC_PRED, 16);
                        } else {
                            memset(s->left_mode_ctx, NEARESTMV, 8);
                        }
                        memset(s->left_y_nnz_ctx, 0, 16);
                        memset(s->left_uv_nnz_ctx, 0, 16);
                        memset(s->left_segpred_ctx, 0, 8);

                        // swap in this tile column's range coder state
                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
                    }

                    for (col = s->tiling.tile_col_start;
                         col < s->tiling.tile_col_end;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        // FIXME integrate with lf code (i.e. zero after each
                        // use, similar to invtxfm coefficients, or similar)
                        if (s->pass != 1) {
                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
                        }

                        if (s->pass == 2) {
                            // pass 2: reconstruct from data stored in pass 1
                            decode_sb_mem(ctx, row, col, lflvl_ptr,
                                          yoff2, uvoff2, BL_64X64);
                        } else {
                            decode_sb(ctx, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                        }
                    }
                    if (s->pass != 2) {
                        // save range coder state back for the next sb row
                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
                    }
                }

                // pass 1 only parses symbols; no pixels to filter or back up
                if (s->pass == 1) {
                    continue;
                }

                // backup pre-loopfilter reconstruction data for intra
                // prediction of next row of sb64s
                if (row + 8 < s->rows) {
                    memcpy(s->intra_pred_data[0],
                           f->data[0] + yoff + 63 * ls_y,
                           8 * s->cols);
                    memcpy(s->intra_pred_data[1],
                           f->data[1] + uvoff + 31 * ls_uv,
                           4 * s->cols);
                    memcpy(s->intra_pred_data[2],
                           f->data[2] + uvoff + 31 * ls_uv,
                           4 * s->cols);
                }

                // loopfilter one row
                if (s->filter.level) {
                    yoff2 = yoff;
                    uvoff2 = uvoff;
                    lflvl_ptr = s->lflvl;
                    for (col = 0; col < s->cols;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
                    }
                }

                // FIXME maybe we can make this more finegrained by running the
                // loopfilter per-block instead of after each sbrow
                // In fact that would also make intra pred left preparation easier?
                ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
            }
        }

        // backward adaptation: fold the decoded symbol counts into the
        // stored probability context (only when not in parallel mode)
        if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
            adapt_probs(s);
            ff_thread_finish_setup(ctx);
        }
    } while (s->pass++ == 1);
    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);

    // ref frame setup: promote next_refs into the active reference slots
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        // NOTE(review): return value ignored — on failure s->refs[i] stays
        // empty; later frames referencing it will error out. Consider
        // propagating the error here.
        ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
    }

    if (!s->invisible) {
        if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
            return res;
        *got_frame = 1;
    }

    return 0;
}
3973
/**
 * Flush callback: drop the internal current/last frames and release all
 * reference-frame buffers (e.g. on seek). The AVFrame containers
 * themselves stay allocated for reuse.
 */
static void vp9_decode_flush(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int n = 0;

    while (n < 2)
        vp9_unref_frame(ctx, &s->frames[n++]);
    for (n = 0; n < 8; n++)
        ff_thread_release_buffer(ctx, &s->refs[n]);
}
3984
/**
 * Allocate the AVFrame containers for the two internal frames and the
 * eight reference / pending-reference slot pairs.
 *
 * On any allocation failure all codec state is torn down via
 * vp9_decode_free() and AVERROR(ENOMEM) is returned; returns 0 on success.
 */
static int init_frames(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int n;

    for (n = 0; n < 2; n++) {
        s->frames[n].tf.f = av_frame_alloc();
        if (!s->frames[n].tf.f)
            goto fail;
    }

    for (n = 0; n < 8; n++) {
        s->refs[n].f      = av_frame_alloc();
        s->next_refs[n].f = av_frame_alloc();
        if (!s->refs[n].f || !s->next_refs[n].f)
            goto fail;
    }

    return 0;

fail:
    // free everything allocated so far, then report which slot failed
    vp9_decode_free(ctx);
    av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", n);
    return AVERROR(ENOMEM);
}
4010
vp9_decode_init(AVCodecContext * ctx)4011 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4012 {
4013 VP9Context *s = ctx->priv_data;
4014
4015 ctx->internal->allocate_progress = 1;
4016 ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4017 ff_vp9dsp_init(&s->dsp);
4018 ff_videodsp_init(&s->vdsp, 8);
4019 s->filter.sharpness = -1;
4020
4021 return init_frames(ctx);
4022 }
4023
vp9_decode_init_thread_copy(AVCodecContext * avctx)4024 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4025 {
4026 return init_frames(avctx);
4027 }
4028
/**
 * Frame-threading sync: copy decoder state from the thread that just
 * finished setup (src) into this thread's context (dst).
 *
 * Copies frame references, reference-slot references, probability
 * contexts, loop-filter deltas and (if enabled) segmentation features.
 * Returns 0 on success, negative AVERROR on reference failure.
 */
static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
{
    int i, res;
    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;

    // detect size changes in other threads: if the source resized (or
    // dropped) its buffers, ours are stale — free so they get reallocated
    if (s->intra_pred_data[0] &&
        (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
        free_buffers(s);
    }

    // take new references to the source's internal frames
    for (i = 0; i < 2; i++) {
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(dst, &s->frames[i]);
        if (ssrc->frames[i].tf.f->data[0]) {
            if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
                return res;
        }
    }
    // our refs become the source's next_refs (post-rotation state)
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(dst, &s->refs[i]);
        if (ssrc->next_refs[i].f->data[0]) {
            if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
                return res;
        }
    }

    s->invisible = ssrc->invisible;
    s->keyframe = ssrc->keyframe;
    s->uses_2pass = ssrc->uses_2pass;
    memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
    memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
    if (ssrc->segmentation.enabled) {
        memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
               sizeof(s->segmentation.feat));
    }

    return 0;
}
4069
/**
 * Public decoder registration: native VP9 decoder with direct rendering
 * (DR1) and frame-level multithreading support.
 */
AVCodec ff_vp9_decoder = {
    .name                  = "vp9",
    .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP9,
    .priv_data_size        = sizeof(VP9Context),
    .init                  = vp9_decode_init,
    .close                 = vp9_decode_free,
    .decode                = vp9_decode_frame,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
    .flush                 = vp9_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
};
4084