/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "avcodec.h"
#include "get_bits.h"
#include "internal.h"
#include "thread.h"
#include "videodsp.h"
#include "vp56.h"
#include "vp9.h"
#include "vp9data.h"
#include "vp9dsp.h"
#include "libavutil/avassert.h"

#define VP9_SYNCCODE 0x498342

enum CompPredMode {
    PRED_SINGLEREF,
    PRED_COMPREF,
    PRED_SWITCHABLE,
};

enum BlockLevel {
    BL_64X64,
    BL_32X32,
    BL_16X16,
    BL_8X8,
};

enum BlockSize {
    BS_64x64,
    BS_64x32,
    BS_32x64,
    BS_32x32,
    BS_32x16,
    BS_16x32,
    BS_16x16,
    BS_16x8,
    BS_8x16,
    BS_8x8,
    BS_8x4,
    BS_4x8,
    BS_4x4,
    N_BS_SIZES,
};

struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};

typedef struct VP9Frame {
    ThreadFrame tf;
    AVBufferRef *extradata;
    uint8_t *segmentation_map;
    struct VP9mvrefPair *mv;
} VP9Frame;

struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};

typedef struct VP9Block {
    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
    enum FilterMode filter;
    VP56mv mv[4 /* b_idx */][2 /* ref */];
    enum BlockSize bs;
    enum TxfmMode tx, uvtx;
    enum BlockLevel bl;
    enum BlockPartition bp;
} VP9Block;

typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;
    VP56RangeCoder c;
    VP56RangeCoder *c_b;
    unsigned c_b_size;
    VP9Block *b_base, *b;
    int pass, uses_2pass, last_uses_2pass;
    int row, row7, col, col7;
    uint8_t *dst[3];
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t profile;
    uint8_t keyframe, last_keyframe;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
    uint8_t colorspace;
    uint8_t fullrange;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;
    uint8_t fixcompref;
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;
    uint8_t refidx[3];
    uint8_t signbias[3];
    uint8_t varcompref[2];
    ThreadFrame refs[8], next_refs[8];
#define CUR_FRAME 0
#define LAST_FRAME 1
    VP9Frame frames[2];

    struct {
        uint8_t level;
        int8_t sharpness;
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    uint8_t lossless;
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            int16_t qmul[2][2];
            uint8_t lflvl[4][2];
        } feat[8];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    unsigned sb_cols, sb_rows, rows, cols;
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];

    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
} VP9Context;

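/* Block dimensions per BlockSize, in the same order as the enum above:
 * row [0] gives width/height in 4x4-block units, row [1] in 8x8-block
 * units (sub-8x8 sizes round up to 1). */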
static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};

static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
{
    VP9Context *s = ctx->priv_data;
    int ret, sz;

    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;
    sz = 64 * s->sb_cols * s->sb_rows;
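    /* sz counts 8x8 blocks at 64x64 superblock granularity (64 per sb);
     * the extradata buffer packs one segmentation-map byte per 8x8 block,
     * followed by one struct VP9mvrefPair per 8x8 block. */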
    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
        ff_thread_release_buffer(ctx, &f->tf);
        return AVERROR(ENOMEM);
    }

    f->segmentation_map = f->extradata->data;
    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);

    // retain segmentation map if it doesn't update
    if (s->segmentation.enabled && !s->segmentation.update_map &&
        !s->intraonly && !s->keyframe && !s->errorres) {
        memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
    }

    return 0;
}

static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
{
    ff_thread_release_buffer(ctx, &f->tf);
    av_buffer_unref(&f->extradata);
}

static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
{
    int res;

    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
        return res;
    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
        vp9_unref_frame(ctx, dst);
        return AVERROR(ENOMEM);
    }

    dst->segmentation_map = src->segmentation_map;
    dst->mv = src->mv;

    return 0;
}

static int update_size(AVCodecContext *ctx, int w, int h)
{
    VP9Context *s = ctx->priv_data;
    uint8_t *p;

    av_assert0(w > 0 && h > 0);

    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
        return 0;

    ctx->width  = w;
    ctx->height = h;
    s->sb_cols  = (w + 63) >> 6;
    s->sb_rows  = (h + 63) >> 6;
    s->cols     = (w + 7) >> 3;
    s->rows     = (h + 7) >> 3;
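    // sb_cols/sb_rows count 64x64 superblocks; cols/rows count 8x8 blocks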

#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
    av_freep(&s->intra_pred_data[0]);
    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
    if (!p)
        return AVERROR(ENOMEM);
    assign(s->intra_pred_data[0],  uint8_t *,             64);
    assign(s->intra_pred_data[1],  uint8_t *,             32);
    assign(s->intra_pred_data[2],  uint8_t *,             32);
    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
    assign(s->above_mode_ctx,      uint8_t *,             16);
    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
    assign(s->above_partition_ctx, uint8_t *,              8);
    assign(s->above_skip_ctx,      uint8_t *,              8);
    assign(s->above_txfm_ctx,      uint8_t *,              8);
    assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
    assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
    assign(s->above_segpred_ctx,   uint8_t *,              8);
    assign(s->above_intra_ctx,     uint8_t *,              8);
    assign(s->above_comp_ctx,      uint8_t *,              8);
    assign(s->above_ref_ctx,       uint8_t *,              8);
    assign(s->above_filter_ctx,    uint8_t *,              8);
    assign(s->lflvl,               struct VP9Filter *,     1);
#undef assign

    // these will be re-allocated a little later
    av_freep(&s->b_base);
    av_freep(&s->block_base);

    return 0;
}

static int update_block_buffers(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;

    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
        return 0;

    av_free(s->b_base);
    av_free(s->block_base);
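    /* Coefficient storage is (64 * 64 + 128) * 3 bytes per superblock:
     * 64x64 int16_t luma coefficients plus two 32x32 int16_t chroma planes
     * (12288 bytes), and 256 luma + 2x64 chroma EOB bytes (384 bytes).
     * 2-pass decoding keeps this for every superblock in the frame,
     * single-pass only for the superblock currently being decoded. */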
    if (s->uses_2pass) {
        int sbs = s->sb_cols * s->sb_rows;

        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
        s->uveob_base[0] = s->eob_base + 256 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
    } else {
        s->b_base = av_malloc(sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
        s->uveob_base[0] = s->eob_base + 256;
        s->uveob_base[1] = s->uveob_base[0] + 64;
    }
    s->block_alloc_using_2pass = s->uses_2pass;

    return 0;
}

// for some reason the sign bit is at the end, not the start, of a bit sequence
static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
{
    int v = get_bits(gb, n);
    return get_bits1(gb) ? -v : v;
}
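/* e.g. get_sbits_inv(gb, 4) on the bits 0101 followed by a sign bit of 1
 * returns -5; the same magnitude with a sign bit of 0 returns +5. */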

static av_always_inline int inv_recenter_nonneg(int v, int m)
{
    return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
}
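/* e.g. for m == 10, v == 0, 1, 2, 3, 4 maps to 10, 9, 11, 8, 12: values are
 * recentered around m, alternating below and above it, while any v > 2 * m
 * (outside the symmetric window) is returned unchanged. */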

// differential forward probability updates
static int update_prob(VP56RangeCoder *c, int p)
{
    static const int inv_map_table[254] = {
          7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
        189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
         10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
         55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
        161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
        177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
        207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
        222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
        237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
        252, 253,
    };
    int d;

    /* This code is trying to do a differential probability update. For a
     * current probability A in the range [1, 255], the difference to a new
     * probability of any value can be expressed differentially as 1-A,255-A
     * where some part of this (absolute range) exists both in positive as
     * well as the negative part, whereas another part only exists in one
     * half. We're trying to code this shared part differentially, i.e.
     * times two where the value of the lowest bit specifies the sign, and
     * the single part is then coded on top of this. This absolute difference
     * then again has a value of [0,254], but a bigger value in this range
     * indicates that we're further away from the original value A, so we
     * can code this as a VLC code, since higher values are increasingly
     * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
     * updates vs. the 'fine, exact' updates further down the range, which
     * adds one extra dimension to this differential update model. */
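    /* Concretely, the branches below spend a 1-3 bit prefix plus a 4-, 4-,
     * 5- or 7(+1)-bit suffix to code d in the ranges [0,15], [16,31],
     * [32,63] and [64,254] respectively, so small deltas are cheapest. */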

    if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 0;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 16;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 5) + 32;
    } else {
        d = vp8_rac_get_uint(c, 7);
        if (d >= 65)
            d = (d << 1) - 65 + vp8_rac_get(c);
        d += 64;
    }

    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
                    255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
}

static int decode_frame_header(AVCodecContext *ctx,
                               const uint8_t *data, int size, int *ref)
{
    VP9Context *s = ctx->priv_data;
    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
    int last_invisible;
    const uint8_t *data2;

    /* general header */
    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
        return res;
    }
    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
        return AVERROR_INVALIDDATA;
    }
    s->profile = get_bits1(&s->gb);
    if (get_bits1(&s->gb)) { // reserved bit
        av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
        return AVERROR_INVALIDDATA;
    }
    if (get_bits1(&s->gb)) {
        *ref = get_bits(&s->gb, 3);
        return 0;
    }
    s->last_uses_2pass = s->uses_2pass;
    s->last_keyframe  = s->keyframe;
    s->keyframe       = !get_bits1(&s->gb);
    last_invisible    = s->invisible;
    s->invisible      = !get_bits1(&s->gb);
    s->errorres       = get_bits1(&s->gb);
    s->use_last_frame_mvs = !s->errorres && !last_invisible;
    if (s->keyframe) {
        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
            return AVERROR_INVALIDDATA;
        }
        s->colorspace = get_bits(&s->gb, 3);
        if (s->colorspace == 7) { // RGB = profile 1
            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
            return AVERROR_INVALIDDATA;
        }
        s->fullrange  = get_bits1(&s->gb);
        // for profile 1, here follows the subsampling bits
        s->refreshrefmask = 0xff;
        w = get_bits(&s->gb, 16) + 1;
        h = get_bits(&s->gb, 16) + 1;
        if (get_bits1(&s->gb)) // display size
            skip_bits(&s->gb, 32);
    } else {
        s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
        s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
        if (s->intraonly) {
            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                return AVERROR_INVALIDDATA;
            }
            s->refreshrefmask = get_bits(&s->gb, 8);
            w = get_bits(&s->gb, 16) + 1;
            h = get_bits(&s->gb, 16) + 1;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
        } else {
            s->refreshrefmask = get_bits(&s->gb, 8);
            s->refidx[0]      = get_bits(&s->gb, 3);
            s->signbias[0]    = get_bits1(&s->gb);
            s->refidx[1]      = get_bits(&s->gb, 3);
            s->signbias[1]    = get_bits1(&s->gb);
            s->refidx[2]      = get_bits(&s->gb, 3);
            s->signbias[2]    = get_bits1(&s->gb);
            if (!s->refs[s->refidx[0]].f->data[0] ||
                !s->refs[s->refidx[1]].f->data[0] ||
                !s->refs[s->refidx[2]].f->data[0]) {
                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                return AVERROR_INVALIDDATA;
            }
            if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[0]].f->width;
                h = s->refs[s->refidx[0]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[1]].f->width;
                h = s->refs[s->refidx[1]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[2]].f->width;
                h = s->refs[s->refidx[2]].f->height;
            } else {
                w = get_bits(&s->gb, 16) + 1;
                h = get_bits(&s->gb, 16) + 1;
            }
            // Note that in this code, "CUR_FRAME" is actually before we
            // have formally allocated a frame, and thus actually represents
            // the _last_ frame
            s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
                                     s->frames[CUR_FRAME].tf.f->height == h;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            s->highprecisionmvs = get_bits1(&s->gb);
            s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
                                                get_bits(&s->gb, 2);
            s->allowcompinter = s->signbias[0] != s->signbias[1] ||
                                s->signbias[0] != s->signbias[2];
            if (s->allowcompinter) {
                if (s->signbias[0] == s->signbias[1]) {
                    s->fixcompref    = 2;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 1;
                } else if (s->signbias[0] == s->signbias[2]) {
                    s->fixcompref    = 1;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 2;
                } else {
                    s->fixcompref    = 0;
                    s->varcompref[0] = 1;
                    s->varcompref[1] = 2;
                }
            }
        }
    }
    s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
    s->framectxid   = c = get_bits(&s->gb, 2);

    /* loopfilter header data */
    s->filter.level = get_bits(&s->gb, 6);
    sharp = get_bits(&s->gb, 3);
    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
    // the old cache values since they are still valid
    if (s->filter.sharpness != sharp)
        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
    s->filter.sharpness = sharp;
    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
        if (get_bits1(&s->gb)) {
            for (i = 0; i < 4; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
            for (i = 0; i < 2; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
        }
    } else {
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    /* quantization header data */
    s->yac_qi      = get_bits(&s->gb, 8);
    s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
                     s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;

    /* segmentation header info */
    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
            for (i = 0; i < 7; i++)
                s->prob.seg[i] = get_bits1(&s->gb) ?
                                 get_bits(&s->gb, 8) : 255;
            if ((s->segmentation.temporal = get_bits1(&s->gb))) {
                for (i = 0; i < 3; i++)
                    s->prob.segpred[i] = get_bits1(&s->gb) ?
                                         get_bits(&s->gb, 8) : 255;
            }
        }
        if ((!s->segmentation.update_map || s->segmentation.temporal) &&
            (w != s->frames[CUR_FRAME].tf.f->width ||
             h != s->frames[CUR_FRAME].tf.f->height)) {
            av_log(ctx, AV_LOG_ERROR,
                   "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
                   s->segmentation.temporal, s->segmentation.update_map);
            return AVERROR_INVALIDDATA;
        }

        if (get_bits1(&s->gb)) {
            s->segmentation.absolute_vals = get_bits1(&s->gb);
            for (i = 0; i < 8; i++) {
                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
            }
        }
    } else {
        s->segmentation.feat[0].q_enabled    = 0;
        s->segmentation.feat[0].lf_enabled   = 0;
        s->segmentation.feat[0].skip_enabled = 0;
        s->segmentation.feat[0].ref_enabled  = 0;
    }

    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
        int qyac, qydc, quvac, quvdc, lflvl, sh;

        if (s->segmentation.feat[i].q_enabled) {
            if (s->segmentation.absolute_vals)
                qyac = s->segmentation.feat[i].q_val;
            else
                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
        } else {
            qyac  = s->yac_qi;
        }
        qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
        qyac  = av_clip_uintp2(qyac, 8);

        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];

        sh = s->filter.level >= 32;
        if (s->segmentation.feat[i].lf_enabled) {
            if (s->segmentation.absolute_vals)
                lflvl = s->segmentation.feat[i].lf_val;
            else
                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
        } else {
            lflvl  = s->filter.level;
        }
        s->segmentation.feat[i].lflvl[0][0] =
        s->segmentation.feat[i].lflvl[0][1] =
            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
        for (j = 1; j < 4; j++) {
            s->segmentation.feat[i].lflvl[j][0] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[0]) << sh), 6);
            s->segmentation.feat[i].lflvl[j][1] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[1]) << sh), 6);
        }
    }

    /* tiling info */
    if ((res = update_size(ctx, w, h)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
        return res;
    }
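    /* The first loop below forces enough tile columns that no tile is wider
     * than 64 superblocks (4096 pixels); 'max' then caps the column count
     * so tiles stay at least 4 superblocks wide where the frame allows. */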
    for (s->tiling.log2_tile_cols = 0;
         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
         s->tiling.log2_tile_cols++) ;
    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
    max = FFMAX(0, max - 1);
    while (max > s->tiling.log2_tile_cols) {
        if (get_bits1(&s->gb))
            s->tiling.log2_tile_cols++;
        else
            break;
    }
    s->tiling.log2_tile_rows = decode012(&s->gb);
    s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
        s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
                                 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
        if (!s->c_b) {
            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
            return AVERROR(ENOMEM);
        }
    }

    if (s->keyframe || s->errorres || s->intraonly) {
        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
                           s->prob_ctx[3].p = vp9_default_probs;
        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
    }

    // next 16 bits is size of the rest of the header (arith-coded)
    size2 = get_bits(&s->gb, 16);
    data2 = align_get_bits(&s->gb);
    if (size2 > size - (data2 - data)) {
        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
        return AVERROR_INVALIDDATA;
    }
    ff_vp56_init_range_decoder(&s->c, data2, size2);
    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe || s->intraonly) {
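        /* this single memset clears both coef and eob counts; it relies on
         * counts.eob directly following counts.coef in VP9Context */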
        memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
    } else {
        memset(&s->counts, 0, sizeof(s->counts));
    }
    // FIXME is it faster to not copy here, but do it down in the fw updates
    // as explicit copies if the fw update is missing (and skip the copy upon
    // fw update)?
    s->prob.p = s->prob_ctx[c].p;

    // txfm updates
    if (s->lossless) {
        s->txfmmode = TX_4X4;
    } else {
        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
        if (s->txfmmode == 3)
            s->txfmmode += vp8_rac_get(&s->c);

        if (s->txfmmode == TX_SWITCHABLE) {
            for (i = 0; i < 2; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx16p[i][j] =
                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 3; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx32p[i][j] =
                            update_prob(&s->c, s->prob.p.tx32p[i][j]);
        }
    }

    // coef updates
    for (i = 0; i < 4; i++) {
        uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
        if (vp8_rac_get(&s->c)) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m >= 3 && l == 0) // dc only has 3 pt
                                break;
                            for (n = 0; n < 3; n++) {
                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                    p[n] = update_prob(&s->c, r[n]);
                                } else {
                                    p[n] = r[n];
                                }
                            }
                            p[3] = 0;
                        }
        } else {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m > 3 && l == 0) // dc only has 3 pt
                                break;
                            memcpy(p, r, 3);
                            p[3] = 0;
                        }
        }
        if (s->txfmmode == i)
            break;
    }

    // mode updates
    for (i = 0; i < 3; i++)
        if (vp56_rac_get_prob_branchy(&s->c, 252))
            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    if (!s->keyframe && !s->intraonly) {
        for (i = 0; i < 7; i++)
            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_mode[i][j] =
                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);

        if (s->filtermode == FILTER_SWITCHABLE)
            for (i = 0; i < 4; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.filter[i][j] =
                            update_prob(&s->c, s->prob.p.filter[i][j]);

        for (i = 0; i < 4; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);

        if (s->allowcompinter) {
            s->comppredmode = vp8_rac_get(&s->c);
            if (s->comppredmode)
                s->comppredmode += vp8_rac_get(&s->c);
            if (s->comppredmode == PRED_SWITCHABLE)
                for (i = 0; i < 5; i++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.comp[i] =
                            update_prob(&s->c, s->prob.p.comp[i]);
        } else {
            s->comppredmode = PRED_SINGLEREF;
        }

        if (s->comppredmode != PRED_COMPREF) {
            for (i = 0; i < 5; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][0] =
                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][1] =
                        update_prob(&s->c, s->prob.p.single_ref[i][1]);
            }
        }

        if (s->comppredmode != PRED_SINGLEREF) {
            for (i = 0; i < 5; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.comp_ref[i] =
                        update_prob(&s->c, s->prob.p.comp_ref[i]);
        }

        for (i = 0; i < 4; i++)
            for (j = 0; j < 9; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.y_mode[i][j] =
                        update_prob(&s->c, s->prob.p.y_mode[i][j]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.partition[3 - i][j][k] =
                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);

        // mv fields don't use the update_prob subexp model for some reason
        for (i = 0; i < 3; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].classes[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].bits[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.mv_comp[i].class0_fp[j][k] =
                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].fp[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        if (s->highprecisionmvs) {
            for (i = 0; i < 2; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].class0_hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
            }
        }
    }

    return (data2 - data) + size2;
}

static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
                                      VP9Context *s)
{
    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
}

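/* Select a motion vector candidate for reference frame 'ref' into *pmv:
 * idx == 0 picks the first (nearest) match, idx == 1 the second distinct
 * one. 'z' is the mv slot (0 or 1) read from earlier sub-blocks, and 'sb'
 * is the sub-block index within the current block, negative (as for NEWMV)
 * to skip the intra-block candidates. */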
static void find_ref_mvs(VP9Context *s,
                         VP56mv *pmv, int ref, int z, int idx, int sb)
{
    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
        [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
                      { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
        [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
        [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
                      { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
        [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
                      { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
                      {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
        [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
                      {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
        [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
                      { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
        [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
    };
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
#define INVALID_MV 0x80008000U
    uint32_t mem = INVALID_MV;
    int i;

#define RETURN_DIRECT_MV(mv) \
    do { \
        uint32_t m = AV_RN32A(&mv); \
        if (!idx) { \
            AV_WN32A(pmv, m); \
            return; \
        } else if (mem == INVALID_MV) { \
            mem = m; \
        } else if (m != mem) { \
            AV_WN32A(pmv, m); \
            return; \
        } \
    } while (0)

    if (sb >= 0) {
        if (sb == 2 || sb == 1) {
            RETURN_DIRECT_MV(b->mv[0][z]);
        } else if (sb == 3) {
            RETURN_DIRECT_MV(b->mv[2][z]);
            RETURN_DIRECT_MV(b->mv[1][z]);
            RETURN_DIRECT_MV(b->mv[0][z]);
        }

#define RETURN_MV(mv) \
    do { \
        if (sb > 0) { \
            VP56mv tmp; \
            uint32_t m; \
            clamp_mv(&tmp, &mv, s); \
            m = AV_RN32A(&tmp); \
            if (!idx) { \
                AV_WN32A(pmv, m); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                AV_WN32A(pmv, m); \
                return; \
            } \
        } else { \
            uint32_t m = AV_RN32A(&mv); \
            if (!idx) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } \
        } \
    } while (0)

        if (row > 0) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
            }
        }
        if (col > s->tiling.tile_col_start) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
            }
        }
        i = 2;
    } else {
        i = 0;
    }

    // previously coded MVs in this neighbourhood, using same reference frame
    for (; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] == ref) {
                RETURN_MV(mv->mv[0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(mv->mv[1]);
            }
        }
    }

    // MV at this position in previous frame, using same reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        if (!s->last_uses_2pass)
            ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
        if (mv->ref[0] == ref) {
            RETURN_MV(mv->mv[0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(mv->mv[1]);
        }
    }

#define RETURN_SCALE_MV(mv, scale) \
    do { \
        if (scale) { \
            VP56mv mv_temp = { -mv.x, -mv.y }; \
            RETURN_MV(mv_temp); \
        } else { \
            RETURN_MV(mv); \
        } \
    } while (0)

    // previously coded MVs in this neighbourhood, using different reference frame
    for (i = 0; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
                RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
            }
            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                // BUG - libvpx has this condition regardless of whether
                // we used the first ref MV and pre-scaling
                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
                RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
            }
        }
    }

    // MV at this position in previous frame, using different reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        // no need to await_progress, because we already did that above
        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
            RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
        }
        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
            // BUG - libvpx has this condition regardless of whether
            // we used the first ref MV and pre-scaling
            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
            RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
        }
    }

    AV_ZERO32(pmv);
#undef INVALID_MV
#undef RETURN_MV
#undef RETURN_SCALE_MV
}

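/* Decode one motion vector component in 1/8-pel units: the returned value
 * is sign * (n + 1), where n packs the integer magnitude (class plus bits),
 * a 2-bit fractional part and a high-precision bit (forced to 1 when high
 * precision is disabled). */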
static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
{
    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
    int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
                                s->prob.p.mv_comp[idx].classes);

    s->counts.mv_comp[idx].sign[sign]++;
    s->counts.mv_comp[idx].classes[c]++;
    if (c) {
        int m;

        for (n = 0, m = 0; m < c; m++) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
            n |= bit << m;
            s->counts.mv_comp[idx].bits[m][bit]++;
        }
        n <<= 3;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
        n |= bit << 1;
        s->counts.mv_comp[idx].fp[bit]++;
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
            s->counts.mv_comp[idx].hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].hp[1]++;
        }
        n += 8 << c;
    } else {
        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
        s->counts.mv_comp[idx].class0[n]++;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
                               s->prob.p.mv_comp[idx].class0_fp[n]);
        s->counts.mv_comp[idx].class0_fp[n][bit]++;
        n = (n << 3) | (bit << 1);
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
            s->counts.mv_comp[idx].class0_hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].class0_hp[1]++;
        }
    }

    return sign ? -(n + 1) : (n + 1);
}

static void fill_mv(VP9Context *s,
                    VP56mv *mv, int mode, int sb)
{
    VP9Block *b = s->b;

    if (mode == ZEROMV) {
        AV_ZERO64(mv);
    } else {
        int hp = 0;

        // FIXME cache this value and reuse for other subblocks
        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
                     mode == NEWMV ? -1 : sb);
        // FIXME maybe move this code into find_ref_mvs()
        if ((mode == NEWMV || sb == -1) &&
            !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
            if (mv[0].y & 1) {
                if (mv[0].y < 0)
                    mv[0].y++;
                else
                    mv[0].y--;
            }
            if (mv[0].x & 1) {
                if (mv[0].x < 0)
                    mv[0].x++;
                else
                    mv[0].x--;
            }
        }
        if (mode == NEWMV) {
            enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                              s->prob.p.mv_joint);

            s->counts.mv_joint[j]++;
            if (j >= MV_JOINT_V)
                mv[0].y += read_mv_component(s, 0, hp);
            if (j & 1)
                mv[0].x += read_mv_component(s, 1, hp);
        }

        if (b->comp) {
            // FIXME cache this value and reuse for other subblocks
            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
                         mode == NEWMV ? -1 : sb);
            if ((mode == NEWMV || sb == -1) &&
                !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
                if (mv[1].y & 1) {
                    if (mv[1].y < 0)
                        mv[1].y++;
                    else
                        mv[1].y--;
                }
                if (mv[1].x & 1) {
                    if (mv[1].x < 0)
                        mv[1].x++;
                    else
                        mv[1].x--;
                }
            }
            if (mode == NEWMV) {
                enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                                  s->prob.p.mv_joint);

                s->counts.mv_joint[j]++;
                if (j >= MV_JOINT_V)
                    mv[1].y += read_mv_component(s, 0, hp);
                if (j & 1)
                    mv[1].x += read_mv_component(s, 1, hp);
            }
        }
    }
}

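/* Fill a w x h rectangle of context bytes with the value v; w must be 1, 2,
 * 4 or 8, and ptr must be suitably aligned for the AV_WN*A stores used. */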
setctx_2d(uint8_t * ptr,int w,int h,ptrdiff_t stride,int v)1273 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1274                                        ptrdiff_t stride, int v)
1275 {
1276     switch (w) {
1277     case 1:
1278         do {
1279             *ptr = v;
1280             ptr += stride;
1281         } while (--h);
1282         break;
1283     case 2: {
1284         int v16 = v * 0x0101;
1285         do {
1286             AV_WN16A(ptr, v16);
1287             ptr += stride;
1288         } while (--h);
1289         break;
1290     }
1291     case 4: {
1292         uint32_t v32 = v * 0x01010101;
1293         do {
1294             AV_WN32A(ptr, v32);
1295             ptr += stride;
1296         } while (--h);
1297         break;
1298     }
1299     case 8: {
1300 #if HAVE_FAST_64BIT
1301         uint64_t v64 = v * 0x0101010101010101ULL;
1302         do {
1303             AV_WN64A(ptr, v64);
1304             ptr += stride;
1305         } while (--h);
1306 #else
1307         uint32_t v32 = v * 0x01010101;
1308         do {
1309             AV_WN32A(ptr,     v32);
1310             AV_WN32A(ptr + 4, v32);
1311             ptr += stride;
1312         } while (--h);
1313 #endif
1314         break;
1315     }
1316     }
1317 }
1318 
static void decode_mode(AVCodecContext *ctx)
{
    static const uint8_t left_ctx[N_BS_SIZES] = {
        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
    };
    static const uint8_t above_ctx[N_BS_SIZES] = {
        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
    };
    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
    };
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
    int vref = 0;
    int filter_id = 0;

    if (!s->segmentation.enabled) {
        b->seg_id = 0;
    } else if (s->keyframe || s->intraonly) {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
    } else if (!s->segmentation.update_map ||
               (s->segmentation.temporal &&
                vp56_rac_get_prob_branchy(&s->c,
                    s->prob.segpred[s->above_segpred_ctx[col] +
                                    s->left_segpred_ctx[row7]]))) {
        if (!s->errorres) {
            int pred = 8, x;
            uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;

            if (!s->last_uses_2pass)
                ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
            for (y = 0; y < h4; y++)
                for (x = 0; x < w4; x++)
                    pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
            av_assert1(pred < 8);
            b->seg_id = pred;
        } else {
            b->seg_id = 0;
        }

        memset(&s->above_segpred_ctx[col], 1, w4);
        memset(&s->left_segpred_ctx[row7], 1, h4);
    } else {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
                                     s->prob.seg);

        memset(&s->above_segpred_ctx[col], 0, w4);
        memset(&s->left_segpred_ctx[row7], 0, h4);
    }
    if (s->segmentation.enabled &&
        (s->segmentation.update_map || s->keyframe || s->intraonly)) {
        setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
                  w4, h4, 8 * s->sb_cols, b->seg_id);
    }

    b->skip = s->segmentation.enabled &&
        s->segmentation.feat[b->seg_id].skip_enabled;
    if (!b->skip) {
        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
        s->counts.skip[c][b->skip]++;
    }

    if (s->keyframe || s->intraonly) {
        b->intra = 1;
    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
    } else {
        int c, bit;

        if (have_a && have_l) {
            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
            c += (c == 2);
        } else {
            c = have_a ? 2 * s->above_intra_ctx[col] :
                have_l ? 2 * s->left_intra_ctx[row7] : 0;
        }
        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
        s->counts.intra[c][bit]++;
        b->intra = !bit;
    }

    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
        int c;
        if (have_a) {
            if (have_l) {
                c = (s->above_skip_ctx[col] ? max_tx :
                     s->above_txfm_ctx[col]) +
                    (s->left_skip_ctx[row7] ? max_tx :
                     s->left_txfm_ctx[row7]) > max_tx;
            } else {
                c = s->above_skip_ctx[col] ? 1 :
                    (s->above_txfm_ctx[col] * 2 > max_tx);
            }
        } else if (have_l) {
            c = s->left_skip_ctx[row7] ? 1 :
                (s->left_txfm_ctx[row7] * 2 > max_tx);
        } else {
            c = 1;
        }
        switch (max_tx) {
        case TX_32X32:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
            if (b->tx) {
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
                if (b->tx == 2)
                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
            }
            s->counts.tx32p[c][b->tx]++;
            break;
        case TX_16X16:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
            if (b->tx)
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
            s->counts.tx16p[c][b->tx]++;
            break;
        case TX_8X8:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
            s->counts.tx8p[c][b->tx]++;
            break;
        case TX_4X4:
            b->tx = TX_4X4;
            break;
        }
    } else {
        b->tx = FFMIN(max_tx, s->txfmmode);
    }

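    /* intra prediction modes; on keyframes and intra-only frames these use
     * the fixed keyframe mode probabilities, conditioned on the modes of
     * the above and left neighbours */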
    if (s->keyframe || s->intraonly) {
        uint8_t *a = &s->above_mode_ctx[col * 2];
        uint8_t *l = &s->left_mode_ctx[(row7) << 1];

        b->comp = 0;
        if (b->bs > BS_8x8) {
            // FIXME the memory storage intermediates here aren't really
            // necessary, they're just there to make the code slightly
            // simpler for now
            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                    vp9_default_kf_ymode_probs[a[0]][l[0]]);
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                l[0] = a[1] = b->mode[1];
            } else {
                l[0] = a[1] = b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                        vp9_default_kf_ymode_probs[a[0]][l[1]]);
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                    l[1] = a[1] = b->mode[3];
                } else {
                    l[1] = a[1] = b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                l[1] = a[1] = b->mode[3] = b->mode[1];
            }
        } else {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          vp9_default_kf_ymode_probs[*a][*l]);
            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
            // FIXME this can probably be optimized
            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_uvmode_probs[b->mode[3]]);
    } else if (b->intra) {
        b->comp = 0;
        if (b->bs > BS_8x8) {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[0]);
            s->counts.y_mode[0][b->mode[0]]++;
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[1]]++;
            } else {
                b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[2]]++;
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                  s->prob.p.y_mode[0]);
                    s->counts.y_mode[0][b->mode[3]]++;
                } else {
                    b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                b->mode[3] = b->mode[1];
            }
        } else {
            static const uint8_t size_group[10] = {
                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
            };
            int sz = size_group[b->bs];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[sz]);
            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
            s->counts.y_mode[sz][b->mode[3]]++;
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     s->prob.p.uv_mode[b->mode[3]]);
        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
    } else {
        static const uint8_t inter_mode_ctx_lut[14][14] = {
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
        };

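        /* the above/left mode ctx entries hold intra modes (0-9) or inter
         * modes (10-13, hence the -10 bias when indexing counts.mv_mode
         * below); the LUT collapses each above/left pair into one of the
         * 7 inter mode contexts */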
        if (s->segmentation.feat[b->seg_id].ref_enabled) {
            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
            b->comp = 0;
            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
        } else {
            // read comp_pred flag
            if (s->comppredmode != PRED_SWITCHABLE) {
                b->comp = s->comppredmode == PRED_COMPREF;
            } else {
                int c;

                // FIXME add intra as ref=0xff (or -1) to make these easier?
                if (have_a) {
                    if (have_l) {
                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
                            c = 4;
                        } else if (s->above_comp_ctx[col]) {
                            c = 2 + (s->left_intra_ctx[row7] ||
                                     s->left_ref_ctx[row7] == s->fixcompref);
                        } else if (s->left_comp_ctx[row7]) {
                            c = 2 + (s->above_intra_ctx[col] ||
                                     s->above_ref_ctx[col] == s->fixcompref);
                        } else {
                            c = (!s->above_intra_ctx[col] &&
                                 s->above_ref_ctx[col] == s->fixcompref) ^
                                (!s->left_intra_ctx[row7] &&
                                 s->left_ref_ctx[row7] == s->fixcompref);
                        }
                    } else {
                        c = s->above_comp_ctx[col] ? 3 :
                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
                    }
                } else if (have_l) {
                    c = s->left_comp_ctx[row7] ? 3 :
                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
                } else {
                    c = 1;
                }
                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
                s->counts.comp[c][b->comp]++;
            }

            // read actual references
            // FIXME probably cache a few variables here to prevent repetitive
            // memory accesses below
            if (b->comp) /* two references */ {
                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;

                b->ref[fix_idx] = s->fixcompref;
                // FIXME can this codeblob be replaced by some sort of LUT?
                if (have_a) {
                    if (have_l) {
                        if (s->above_intra_ctx[col]) {
                            if (s->left_intra_ctx[row7]) {
                                c = 2;
                            } else {
                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                            }
                        } else if (s->left_intra_ctx[row7]) {
                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];

                            if (refl == refa && refa == s->varcompref[1]) {
                                c = 0;
                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
                                    c = 4;
                                } else {
                                    c = (refa == refl) ? 3 : 1;
                                }
                            } else if (!s->left_comp_ctx[row7]) {
                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refl == s->varcompref[1] &&
                                         refa != s->varcompref[1]) ? 2 : 4;
                                }
                            } else if (!s->above_comp_ctx[col]) {
                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refa == s->varcompref[1] &&
                                         refl != s->varcompref[1]) ? 2 : 4;
                                }
                            } else {
                                c = (refl == refa) ? 4 : 2;
                            }
                        }
                    } else {
                        if (s->above_intra_ctx[col]) {
                            c = 2;
                        } else if (s->above_comp_ctx[col]) {
                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        }
                    }
                } else if (have_l) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    } else {
                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
                b->ref[var_idx] = s->varcompref[bit];
                s->counts.comp_ref[c][bit]++;
            } else /* single reference */ {
                int bit, c;

                if (have_a && !s->above_intra_ctx[col]) {
                    if (have_l && !s->left_intra_ctx[row7]) {
                        if (s->left_comp_ctx[row7]) {
                            if (s->above_comp_ctx[col]) {
                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
                                         !s->above_ref_ctx[col]);
                            } else {
                                c = (3 * !s->above_ref_ctx[col]) +
                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
                            }
                        } else if (s->above_comp_ctx[col]) {
                            c = (3 * !s->left_ref_ctx[row7]) +
                                (!s->fixcompref || !s->above_ref_ctx[col]);
                        } else {
                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                        }
                    } else if (s->above_intra_ctx[col]) {
                        c = 2;
                    } else if (s->above_comp_ctx[col]) {
                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
                    } else {
                        c = 4 * (!s->above_ref_ctx[col]);
                    }
                } else if (have_l && !s->left_intra_ctx[row7]) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
                    } else {
                        c = 4 * (!s->left_ref_ctx[row7]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
                s->counts.single_ref[c][0][bit]++;
                if (!bit) {
                    b->ref[0] = 0;
                } else {
                    // FIXME can this codeblob be replaced by some sort of LUT?
                    if (have_a) {
                        if (have_l) {
                            if (s->left_intra_ctx[row7]) {
                                if (s->above_intra_ctx[col]) {
                                    c = 2;
                                } else if (s->above_comp_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else if (!s->above_ref_ctx[col]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->above_intra_ctx[col]) {
                                if (s->left_intra_ctx[row7]) {
                                    c = 2;
                                } else if (s->left_comp_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (s->above_comp_ctx[col]) {
                                if (s->left_comp_ctx[row7]) {
                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
                                        c = 3 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                    } else {
                                        c = 2;
                                    }
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else {
                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->left_comp_ctx[row7]) {
                                if (!s->above_ref_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else {
                                    c = 3 * (s->above_ref_ctx[col] == 1) +
                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->above_ref_ctx[col]) {
                                if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->left_ref_ctx[row7]) {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            } else {
                                c = 2 * (s->left_ref_ctx[row7] == 1) +
                                    2 * (s->above_ref_ctx[col] == 1);
                            }
                        } else {
                            if (s->above_intra_ctx[col] ||
                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                                c = 2;
                            } else if (s->above_comp_ctx[col]) {
                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                            } else {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            }
                        }
                    } else if (have_l) {
                        if (s->left_intra_ctx[row7] ||
                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
                            c = 2;
                        } else if (s->left_comp_ctx[row7]) {
                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                        } else {
                            c = 4 * (s->left_ref_ctx[row7] == 1);
                        }
                    } else {
                        c = 2;
                    }
                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
                    s->counts.single_ref[c][1][bit]++;
                    b->ref[0] = 1 + bit;
                }
            }
        }

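        /* inter prediction modes; blocks of 8x8 and larger read a single
         * mode here (their mvs are filled once the filter is known),
         * sub-8x8 blocks read one mode and mv set per sub-partition below */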
        if (b->bs <= BS_8x8) {
            if (s->segmentation.feat[b->seg_id].skip_enabled) {
                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
            } else {
                static const uint8_t off[10] = {
                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
                };

                // FIXME this needs to use the LUT tables from find_ref_mvs
                // because not all are -1,0/0,-1
                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
                                          [s->left_mode_ctx[row7 + off[b->bs]]];

                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
                s->counts.mv_mode[c][b->mode[0] - 10]++;
            }
        }

        if (s->filtermode == FILTER_SWITCHABLE) {
            int c;

            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
                        s->left_filter_ctx[row7] : 3;
                } else {
                    c = s->above_filter_ctx[col];
                }
            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                c = s->left_filter_ctx[row7];
            } else {
                c = 3;
            }

            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
                                         s->prob.p.filter[c]);
            s->counts.filter[c][filter_id]++;
            b->filter = vp9_filter_lut[filter_id];
        } else {
            b->filter = s->filtermode;
        }

        if (b->bs > BS_8x8) {
            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                          s->prob.p.mv_mode[c]);
            s->counts.mv_mode[c][b->mode[0] - 10]++;
            fill_mv(s, b->mv[0], b->mode[0], 0);

            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[1] - 10]++;
                fill_mv(s, b->mv[1], b->mode[1], 1);
            } else {
                b->mode[1] = b->mode[0];
                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            }

            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[2] - 10]++;
                fill_mv(s, b->mv[2], b->mode[2], 2);

                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                                  s->prob.p.mv_mode[c]);
                    s->counts.mv_mode[c][b->mode[3] - 10]++;
                    fill_mv(s, b->mv[3], b->mode[3], 3);
                } else {
                    b->mode[3] = b->mode[2];
                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
                }
            } else {
                b->mode[2] = b->mode[0];
                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
                b->mode[3] = b->mode[1];
                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
            }
        } else {
            fill_mv(s, b->mv[0], b->mode[0], -1);
            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
        }

        vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
    }

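/* SPLAT_CTX replicates a single context byte across n entries of the
 * left/above context arrays; like setctx_2d, the stores are widened to
 * one or two aligned machine words */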
#if HAVE_FAST_64BIT
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                                    break; \
    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
    case 16: { \
        uint64_t v64 = val * 0x0101010101010101ULL; \
        AV_WN64A(              &var,     v64); \
        AV_WN64A(&((uint8_t *) &var)[8], v64); \
        break; \
    } \
    }
#else
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                         break; \
    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
    case 8: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,     v32); \
        AV_WN32A(&((uint8_t *) &var)[4], v32); \
        break; \
    } \
    case 16: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,      v32); \
        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
        AV_WN32A(&((uint8_t *) &var)[12], v32); \
        break; \
    } \
    }
#endif

    switch (bwh_tab[1][b->bs][0]) {
#define SET_CTXS(dir, off, n) \
    do { \
        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
        if (!s->keyframe && !s->intraonly) { \
            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
            if (!b->intra) { \
                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
                if (s->filtermode == FILTER_SWITCHABLE) { \
                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
                } \
            } \
        } \
    } while (0)
    case 1: SET_CTXS(above, col, 1); break;
    case 2: SET_CTXS(above, col, 2); break;
    case 4: SET_CTXS(above, col, 4); break;
    case 8: SET_CTXS(above, col, 8); break;
    }
    switch (bwh_tab[1][b->bs][1]) {
    case 1: SET_CTXS(left, row7, 1); break;
    case 2: SET_CTXS(left, row7, 2); break;
    case 4: SET_CTXS(left, row7, 4); break;
    case 8: SET_CTXS(left, row7, 8); break;
    }
#undef SPLAT_CTX
#undef SET_CTXS

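    /* store the edge mvs of this block in the left/above mv context
     * (rightmost column for left ctx, bottom row for above ctx) so that
     * find_ref_mvs for neighbouring blocks can pick them up */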
    if (!s->keyframe && !s->intraonly) {
        if (b->bs > BS_8x8) {
            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
        } else {
            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            for (n = 0; n < w4 * 2; n++) {
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
            }
            for (n = 0; n < h4 * 2; n++) {
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
            }
        }
    }

    // FIXME kinda ugly
    for (y = 0; y < h4; y++) {
        int x, o = (row + y) * s->sb_cols * 8 + col;
        struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];

        if (b->intra) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] =
                mv[x].ref[1] = -1;
            }
        } else if (b->comp) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = b->ref[1];
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
            }
        } else {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = -1;
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
            }
        }
    }
}

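/* decode one transform block's coefficients: for each position in scan
 * order, read an EOB flag, a zero flag, and then the token value (one,
 * 2-4, or one of the cat1-cat6 extra-bit tokens with fixed probabilities);
 * cache[] keeps clamped token magnitudes so the nnz context of later
 * positions can be derived from their two neighbours in nb[] */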
// FIXME merge cnt/eob arguments?
static av_always_inline int
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                        int is_tx32x32, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, const int16_t *qmul)
{
    int i = 0, band = 0, band_left = band_counts[band];
    uint8_t *tp = p[0][nnz];
    uint8_t cache[1024];

    do {
        int val, rc;

        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
        eob[band][nnz][val]++;
        if (!val)
            break;

    skip_eob:
        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
            cnt[band][nnz][0]++;
            if (!--band_left)
                band_left = band_counts[++band];
            cache[scan[i]] = 0;
            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
            tp = p[band][nnz];
            if (++i == n_coeffs)
                break; // invalid input; blocks should end with EOB
            goto skip_eob;
        }

        rc = scan[i];
        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
            cnt[band][nnz][1]++;
            val = 1;
            cache[rc] = 1;
        } else {
            // fill in p[3-10] (model fill) - only once per frame for each pos
            if (!tp[3])
                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);

            cnt[band][nnz][2]++;
            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
                    cache[rc] = val = 2;
                } else {
                    val = 3 + vp56_rac_get_prob(c, tp[5]);
                    cache[rc] = 3;
                }
            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                cache[rc] = 4;
                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                    val = 5 + vp56_rac_get_prob(c, 159);
                } else {
                    val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
                    val +=      vp56_rac_get_prob(c, 145);
                }
            } else { // cat 3-6
                cache[rc] = 5;
                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
                        val +=      (vp56_rac_get_prob(c, 148) << 1);
                        val +=       vp56_rac_get_prob(c, 140);
                    } else {
                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
                        val +=      (vp56_rac_get_prob(c, 155) << 2);
                        val +=      (vp56_rac_get_prob(c, 140) << 1);
                        val +=       vp56_rac_get_prob(c, 135);
                    }
                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                    val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
                    val +=      (vp56_rac_get_prob(c, 157) << 3);
                    val +=      (vp56_rac_get_prob(c, 141) << 2);
                    val +=      (vp56_rac_get_prob(c, 134) << 1);
                    val +=       vp56_rac_get_prob(c, 130);
                } else {
                    val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
                    val +=      (vp56_rac_get_prob(c, 254) << 12);
                    val +=      (vp56_rac_get_prob(c, 254) << 11);
                    val +=      (vp56_rac_get_prob(c, 252) << 10);
                    val +=      (vp56_rac_get_prob(c, 249) << 9);
                    val +=      (vp56_rac_get_prob(c, 243) << 8);
                    val +=      (vp56_rac_get_prob(c, 230) << 7);
                    val +=      (vp56_rac_get_prob(c, 196) << 6);
                    val +=      (vp56_rac_get_prob(c, 177) << 5);
                    val +=      (vp56_rac_get_prob(c, 153) << 4);
                    val +=      (vp56_rac_get_prob(c, 140) << 3);
                    val +=      (vp56_rac_get_prob(c, 133) << 2);
                    val +=      (vp56_rac_get_prob(c, 130) << 1);
                    val +=       vp56_rac_get_prob(c, 129);
                }
            }
        }
        if (!--band_left)
            band_left = band_counts[++band];
        if (is_tx32x32)
            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
        else
            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
        tp = p[band][nnz];
    } while (++i < n_coeffs);

    return i;
}

static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                           unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                           uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                           const int16_t (*nb)[2], const int16_t *band_counts,
                           const int16_t *qmul)
{
    return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                             unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                             uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                             const int16_t (*nb)[2], const int16_t *band_counts,
                             const int16_t *qmul)
{
    return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

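/* decode the y and uv coefficients of one block: one decode_coeffs_b()
 * call per transform block, with the scan and neighbour tables picked by
 * transform size and type, and the per-4x4 nnz contexts in the left/above
 * arrays merged before and splatted after each transform unit */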
static void decode_coeffs(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int n, pl, x, y, res;
    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
    int tx = 4 * s->lossless + b->tx;
    const int16_t * const *yscans = vp9_scans[tx];
    const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
    const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
    const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
    static const int16_t band_counts[4][8] = {
        { 1, 2, 3, 4,  3,   16 - 13 },
        { 1, 2, 3, 4, 11,   64 - 21 },
        { 1, 2, 3, 4, 11,  256 - 21 },
        { 1, 2, 3, 4, 11, 1024 - 21 },
    };
    const int16_t *y_band_counts = band_counts[b->tx];
    const int16_t *uv_band_counts = band_counts[b->uvtx];

#define MERGE(la, end, step, rd) \
    for (n = 0; n < end; n += step) \
        la[n] = !!rd(&la[n])
#define MERGE_CTX(step, rd) \
    do { \
        MERGE(l, end_y, step, rd); \
        MERGE(a, end_x, step, rd); \
    } while (0)

#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
            res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
                                     c, e, p, a[x] + l[y], yscans[txtp], \
                                     ynbs[txtp], y_band_counts, qmul[0]); \
            a[x] = l[y] = !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->eob[n], res); \
            } else { \
                s->eob[n] = res; \
            } \
        } \
    }

#define SPLAT(la, end, step, cond) \
    if (step == 2) { \
        for (n = 1; n < end; n += step) \
            la[n] = la[n - 1]; \
    } else if (step == 4) { \
        if (cond) { \
            for (n = 0; n < end; n += step) \
                AV_WN32A(&la[n], la[n] * 0x01010101); \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
        } \
    } else /* step == 8 */ { \
        if (cond) { \
            if (HAVE_FAST_64BIT) { \
                for (n = 0; n < end; n += step) \
                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
            } else { \
                for (n = 0; n < end; n += step) { \
                    uint32_t v32 = la[n] * 0x01010101; \
                    AV_WN32A(&la[n],     v32); \
                    AV_WN32A(&la[n + 4], v32); \
                } \
            } \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
        } \
    }
#define SPLAT_CTX(step) \
    do { \
        SPLAT(a, end_x, step, end_x == w4); \
        SPLAT(l, end_y, step, end_y == h4); \
    } while (0)

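    /* for transforms larger than 4x4, MERGE_CTX collapses the nnz flags of
     * all covered 4x4 units into the first entry before decoding, and
     * SPLAT_CTX copies the result back over the covered units afterwards */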
    /* y tokens */
    switch (b->tx) {
    case TX_4X4:
        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
        break;
    case TX_8X8:
        MERGE_CTX(2, AV_RN16A);
        DECODE_Y_COEF_LOOP(2, 0,);
        SPLAT_CTX(2);
        break;
    case TX_16X16:
        MERGE_CTX(4, AV_RN32A);
        DECODE_Y_COEF_LOOP(4, 0,);
        SPLAT_CTX(4);
        break;
    case TX_32X32:
        MERGE_CTX(8, AV_RN64A);
        DECODE_Y_COEF_LOOP(8, 0, 32);
        SPLAT_CTX(8);
        break;
    }

#define DECODE_UV_COEF_LOOP(step) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
                                  16 * step * step, c, e, p, a[x] + l[y], \
                                  uvscan, uvnb, uv_band_counts, qmul[1]); \
            a[x] = l[y] = !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->uveob[pl][n], res); \
            } else { \
                s->uveob[pl][n] = res; \
            } \
        } \
    }

    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
    w4 >>= 1;
    h4 >>= 1;
    end_x >>= 1;
    end_y >>= 1;
    for (pl = 0; pl < 2; pl++) {
        a = &s->above_uv_nnz_ctx[pl][col];
        l = &s->left_uv_nnz_ctx[pl][row & 7];
        switch (b->uvtx) {
        case TX_4X4:
            DECODE_UV_COEF_LOOP(1);
            break;
        case TX_8X8:
            MERGE_CTX(2, AV_RN16A);
            DECODE_UV_COEF_LOOP(2);
            SPLAT_CTX(2);
            break;
        case TX_16X16:
            MERGE_CTX(4, AV_RN32A);
            DECODE_UV_COEF_LOOP(4);
            SPLAT_CTX(4);
            break;
        case TX_32X32:
            MERGE_CTX(8, AV_RN64A);
            // a 64x64 (max) uv block can only ever contain one tx32x32 block,
            // so there is no need to loop
            res = decode_coeffs_b32(&s->c, s->uvblock[pl],
                                    1024, c, e, p, a[0] + l[0],
                                    uvscan, uvnb, uv_band_counts, qmul[1]);
            a[0] = l[0] = !!res;
            AV_WN16A(&s->uveob[pl][0], res);
            SPLAT_CTX(8);
            break;
        }
    }
}

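/* prepare the top (*a) and left (l) sample arrays for one intra-predicted
 * transform block: unavailable edge samples are synthesized (replicated
 * from the nearest available pixel, or set to 127/129), and the mode is
 * converted to its edge-aware variant where needed */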
static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
                                             uint8_t *l, int col, int x, int w,
                                             int row, int y, enum TxfmMode tx,
                                             int p)
{
    int have_top = row > 0 || y > 0;
    int have_left = col > s->tiling.tile_col_start || x > 0;
    int have_right = x < w - 1;
    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
                                   { DC_127_PRED,          VERT_PRED } },
        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
                                   { HOR_PRED,             HOR_PRED } },
        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
                                   { LEFT_DC_PRED,         DC_PRED } },
        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
                                   { DC_127_PRED,          VERT_LEFT_PRED } },
        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
                                   { HOR_UP_PRED,          HOR_UP_PRED } },
        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
                                   { HOR_PRED,             TM_VP8_PRED } },
    };
    static const struct {
        uint8_t needs_left:1;
        uint8_t needs_top:1;
        uint8_t needs_topleft:1;
        uint8_t needs_topright:1;
    } edges[N_INTRA_PRED_MODES] = {
        [VERT_PRED]            = { .needs_top  = 1 },
        [HOR_PRED]             = { .needs_left = 1 },
        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
        [HOR_UP_PRED]          = { .needs_left = 1 },
        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
        [LEFT_DC_PRED]         = { .needs_left = 1 },
        [TOP_DC_PRED]          = { .needs_top  = 1 },
        [DC_128_PRED]          = { 0 },
        [DC_127_PRED]          = { 0 },
        [DC_129_PRED]          = { 0 }
    };

    av_assert2(mode >= 0 && mode < 10);
    mode = mode_conv[mode][have_left][have_top];
    if (edges[mode].needs_top) {
        uint8_t *top = NULL;
        uint8_t *topleft = NULL;
        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
        int n_px_need_tr = 0;

        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
            n_px_need_tr = 4;

        // if top of sb64-row, use s->intra_pred_data[] instead of
        // dst[-stride] for intra prediction (it contains pre- instead of
        // post-loopfilter data)
        if (have_top) {
            top = !(row & 7) && !y ?
                s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
            if (have_left)
                topleft = !(row & 7) && !y ?
                    s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
                    &dst_inner[-stride_inner];
        }

        if (have_top &&
            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
            n_px_need + n_px_need_tr <= n_px_have) {
            *a = top;
        } else {
            if (have_top) {
                if (n_px_need <= n_px_have) {
                    memcpy(*a, top, n_px_need);
                } else {
                    memcpy(*a, top, n_px_have);
                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
                           n_px_need - n_px_have);
                }
            } else {
                memset(*a, 127, n_px_need);
            }
            if (edges[mode].needs_topleft) {
                if (have_left && have_top) {
                    (*a)[-1] = topleft[-1];
                } else {
                    (*a)[-1] = have_top ? 129 : 127;
                }
            }
            if (tx == TX_4X4 && edges[mode].needs_topright) {
                if (have_top && have_right &&
                    n_px_need + n_px_need_tr <= n_px_have) {
                    memcpy(&(*a)[4], &top[4], 4);
                } else {
                    memset(&(*a)[4], (*a)[3], 4);
                }
            }
        }
    }
    if (edges[mode].needs_left) {
        if (have_left) {
            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;

            if (n_px_need <= n_px_have) {
                for (i = 0; i < n_px_need; i++)
                    l[n_px_need - 1 - i] = dst[i * stride - 1];
            } else {
                for (i = 0; i < n_px_have; i++)
                    l[n_px_need - 1 - i] = dst[i * stride - 1];
                memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
            }
        } else {
            memset(l, 129, 4 << tx);
        }
    }

    return mode;
}

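/* reconstruct an intra-coded block: per transform block, build the edge
 * arrays, run the (possibly edge-converted) predictor and, unless the
 * block is skipped, add the inverse-transformed residual on top */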
static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
    int uvstep1d = 1 << b->uvtx, p;
    uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;

    LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
    LOCAL_ALIGNED_32(uint8_t, l, [32]);

    for (n = 0, y = 0; y < end_y; y += step1d) {
        uint8_t *ptr = dst, *ptr_r = dst_r;
        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
                               ptr_r += 4 * step1d, n += step) {
            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                               y * 2 + x : 0];
            uint8_t *a = &a_buf[32];
            enum TxfmType txtp = vp9_intra_txfm_type[mode];
            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

            mode = check_intra_mode(s, mode, &a, ptr_r,
                                    s->frames[CUR_FRAME].tf.f->linesize[0],
                                    ptr, s->y_stride, l,
                                    col, x, w4, row, y, b->tx, 0);
            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
            if (eob)
                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
                                           s->block + 16 * n, eob);
        }
        dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
        dst   += 4 * step1d * s->y_stride;
    }

    // U/V
    w4 >>= 1;
    end_x >>= 1;
    end_y >>= 1;
    step = 1 << (b->uvtx * 2);
    for (p = 0; p < 2; p++) {
        dst   = s->dst[1 + p];
        dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
            uint8_t *ptr = dst, *ptr_r = dst_r;
            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
                                   ptr_r += 4 * uvstep1d, n += step) {
                int mode = b->uvmode;
                uint8_t *a = &a_buf[16];
                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                mode = check_intra_mode(s, mode, &a, ptr_r,
                                        s->frames[CUR_FRAME].tf.f->linesize[1],
                                        ptr, s->uv_stride, l,
                                        col, x, w4, row, y, b->uvtx, p + 1);
                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                if (eob)
                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                    s->uvblock[p] + 16 * n, eob);
            }
            dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
            dst   += 4 * uvstep1d * s->uv_stride;
        }
    }
}

mc_luma_dir(VP9Context * s,vp9_mc_func (* mc)[2],uint8_t * dst,ptrdiff_t dst_stride,const uint8_t * ref,ptrdiff_t ref_stride,ThreadFrame * ref_frame,ptrdiff_t y,ptrdiff_t x,const VP56mv * mv,int bw,int bh,int w,int h)2521 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2522                                          uint8_t *dst, ptrdiff_t dst_stride,
2523                                          const uint8_t *ref, ptrdiff_t ref_stride,
2524                                          ThreadFrame *ref_frame,
2525                                          ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2526                                          int bw, int bh, int w, int h)
2527 {
2528     int mx = mv->x, my = mv->y, th;
2529 
2530     y += my >> 3;
2531     x += mx >> 3;
2532     ref += y * ref_stride + x;
2533     mx &= 7;
2534     my &= 7;
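    // the luma MV is in 1/8-pel units: the integer part has been folded into
    // the ref pointer above, and the 3-bit fractional remainder (mx/my in
    // 0..7) selects the subpel filter phase below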
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    th = (y + bh + 4 * !!my + 7) >> 6;
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    if (x < !!mx * 3 || y < !!my * 3 ||
        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref - !!my * 3 * ref_stride - !!mx * 3,
                                 80, ref_stride,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
        ref_stride = 80;
    }
    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
}

static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
                                           uint8_t *dst_u, uint8_t *dst_v,
                                           ptrdiff_t dst_stride,
                                           const uint8_t *ref_u, ptrdiff_t src_stride_u,
                                           const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                           ThreadFrame *ref_frame,
                                           ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
                                           int bw, int bh, int w, int h)
{
    int mx = mv->x, my = mv->y, th;

    y += my >> 4;
    x += mx >> 4;
    ref_u += y * src_stride_u + x;
    ref_v += y * src_stride_v + x;
    mx &= 15;
    my &= 15;
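    // chroma is subsampled 2x in each direction, so the luma 1/8-pel MV is
    // effectively 1/16-pel here: 4 fractional bits (mx/my in 0..15), with the
    // integer part folded into the ref pointers above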
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    th = (y + bh + 4 * !!my + 7) >> 5;
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    if (x < !!mx * 3 || y < !!my * 3 ||
        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
                                 80, src_stride_u,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);

        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
                                 80, src_stride_v,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
    } else {
        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
    }
}

static void inter_recon(AVCodecContext *ctx)
{
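    // maps a block size to the mc[] size index (0 = 64px wide .. 4 = 4px
    // wide); the second row is for chroma, which is subsampled and therefore
    // one step smaller, clamped at the narrowest (4px) filter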
    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
    };
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
    ThreadFrame *tref2 = NULL;
    AVFrame *ref1 = tref1->f;
    AVFrame *ref2 = NULL;
    int w1 = ref1->width, h1 = ref1->height;
    int w2 = 0;
    int h2 = 0;
    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;

    if (b->comp) {
        tref2 = &s->refs[s->refidx[b->ref[1]]];
        ref2 = tref2->f;
        w2 = ref2->width;
        h2 = ref2->height;
    }

    // y inter pred
    if (b->bs > BS_8x8) {
        if (b->bs == BS_8x4) {
            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
                        s->dst[0] + 4 * ls_y, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
                            s->dst[0] + 4 * ls_y, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
            }
        } else if (b->bs == BS_4x8) {
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
            }
        } else {
            av_assert2(b->bs == BS_4x4);

            // FIXME if two horizontally adjacent blocks have the same MV,
            // do a w8 instead of a w4 call
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                        s->dst[0] + 4 * ls_y, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                        s->dst[0] + 4 * ls_y + 4, ls_y,
                        ref1->data[0], ref1->linesize[0], tref1,
                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);

            if (b->comp) {
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                            s->dst[0] + 4 * ls_y, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                            s->dst[0] + 4 * ls_y + 4, ls_y,
                            ref2->data[0], ref2->linesize[0], tref2,
                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
            }
        }
    } else {
        int bwl = bwlog_tab[0][b->bs];
        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;

        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
                    ref1->data[0], ref1->linesize[0], tref1,
                    row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1);

        if (b->comp)
            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
                        ref2->data[0], ref2->linesize[0], tref2,
                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
    }

    // uv inter pred
    {
        int bwl = bwlog_tab[1][b->bs];
        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
        VP56mv mvuv;

        w1 = (w1 + 1) >> 1;
        h1 = (h1 + 1) >> 1;
        if (b->comp) {
            w2 = (w2 + 1) >> 1;
            h2 = (h2 + 1) >> 1;
        }
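        // for sub-8x8 partitions a single MV covers the whole (subsampled)
        // chroma block, so use the rounded average of the four luma
        // sub-block MVs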
        if (b->bs > BS_8x8) {
            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
        } else {
            mvuv = b->mv[0][0];
        }

        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
                      s->dst[1], s->dst[2], ls_uv,
                      ref1->data[1], ref1->linesize[1],
                      ref1->data[2], ref1->linesize[2], tref1,
                      row << 2, col << 2, &mvuv, bw, bh, w1, h1);

        if (b->comp) {
            if (b->bs > BS_8x8) {
                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
            } else {
                mvuv = b->mv[0][1];
            }
            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
                          s->dst[1], s->dst[2], ls_uv,
                          ref2->data[1], ref2->linesize[1],
                          ref2->data[2], ref2->linesize[2], tref2,
                          row << 2, col << 2, &mvuv, bw, bh, w2, h2);
        }
    }

    if (!b->skip) {
        /* mostly copied from intra_recon() */

        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
        int end_x = FFMIN(2 * (s->cols - col), w4);
        int end_y = FFMIN(2 * (s->rows - row), h4);
        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
        int uvstep1d = 1 << b->uvtx, p;
        uint8_t *dst = s->dst[0];

        // y itxfm add
        for (n = 0, y = 0; y < end_y; y += step1d) {
            uint8_t *ptr = dst;
            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

                if (eob)
                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
                                                  s->block + 16 * n, eob);
            }
            dst += 4 * s->y_stride * step1d;
        }

        // uv itxfm add
        end_x >>= 1;
        end_y >>= 1;
        step = 1 << (b->uvtx * 2);
        for (p = 0; p < 2; p++) {
            dst = s->dst[p + 1];
            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
                uint8_t *ptr = dst;
                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                    if (eob)
                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                        s->uvblock[p] + 16 * n, eob);
                }
                dst += 4 * uvstep1d * s->uv_stride;
            }
        }
    }
}

static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
                                        int row_and_7, int col_and_7,
                                        int w, int h, int col_end, int row_end,
                                        enum TxfmMode tx, int skip_inter)
{
    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)

    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
    // edges. This means that for UV, we work on two subsampled blocks at
    // a time, and we only use the topleft block's mode information to set
    // things like block strength. Thus, for any block size smaller than
    // 16x16, ignore the odd portion of the block.
    if (tx == TX_4X4 && is_uv) {
        if (h == 1) {
            if (row_and_7 & 1)
                return;
            if (!row_end)
                h += 1;
        }
        if (w == 1) {
            if (col_and_7 & 1)
                return;
            if (!col_end)
                w += 1;
        }
    }

    if (tx == TX_4X4 && !skip_inter) {
        int t = 1 << col_and_7, m_col = (t << w) - t, y;
        int m_col_odd = (t << (w - 1)) - t;

        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
        if (is_uv) {
            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                int col_mask_id = 2 - !(y & 7);

                lflvl->mask[is_uv][0][y][1] |= m_row_8;
                lflvl->mask[is_uv][0][y][2] |= m_row_4;
                // for odd lines, if the odd col is not being filtered,
                // skip odd row also:
                // .---. <-- a
                // |   |
                // |___| <-- b
                // ^   ^
                // c   d
                //
                // if a/c are even row/col and b/d are odd, and d is skipped,
                // e.g. right edge of size-66x66.webm, then skip b also (bug)
                if ((col_end & 1) && (y & 1)) {
                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
                } else {
                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
                }
            }
        } else {
            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                int col_mask_id = 2 - !(y & 3);

                lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
                lflvl->mask[is_uv][0][y][2] |= m_row_4;
                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
                lflvl->mask[is_uv][0][y][3] |= m_col;
                lflvl->mask[is_uv][1][y][3] |= m_col;
            }
        }
    } else {
        int y, t = 1 << col_and_7, m_col = (t << w) - t;

        if (!skip_inter) {
            int mask_id = (tx == TX_8X8);
            int l2 = tx + is_uv - 1, step1d = 1 << l2;
            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
            int m_row = m_col & masks[l2];

            // at odd UV col/row edges of tx16/tx32 blocks, force the 8-wide
            // loopfilter so we don't filter off the visible edge
            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                int m_row_8 = m_row - m_row_16;

                for (y = row_and_7; y < h + row_and_7; y++) {
                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
                }
            } else {
                for (y = row_and_7; y < h + row_and_7; y++)
                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
            }

            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
                    lflvl->mask[is_uv][1][y][0] |= m_col;
                if (y - row_and_7 == h - 1)
                    lflvl->mask[is_uv][1][y][1] |= m_col;
            } else {
                for (y = row_and_7; y < h + row_and_7; y += step1d)
                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
            }
        } else if (tx != TX_4X4) {
            int mask_id;

            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
            for (y = row_and_7; y < h + row_and_7; y++)
                lflvl->mask[is_uv][0][y][mask_id] |= t;
        } else if (is_uv) {
            int t8 = t & 0x01, t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                lflvl->mask[is_uv][0][y][2] |= t4;
                lflvl->mask[is_uv][0][y][1] |= t8;
            }
            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
        } else {
            int t8 = t & 0x11, t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                lflvl->mask[is_uv][0][y][2] |= t4;
                lflvl->mask[is_uv][0][y][1] |= t8;
            }
            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
        }
    }
}

static void decode_b(AVCodecContext *ctx, int row, int col,
                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                     enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    enum BlockSize bs = bl * 3 + bp;
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->frames[CUR_FRAME].tf.f;

    s->row = row;
    s->row7 = row & 7;
    s->col = col;
    s->col7 = col & 7;
    s->min_mv.x = -(128 + col * 64);
    s->min_mv.y = -(128 + row * 64);
    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(ctx);
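        // derive the chroma transform size: step down one size when the luma
        // tx would be wider or taller than the (subsampled) chroma block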
        b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));

        if (!b->skip) {
            decode_coeffs(ctx);
        } else {
            int row7 = s->row7;

#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1:  v = 0;          break; \
    case 2:  AV_ZERO16(&v);  break; \
    case 4:  AV_ZERO32(&v);  break; \
    case 8:  AV_ZERO64(&v);  break; \
    case 16: AV_ZERO128(&v); break; \
    }
#define SPLAT_ZERO_YUV(dir, var, off, n) \
    do { \
        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
    } while (0)

            switch (w4) {
            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
            }
        }
        if (s->pass == 1) {
            s->b++;
            s->block += w4 * h4 * 64;
            s->uvblock[0] += w4 * h4 * 16;
            s->uvblock[1] += w4 * h4 * 16;
            s->eob += 4 * w4 * h4;
            s->uveob[0] += w4 * h4;
            s->uveob[1] += w4 * h4;

            return;
        }
    }

    // use the emulation buffers if the target frame's stride can't hold the
    // block's overhang; this keeps emu-edge and similar cases working even
    // with large block overhangs
    emu[0] = (col + w4) * 8 > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = (col + w4) * 4 > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
        s->y_stride = 64;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
        s->uv_stride = 32;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        intra_recon(ctx, yoff, uvoff);
    } else {
        inter_recon(ctx);
    }
    if (emu[0]) {
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
                                         s->tmp_y + o, 64, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;

        for (n = 1; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
                                         s->tmp_uv[0] + o, 32, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
                                         s->tmp_uv[1] + o, 32, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to
    if (s->filter.level &&
        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                    [b->mode[3] != ZEROMV]) > 0) {
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;

        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
        mask_edges(lflvl, 1, row7, col7, x_end, y_end,
                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                   b->uvtx, skip_inter);

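        // lazily fill the limit LUTs for this filter level: "limit" is the
        // level attenuated by the sharpness setting, and "mblim" additionally
        // includes the 2 * (level + 2) macroblock-edge term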
        if (!s->filter.lim_lut[lvl]) {
            int sharp = s->filter.sharpness;
            int limit = lvl;

            if (sharp > 0) {
                limit >>= (sharp + 3) >> 2;
                limit = FFMIN(limit, 9 - sharp);
            }
            limit = FFMAX(limit, 1);

            s->filter.lim_lut[lvl] = limit;
            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
        }
    }

    if (s->pass == 2) {
        s->b++;
        s->block += w4 * h4 * 64;
        s->uvblock[0] += w4 * h4 * 16;
        s->uvblock[1] += w4 * h4 * 16;
        s->eob += 4 * w4 * h4;
        s->uveob[0] += w4 * h4;
        s->uveob[1] += w4 * h4;
    }
}

static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    VP9Context *s = ctx->priv_data;
    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
    const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
                                     s->prob.p.partition[bl][c];
    enum BlockPartition bp;
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];

    if (bl == BL_8X8) {
        bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
        decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
    } else if (col + hbs < s->cols) { // FIXME why not <=?
        if (row + hbs < s->rows) { // FIXME why not <=?
            bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
            switch (bp) {
            case PARTITION_NONE:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_H:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_V:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8;
                uvoff += hbs * 4;
                decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_SPLIT:
                decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row, col + hbs, lflvl,
                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row + hbs, col + hbs, lflvl,
                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
                break;
            default:
                av_assert0(0);
            }
        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
            bp = PARTITION_SPLIT;
            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
            decode_sb(ctx, row, col + hbs, lflvl,
                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
        } else {
            bp = PARTITION_H;
            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else if (row + hbs < s->rows) { // FIXME why not <=?
        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
            bp = PARTITION_SPLIT;
            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        } else {
            bp = PARTITION_V;
            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else {
        bp = PARTITION_SPLIT;
        decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
    }
    s->counts.partition[bl][c][bp]++;
}

static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];

    if (bl == BL_8X8) {
        av_assert2(b->bl == BL_8X8);
        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
    } else if (s->b->bl == bl) {
        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
        if (b->bp == PARTITION_H && row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
        } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
            yoff  += hbs * 8;
            uvoff += hbs * 4;
            decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
        }
    } else {
        decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
        if (col + hbs < s->cols) { // FIXME why not <=?
            if (row + hbs < s->rows) {
                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
                              uvoff + 4 * hbs, bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 4 * uv_stride;
                decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
                              yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
            } else {
                yoff  += hbs * 8;
                uvoff += hbs * 4;
                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
            }
        } else if (row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 4 * uv_stride;
            decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        }
    }
}

static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
{
    VP9Context *s = ctx->priv_data;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
    int y, x, p;

    // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
    // if you think of them as acting on a 8x8 block max, we can interleave
    // each v/h within the single x loop, but that only works if we work on
    // 8 pixel blocks, and we won't always do that (we want at least 16px
    // to use SSE2 optimizations, perhaps 32 for AVX2)

    // filter edges between columns, Y plane (e.g. block1 | block2)
    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;

        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
            if (hm1 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) {
                    if (hmask1[0] & x) {
                        if (hmask2[0] & x) {
                            av_assert2(l[8] == L);
                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
                        }
                    } else if (hm2 & x) {
                        L = l[8];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                               [!!(hmask2[1] & x)]
                                               [0](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                            [0](ptr, ls_y, E, I, H);
                    }
                }
            } else if (hm2 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) {
                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
                }
            }
            if (hm13 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (hm23 & x) {
                    L = l[8];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
                }
            } else if (hm23 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
            }
        }
    }

    //                                          block1
    // filter edges between rows, Y plane (e.g. ------)
    //                                          block2
    dst = f->data[0] + yoff;
    lvl = lflvl->level;
    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];

        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
            if (row || y) {
                if (vm & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    if (vmask[0] & x) {
                        if (vmask[0] & (x << 1)) {
                            av_assert2(l[1] == L);
                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
                        }
                    } else if (vm & (x << 1)) {
                        L = l[1];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                               [!!(vmask[1] & (x << 1))]
                                               [1](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                            [1](ptr, ls_y, E, I, H);
                    }
                } else if (vm & (x << 1)) {
                    int L = l[1], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
                                        [1](ptr + 8, ls_y, E, I, H);
                }
            }
            if (vm3 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (vm3 & (x << 1)) {
                    L = l[1];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
                }
            } else if (vm3 & (x << 1)) {
                int L = l[1], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
            }
        }
    }

    // same principle but for U/V planes
    for (p = 0; p < 2; p++) {
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;

            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
                if (col || x > 1) {
                    if (hm1 & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (hmask1[0] & x) {
                            if (hmask2[0] & x) {
                                av_assert2(l[16] == L);
                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
                            }
                        } else if (hm2 & x) {
                            L = l[16];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                   [!!(hmask2[1] & x)]
                                                   [0](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                                [0](ptr, ls_uv, E, I, H);
                        }
                    } else if (hm2 & x) {
                        int L = l[16], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
                    }
                }
                if (x & 0xAA)
                    l += 2;
            }
        }
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
            unsigned vm = vmask[0] | vmask[1] | vmask[2];

            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
                if (row || y) {
                    if (vm & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (vmask[0] & x) {
                            if (vmask[0] & (x << 2)) {
                                av_assert2(l[2] == L);
                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
                            }
                        } else if (vm & (x << 2)) {
                            L = l[2];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                                   [!!(vmask[1] & (x << 2))]
                                                   [1](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                                [1](ptr, ls_uv, E, I, H);
                        }
                    } else if (vm & (x << 2)) {
                        int L = l[2], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
                                            [1](ptr + 8, ls_uv, E, I, H);
                    }
                }
            }
            if (y & 1)
                lvl += 16;
        }
    }
}

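// computes the half-open [start, end) extent of tile number idx out of
// 2^log2_n tiles covering n superblocks, converted to 8px-block units via
// the << 3; e.g. (illustrative values) log2_n = 1 and n = 11 sb-cols gives
// tile 0 = sb 0..4 ([0, 40)) and tile 1 = sb 5..10 ([40, 88))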
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_start = ( idx      * n) >> log2_n;
    int sb_end   = ((idx + 1) * n) >> log2_n;
    *start = FFMIN(sb_start, n) << 3;
    *end   = FFMIN(sb_end,   n) << 3;
}

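// blends *p toward p2, the rounded empirical probability of taking branch 0
// (256 * ct0 / ct), by update_factor/256, scaled down further when few
// counts were seen; e.g. (illustrative values, not from any bitstream)
// ct0 = 30, ct1 = 10 gives p2 = 192, and with a saturated count and
// update_factor 128 the new *p lands halfway between the old value and 192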
static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
                                        int max_count, int update_factor)
{
    unsigned ct = ct0 + ct1, p2, p1;

    if (!ct)
        return;

    p1 = *p;
    p2 = ((ct0 << 8) + (ct >> 1)) / ct;
    p2 = av_clip(p2, 1, 255);
    ct = FFMIN(ct, max_count);
    update_factor = FASTDIV(update_factor * ct, max_count);

    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
}

static void adapt_probs(VP9Context *s)
{
    int i, j, k, l, m;
    prob_context *p = &s->prob_ctx[s->framectxid].p;
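    // use the smaller coefficient update factor (112/256 instead of 128/256)
    // unless the previous frame was a keyframe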
3488     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3489 
3490     // coefficients
3491     for (i = 0; i < 4; i++)
3492         for (j = 0; j < 2; j++)
3493             for (k = 0; k < 2; k++)
3494                 for (l = 0; l < 6; l++)
3495                     for (m = 0; m < 6; m++) {
3496                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3497                         unsigned *e = s->counts.eob[i][j][k][l][m];
3498                         unsigned *c = s->counts.coef[i][j][k][l][m];
3499 
3500                         if (l == 0 && m >= 3) // dc only has 3 pt
3501                             break;
3502 
3503                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3504                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3505                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3506                     }
3507 
3508     if (s->keyframe || s->intraonly) {
3509         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3510         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3511         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3512         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3513         return;
3514     }
3515 
3516     // skip flag
3517     for (i = 0; i < 3; i++)
3518         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3519 
3520     // intra/inter flag
3521     for (i = 0; i < 4; i++)
3522         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3523 
3524     // comppred flag
3525     if (s->comppredmode == PRED_SWITCHABLE) {
3526       for (i = 0; i < 5; i++)
3527           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3528     }
3529 
3530     // reference frames
3531     if (s->comppredmode != PRED_SINGLEREF) {
3532       for (i = 0; i < 5; i++)
3533           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3534                      s->counts.comp_ref[i][1], 20, 128);
3535     }
3536 
3537     if (s->comppredmode != PRED_COMPREF) {
3538       for (i = 0; i < 5; i++) {
3539           uint8_t *pp = p->single_ref[i];
3540           unsigned (*c)[2] = s->counts.single_ref[i];
3541 
3542           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3543           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3544       }
3545     }
3546 
3547     // block partitioning
3548     for (i = 0; i < 4; i++)
3549         for (j = 0; j < 4; j++) {
3550             uint8_t *pp = p->partition[i][j];
3551             unsigned *c = s->counts.partition[i][j];
3552 
3553             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3554             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3555             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3556         }
3557 
3558     // tx size
3559     if (s->txfmmode == TX_SWITCHABLE) {
3560       for (i = 0; i < 2; i++) {
3561           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3562 
3563           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3564           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3565           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3566           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3567           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3568           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3569       }
3570     }
3571 
3572     // interpolation filter
3573     if (s->filtermode == FILTER_SWITCHABLE) {
3574         for (i = 0; i < 4; i++) {
3575             uint8_t *pp = p->filter[i];
3576             unsigned *c = s->counts.filter[i];
3577 
3578             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3579             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3580         }
3581     }
3582 
3583     // inter modes
3584     for (i = 0; i < 7; i++) {
3585         uint8_t *pp = p->mv_mode[i];
3586         unsigned *c = s->counts.mv_mode[i];
3587 
3588         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3589         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3590         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3591     }
3592 
3593     // mv joints
3594     {
3595         uint8_t *pp = p->mv_joint;
3596         unsigned *c = s->counts.mv_joint;
3597 
3598         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3599         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3600         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3601     }
3602 
3603     // mv components
3604     for (i = 0; i < 2; i++) {
3605         uint8_t *pp;
3606         unsigned *c, (*c2)[2], sum;
3607 
3608         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3609                    s->counts.mv_comp[i].sign[1], 20, 128);
3610 
3611         pp = p->mv_comp[i].classes;
3612         c = s->counts.mv_comp[i].classes;
3613         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3614         adapt_prob(&pp[0], c[0], sum, 20, 128);
3615         sum -= c[1];
3616         adapt_prob(&pp[1], c[1], sum, 20, 128);
3617         sum -= c[2] + c[3];
3618         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3619         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3620         sum -= c[4] + c[5];
3621         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3622         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3623         sum -= c[6];
3624         adapt_prob(&pp[6], c[6], sum, 20, 128);
3625         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3626         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3627         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3628 
3629         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3630                    s->counts.mv_comp[i].class0[1], 20, 128);
3631         pp = p->mv_comp[i].bits;
3632         c2 = s->counts.mv_comp[i].bits;
3633         for (j = 0; j < 10; j++)
3634             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3635 
3636         for (j = 0; j < 2; j++) {
3637             pp = p->mv_comp[i].class0_fp[j];
3638             c = s->counts.mv_comp[i].class0_fp[j];
3639             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3640             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3641             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3642         }
3643         pp = p->mv_comp[i].fp;
3644         c = s->counts.mv_comp[i].fp;
3645         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3646         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3647         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3648 
3649         if (s->highprecisionmvs) {
3650             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3651                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3652             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3653                        s->counts.mv_comp[i].hp[1], 20, 128);
3654         }
3655     }
3656 
3657     // y intra modes
3658     for (i = 0; i < 4; i++) {
3659         uint8_t *pp = p->y_mode[i];
3660         unsigned *c = s->counts.y_mode[i], sum, s2;
3661 
3662         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3663         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3664         sum -= c[TM_VP8_PRED];
3665         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3666         sum -= c[VERT_PRED];
3667         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3668         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3669         sum -= s2;
3670         adapt_prob(&pp[3], s2, sum, 20, 128);
3671         s2 -= c[HOR_PRED];
3672         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3673         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3674         sum -= c[DIAG_DOWN_LEFT_PRED];
3675         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3676         sum -= c[VERT_LEFT_PRED];
3677         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3678         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3679     }
3680 
3681     // uv intra modes
    for (i = 0; i < 10; i++) {
        uint8_t *pp = p->uv_mode[i];
        unsigned *c = s->counts.uv_mode[i], sum, s2;

        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
        sum -= c[TM_VP8_PRED];
        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
        sum -= c[VERT_PRED];
        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
        sum -= s2;
        adapt_prob(&pp[3], s2, sum, 20, 128);
        s2 -= c[HOR_PRED];
        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
        sum -= c[DIAG_DOWN_LEFT_PRED];
        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
        sum -= c[VERT_LEFT_PRED];
        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
    }
}

static void free_buffers(VP9Context *s)
{
    av_freep(&s->intra_pred_data[0]);
    av_freep(&s->b_base);
    av_freep(&s->block_base);
}

static av_cold int vp9_decode_free(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int i;

    for (i = 0; i < 2; i++) {
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(ctx, &s->frames[i]);
        av_frame_free(&s->frames[i].tf.f);
    }
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        av_frame_free(&s->refs[i].f);
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        av_frame_free(&s->next_refs[i].f);
    }
    free_buffers(s);
    av_freep(&s->c_b);
    s->c_b_size = 0;

    return 0;
}

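/*
 * Decode one VP9 packet: parse the frame header, rotate the frame and
 * reference buffers, then decode all tiles row by row. With frame
 * threading and backward probability adaptation this runs in two passes
 * (pass 1 parses, pass 2 reconstructs).
 */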
static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                            int *got_frame, AVPacket *pkt)
{
    const uint8_t *data = pkt->data;
    int size = pkt->size;
    VP9Context *s = ctx->priv_data;
    int res, tile_row, tile_col, i;
    int ref = 0;
    int row, col;
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
    AVFrame *f;

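    // decode_frame_header() returns the number of header bytes consumed,
    // 0 for a packet that merely re-displays an already decoded reference
    // ("show existing frame"), or a negative error code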
    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
        return res;
    } else if (res == 0) {
        if (!s->refs[ref].f->data[0]) {
            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        }
        if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
            return res;
        *got_frame = 1;
        return 0;
    }
    data += res;
    size -= res;

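    // rotate the current frame into the LAST_FRAME slot, so that its mvs
    // and segmentation map stay available while decoding the new frame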
    if (s->frames[LAST_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
    if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
        (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
        return res;
    if (s->frames[CUR_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
    if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
        return res;
    f = s->frames[CUR_FRAME].tf.f;
    f->key_frame = s->keyframe;
    f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    ls_y = f->linesize[0];
    ls_uv = f->linesize[1];

    // ref frame setup
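    // each of the 8 reference slots either takes the new frame (if its
    // bit in refreshrefmask is set) or carries the old reference over;
    // the result is staged in next_refs until the frame has decoded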
    for (i = 0; i < 8; i++) {
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        if (s->refreshrefmask & (1 << i)) {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
        } else {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
        }
        if (res < 0)
            return res;
    }

    // main tile decode loop
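    // the "above" contexts cover the full frame width and are reset once
    // per frame; the "left" contexts are reset per tile column below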
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->keyframe || s->intraonly) {
        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
    } else {
        memset(s->above_mode_ctx, NEARESTMV, s->cols);
    }
    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
    memset(s->above_segpred_ctx, 0, s->cols);
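    // two-pass decoding is used when frame threading is combined with
    // backward probability adaptation (refreshctx && !parallelmode):
    // pass 1 only parses the bitstream, pass 2 does the reconstruction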
    s->pass = s->uses_2pass =
        ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
    if ((res = update_block_buffers(ctx)) < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Failed to allocate block buffers\n");
        return res;
    }
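    // in parallel mode the frame context is saved from the forward
    // (header-signalled) probability updates right away, so dependent
    // frame threads need not wait for backward adaptation; only coef
    // probabilities for tx sizes up to the active txfmmode are copied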
    if (s->refreshctx && s->parallelmode) {
        int j, k, l, m;

        for (i = 0; i < 4; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++)
                            memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
                                   s->prob.coef[i][j][k][l][m], 3);
            if (s->txfmmode == i)
                break;
        }
        s->prob_ctx[s->framectxid].p = s->prob.p;
        ff_thread_finish_setup(ctx);
    }

    do {
        yoff = uvoff = 0;
        s->b = s->b_base;
        s->block = s->block_base;
        s->uvblock[0] = s->uvblock_base[0];
        s->uvblock[1] = s->uvblock_base[1];
        s->eob = s->eob_base;
        s->uveob[0] = s->uveob_base[0];
        s->uveob[1] = s->uveob_base[1];

        for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
            set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
                            tile_row, s->tiling.log2_tile_rows, s->sb_rows);
            if (s->pass != 2) {
                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                    unsigned tile_size;

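                    // every tile except the very last one in the frame is
                    // prefixed with a 32-bit big-endian byte count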
                    if (tile_col == s->tiling.tile_cols - 1 &&
                        tile_row == s->tiling.tile_rows - 1) {
                        tile_size = size;
                    } else {
                        tile_size = AV_RB32(data);
                        data += 4;
                        size -= 4;
                    }
                    if (tile_size > size) {
                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    data += tile_size;
                    size -= tile_size;
                }
            }

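            // decode one row of 64x64 superblocks per iteration; row/col
            // are in 8x8 block units, hence steps of 8 and pixel offsets
            // of 64 (luma) and 32 (4:2:0 chroma)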
            for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
                struct VP9Filter *lflvl_ptr = s->lflvl;
                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;

                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                    set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
                                    tile_col, s->tiling.log2_tile_cols, s->sb_cols);

                    if (s->pass != 2) {
                        memset(s->left_partition_ctx, 0, 8);
                        memset(s->left_skip_ctx, 0, 8);
                        if (s->keyframe || s->intraonly) {
                            memset(s->left_mode_ctx, DC_PRED, 16);
                        } else {
                            memset(s->left_mode_ctx, NEARESTMV, 8);
                        }
                        memset(s->left_y_nnz_ctx, 0, 16);
                        memset(s->left_uv_nnz_ctx, 0, 16);
                        memset(s->left_segpred_ctx, 0, 8);

                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
                    }

                    for (col = s->tiling.tile_col_start;
                         col < s->tiling.tile_col_end;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        // FIXME integrate with lf code (i.e. zero after each
                        // use, similar to invtxfm coefficients, or similar)
                        if (s->pass != 1) {
                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
                        }

                        if (s->pass == 2) {
                            decode_sb_mem(ctx, row, col, lflvl_ptr,
                                          yoff2, uvoff2, BL_64X64);
                        } else {
                            decode_sb(ctx, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                        }
                    }
                    if (s->pass != 2) {
                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
                    }
                }

                if (s->pass == 1) {
                    continue;
                }

                // backup pre-loopfilter reconstruction data for intra
                // prediction of next row of sb64s
                if (row + 8 < s->rows) {
                    memcpy(s->intra_pred_data[0],
                           f->data[0] + yoff + 63 * ls_y,
                           8 * s->cols);
                    memcpy(s->intra_pred_data[1],
                           f->data[1] + uvoff + 31 * ls_uv,
                           4 * s->cols);
                    memcpy(s->intra_pred_data[2],
                           f->data[2] + uvoff + 31 * ls_uv,
                           4 * s->cols);
                }

                // loopfilter one row
                if (s->filter.level) {
                    yoff2 = yoff;
                    uvoff2 = uvoff;
                    lflvl_ptr = s->lflvl;
                    for (col = 0; col < s->cols;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
                    }
                }

                // FIXME maybe we can make this more finegrained by running the
                // loopfilter per-block instead of after each sbrow
                // In fact that would also make intra pred left preparation easier?
                ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
            }
        }

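        // backward adaptation: fold the symbol counts gathered during
        // this pass into the frame context, then unblock any frame
        // threads waiting on it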
        if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
            adapt_probs(s);
            ff_thread_finish_setup(ctx);
        }
    } while (s->pass++ == 1);
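    // the frame is now fully decoded and loop-filtered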
    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);

    // ref frame setup
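    // promote the staged references now that the frame decoded successfully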
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
    }

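    // invisible (show_frame=0) frames are decoded and may be referenced,
    // but are not returned to the caller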
    if (!s->invisible) {
        if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
            return res;
        *got_frame = 1;
    }

    return 0;
}

static void vp9_decode_flush(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int i;

    for (i = 0; i < 2; i++)
        vp9_unref_frame(ctx, &s->frames[i]);
    for (i = 0; i < 8; i++)
        ff_thread_release_buffer(ctx, &s->refs[i]);
}

static int init_frames(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;
    int i;

    for (i = 0; i < 2; i++) {
        s->frames[i].tf.f = av_frame_alloc();
        if (!s->frames[i].tf.f) {
            vp9_decode_free(ctx);
            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
            return AVERROR(ENOMEM);
        }
    }
    for (i = 0; i < 8; i++) {
        s->refs[i].f = av_frame_alloc();
        s->next_refs[i].f = av_frame_alloc();
        if (!s->refs[i].f || !s->next_refs[i].f) {
            vp9_decode_free(ctx);
            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
            return AVERROR(ENOMEM);
        }
    }

    return 0;
}

static av_cold int vp9_decode_init(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;

    ctx->internal->allocate_progress = 1;
    ctx->pix_fmt = AV_PIX_FMT_YUV420P;
    ff_vp9dsp_init(&s->dsp);
    ff_videodsp_init(&s->vdsp, 8);
    s->filter.sharpness = -1;

    return init_frames(ctx);
}

static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
{
    return init_frames(avctx);
}

static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
{
    int i, res;
    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;

    // detect size changes in other threads
    if (s->intra_pred_data[0] &&
        (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
        free_buffers(s);
    }

    for (i = 0; i < 2; i++) {
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(dst, &s->frames[i]);
        if (ssrc->frames[i].tf.f->data[0]) {
            if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
                return res;
        }
    }
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(dst, &s->refs[i]);
        if (ssrc->next_refs[i].f->data[0]) {
            if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
                return res;
        }
    }

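    // copy the header fields and probability state that must persist
    // across frames for the next frame thread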
    s->invisible = ssrc->invisible;
    s->keyframe = ssrc->keyframe;
    s->uses_2pass = ssrc->uses_2pass;
    memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
    memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
    if (ssrc->segmentation.enabled) {
        memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
               sizeof(s->segmentation.feat));
    }

    return 0;
}

AVCodec ff_vp9_decoder = {
    .name                  = "vp9",
    .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP9,
    .priv_data_size        = sizeof(VP9Context),
    .init                  = vp9_decode_init,
    .close                 = vp9_decode_free,
    .decode                = vp9_decode_frame,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
    .flush                 = vp9_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
};