2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 #include "libavutil/avassert.h"
34 #include "libavutil/pixdesc.h"
36 #define VP9_SYNCCODE 0x498342
40 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
41 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block state: every mode/partition decision parsed from the bitstream
// for one block, consumed later during reconstruction.
44 typedef struct VP9Block {
// segment id, intra flag, compound-prediction flag, reference indices,
// per-sub-block prediction modes (presumably luma — confirm), chroma mode,
// and the skip flag
45 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
// interpolation filter used for inter prediction
46 enum FilterMode filter;
// one motion vector per sub-block (b_idx) and per reference (2 for compound)
47 VP56mv mv[4 /* b_idx */][2 /* ref */];
// luma and chroma transform sizes
49 enum TxfmMode tx, uvtx;
// how this block was produced by the superblock partition tree
51 enum BlockPartition bp;
// Main VP9 decoder context (lives in AVCodecContext.priv_data). Only part of
// the struct is visible in this chunk; several nested struct declarations
// around the probability/counts tables below are missing from view.
54 typedef struct VP9Context {
// current block position in 8x8-block units; row7/col7 are presumably the
// position modulo 8 within a 64x64 superblock — TODO confirm
65 int row, row7, col, col7;
67 ptrdiff_t y_stride, uv_stride;
// bit-depth bookkeeping; bytesperpixel is derived as (bpp + 7) >> 3
70 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
71 uint8_t last_keyframe;
// current and previously-negotiated pixel format (used to detect changes)
72 enum AVPixelFormat pix_fmt, last_fmt;
73 ThreadFrame next_refs[8];
77 uint8_t mblim_lut[64];
// extents (in block units) of the tile currently being decoded
79 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
// frame size in 64x64 superblocks (sb_*) and in 8x8 blocks (rows/cols)
80 unsigned sb_cols, sb_rows, rows, cols;
// coefficient probabilities; the [3] and [11] variants belong to different
// nested structs (model vs. full token probabilities) whose declarations
// are not visible in this chunk
83 uint8_t coef[4][2][2][6][6][3];
87 uint8_t coef[4][2][2][6][6][11];
// symbol counts collected while decoding, used for backward adaptation
90 unsigned y_mode[4][10];
91 unsigned uv_mode[10][10];
92 unsigned filter[4][3];
93 unsigned mv_mode[7][4];
96 unsigned single_ref[5][2][2];
97 unsigned comp_ref[5][2];
102 unsigned mv_joint[4];
105 unsigned classes[11];
107 unsigned bits[10][2];
108 unsigned class0_fp[2][4];
110 unsigned class0_hp[2];
113 unsigned partition[4][4][4];
114 unsigned coef[4][2][2][6][6][3];
115 unsigned eob[4][2][2][6][6][2];
118 // contextual (left/above) cache
119 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
120 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
121 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
122 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
123 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
124 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
125 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
126 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
127 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
128 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
129 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
130 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// the "above" context rows are heap pointers sized per frame width; they are
// carved out of one allocation in update_size()
131 uint8_t *above_partition_ctx;
132 uint8_t *above_mode_ctx;
133 // FIXME maybe merge some of the below in a flags field?
134 uint8_t *above_y_nnz_ctx;
135 uint8_t *above_uv_nnz_ctx[2];
136 uint8_t *above_skip_ctx; // 1bit
137 uint8_t *above_txfm_ctx; // 2bit
138 uint8_t *above_segpred_ctx; // 1bit
139 uint8_t *above_intra_ctx; // 1bit
140 uint8_t *above_comp_ctx; // 1bit
141 uint8_t *above_ref_ctx; // 2bit
142 uint8_t *above_filter_ctx;
143 VP56mv (*above_mv_ctx)[2];
// whole-frame cache: intra edge pixels and per-superblock loopfilter masks
146 uint8_t *intra_pred_data[3];
147 struct VP9Filter *lflvl;
148 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
150 // block reconstruction intermediates
151 int block_alloc_using_2pass;
152 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
153 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
// motion-vector clamping window for the current block (see clamp_mv below)
154 struct { int x, y; } min_mv, max_mv;
155 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
156 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
// per-reference MV scaling factors (14-bit fixed point) and step sizes,
// set up in decode_frame_header() for scaled references
157 uint16_t mvscale[3][2];
158 uint8_t mvstep[3][2];
// Block {width, height} per block size, indexed [units][bs][w/h]. Judging by
// the values (64x64 -> {16,16} in the first table, {8,8} in the second), the
// first table is in 4x4-block units and the second in 8x8-block units —
// confirm against N_BS_SIZES ordering.
161 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
163 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
164 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
166 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
167 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
171 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
173 ff_thread_release_buffer(ctx, &f->tf);
174 av_buffer_unref(&f->extradata);
175 av_buffer_unref(&f->hwaccel_priv_buf);
176 f->segmentation_map = NULL;
177 f->hwaccel_picture_private = NULL;
// Allocate a new frame: picture buffer via the frame-threading API, plus one
// extradata buffer holding the segmentation map (1 byte per 8x8 block)
// followed by the MV reference pairs. Returns 0 or a negative AVERROR; on
// failure everything allocated so far is released again.
180 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
182 VP9Context *s = ctx->priv_data;
185 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
// one entry per 8x8 block: 64 blocks per superblock
187 sz = 64 * s->sb_cols * s->sb_rows;
// single allocation: sz bytes of segmentation map + sz VP9mvrefPair entries
188 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
192 f->segmentation_map = f->extradata->data;
193 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
// optional hwaccel per-picture private data
196 const AVHWAccel *hwaccel = ctx->hwaccel;
197 av_assert0(!f->hwaccel_picture_private);
198 if (hwaccel->frame_priv_data_size) {
199 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
200 if (!f->hwaccel_priv_buf)
202 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
// error path: undo partial allocations
209 vp9_unref_frame(ctx, f);
210 return AVERROR(ENOMEM);
// Make dst a new reference to src: ref the picture buffer and extradata
// buffer, copy the aliasing raw pointers, and ref the hwaccel private buffer
// if present. Returns 0 or a negative AVERROR; dst is fully unreferenced on
// failure.
213 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
217 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
219 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
// raw pointers point into the (now shared) extradata buffer
223 dst->segmentation_map = src->segmentation_map;
225 dst->uses_2pass = src->uses_2pass;
227 if (src->hwaccel_picture_private) {
228 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
229 if (!dst->hwaccel_priv_buf)
231 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
// error path: release whatever was referenced before the failure
237 vp9_unref_frame(ctx, dst);
238 return AVERROR(ENOMEM);
// (Re)configure the decoder for a new frame size and/or pixel format:
// negotiate the output format (offering hwaccel formats where applicable),
// recompute block/superblock dimensions, and reallocate the width-dependent
// "above" context arrays plus loopfilter/intra-pred scratch in a single
// allocation. Returns 0 or a negative AVERROR.
241 static int update_size(AVCodecContext *ctx, int w, int h)
243 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL)
244 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
245 VP9Context *s = ctx->priv_data;
247 int bytesperpixel = s->bytesperpixel, res;
249 av_assert0(w > 0 && h > 0);
// fast path: nothing changed since last frame, keep existing buffers
251 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && s->pix_fmt == s->last_fmt)
254 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
// hwaccels are only offered for 8-bit 4:2:0 content
257 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
258 #if CONFIG_VP9_DXVA2_HWACCEL
259 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
261 #if CONFIG_VP9_D3D11VA_HWACCEL
262 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
// software format last, then the AV_PIX_FMT_NONE terminator
266 *fmtp++ = s->pix_fmt;
267 *fmtp = AV_PIX_FMT_NONE;
269 res = ff_thread_get_format(ctx, pix_fmts);
274 s->last_fmt = s->pix_fmt;
// superblock (64x64) and block (8x8) grid dimensions, rounded up
275 s->sb_cols = (w + 63) >> 6;
276 s->sb_rows = (h + 63) >> 6;
277 s->cols = (w + 7) >> 3;
278 s->rows = (h + 7) >> 3;
// carve consecutive arrays out of one allocation; p advances past each
280 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
281 av_freep(&s->intra_pred_data[0]);
282 // FIXME we slightly over-allocate here for subsampled chroma, but a little
283 // bit of padding shouldn't affect performance...
284 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
285 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
287 return AVERROR(ENOMEM);
288 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
289 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
290 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
291 assign(s->above_y_nnz_ctx, uint8_t *, 16);
292 assign(s->above_mode_ctx, uint8_t *, 16);
293 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
294 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
295 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
296 assign(s->above_partition_ctx, uint8_t *, 8);
297 assign(s->above_skip_ctx, uint8_t *, 8);
298 assign(s->above_txfm_ctx, uint8_t *, 8);
299 assign(s->above_segpred_ctx, uint8_t *, 8);
300 assign(s->above_intra_ctx, uint8_t *, 8);
301 assign(s->above_comp_ctx, uint8_t *, 8);
302 assign(s->above_ref_ctx, uint8_t *, 8);
303 assign(s->above_filter_ctx, uint8_t *, 8);
304 assign(s->lflvl, struct VP9Filter *, 1);
307 // these will be re-allocated a little later
308 av_freep(&s->b_base);
309 av_freep(&s->block_base);
// bit depth changed: reinitialize the DSP function tables
311 if (s->bpp != s->last_bpp) {
312 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
313 ff_videodsp_init(&s->vdsp, s->bpp);
314 s->last_bpp = s->bpp;
// (Re)allocate the per-block reconstruction buffers (coefficient blocks and
// EOB arrays for luma + both chroma planes). In 2-pass mode the buffers are
// sized for the whole frame (per superblock); otherwise a single working set
// is enough. Returns 0 or AVERROR(ENOMEM).
320 static int update_block_buffers(AVCodecContext *ctx)
322 VP9Context *s = ctx->priv_data;
323 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
// fast path: buffers exist and the 2-pass mode did not change
325 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
329 av_free(s->block_base);
// chroma sizes shrink with subsampling (ss_h/ss_v are 0 or 1 each)
330 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
331 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
332 if (s->s.frames[CUR_FRAME].uses_2pass) {
333 int sbs = s->sb_cols * s->sb_rows;
// whole-frame allocation: one VP9Block per 8x8 block, coef+eob per superblock
335 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
336 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
337 16 * 16 + 2 * chroma_eobs) * sbs);
338 if (!s->b_base || !s->block_base)
339 return AVERROR(ENOMEM);
// carve Y coefs, U/V coefs, Y eobs, U/V eobs out of block_base in order
340 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
341 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
342 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
343 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
344 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
// single-pass: one superblock's worth of working buffers
346 s->b_base = av_malloc(sizeof(VP9Block));
347 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
348 16 * 16 + 2 * chroma_eobs);
349 if (!s->b_base || !s->block_base)
350 return AVERROR(ENOMEM);
351 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
352 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
353 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
354 s->uveob_base[0] = s->eob_base + 16 * 16;
355 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
// remember which layout was used so the fast path above stays valid
357 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
362 // for some reason the sign bit is at the end, not the start, of a bit sequence
363 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
365 int v = get_bits(gb, n);
366 return get_bits1(gb) ? -v : v;
/**
 * Inverse of the "recenter" mapping used by the VP9 subexponential
 * probability-update code: map a non-negative folded offset v back to an
 * absolute value around center m. Values beyond 2*m pass through unchanged;
 * odd offsets land below the center, even offsets above it.
 */
static inline int inv_recenter_nonneg(int v, int m)
{
    if (v > 2 * m)
        return v;
    if (v & 1)
        return m - ((v + 1) >> 1);
    return m + (v >> 1);
}
374 // differential forward probability updates
// Decode a subexponentially-coded probability delta from the range coder and
// apply it to the current probability p (in [1,255]), returning the updated
// probability. The VLC below yields an index d into inv_map_table[], which
// un-permutes the delta before inv_recenter_nonneg() re-centers it around p.
375 static int update_prob(VP56RangeCoder *c, int p)
377 static const int inv_map_table[255] = {
378 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
379 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
380 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
381 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
382 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
383 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
384 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
385 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
386 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
387 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
388 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
389 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
390 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
391 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
392 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
393 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
394 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
395 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
400 /* This code is trying to do a differential probability update. For a
401 * current probability A in the range [1, 255], the difference to a new
402 * probability of any value can be expressed differentially as 1-A,255-A
403 * where some part of this (absolute range) exists both in positive as
404 * well as the negative part, whereas another part only exists in one
405 * half. We're trying to code this shared part differentially, i.e.
406 * times two where the value of the lowest bit specifies the sign, and
407 * the single part is then coded on top of this. This absolute difference
408 * then again has a value of [0,254], but a bigger value in this range
409 * indicates that we're further away from the original value A, so we
410 * can code this as a VLC code, since higher values are increasingly
411 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
412 * updates vs. the 'fine, exact' updates further down the range, which
413 * adds one extra dimension to this differential update model. */
// 4-tier VLC: 4-bit, 4-bit (+16), 5-bit (+32), then a 7-bit escape that is
// doubled and sign-extended with one extra bit
415 if (!vp8_rac_get(c)) {
416 d = vp8_rac_get_uint(c, 4) + 0;
417 } else if (!vp8_rac_get(c)) {
418 d = vp8_rac_get_uint(c, 4) + 16;
419 } else if (!vp8_rac_get(c)) {
420 d = vp8_rac_get_uint(c, 5) + 32;
422 d = vp8_rac_get_uint(c, 7);
424 d = (d << 1) - 65 + vp8_rac_get(c);
426 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
// mirror around 128 so the same table serves both halves of the range
429 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
430 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse the colorspace/bit-depth/subsampling portion of the frame header and
// derive s->bpp, s->bytesperpixel, s->ss_h/ss_v and s->pix_fmt. Returns 0 or
// AVERROR_INVALIDDATA on reserved-bit / unsupported combinations.
433 static int read_colorspace_details(AVCodecContext *ctx)
435 static const enum AVColorSpace colorspaces[8] = {
436 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
437 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
439 VP9Context *s = ctx->priv_data;
// profiles 0/1 are always 8-bit; profiles 2/3 signal 10 vs 12 bit here
440 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
443 s->bpp = 8 + bits * 2;
444 s->bytesperpixel = (7 + s->bpp) >> 3;
445 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
446 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
447 static const enum AVPixelFormat pix_fmt_rgb[3] = {
448 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
// RGB is always 4:4:4 full-range
450 s->ss_h = s->ss_v = 0;
451 ctx->color_range = AVCOL_RANGE_JPEG;
452 s->pix_fmt = pix_fmt_rgb[bits];
453 if (ctx->profile & 1) {
454 if (get_bits1(&s->gb)) {
455 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
456 return AVERROR_INVALIDDATA;
459 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
461 return AVERROR_INVALIDDATA;
// YUV: pick the pixel format from bit depth + subsampling
464 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
465 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
466 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
467 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
468 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
469 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
470 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
472 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
// odd profiles (1/3) carry explicit subsampling bits; 4:2:0 is excluded there
473 if (ctx->profile & 1) {
474 s->ss_h = get_bits1(&s->gb);
475 s->ss_v = get_bits1(&s->gb);
476 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
477 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
478 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
480 return AVERROR_INVALIDDATA;
481 } else if (get_bits1(&s->gb)) {
482 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
484 return AVERROR_INVALIDDATA;
// even profiles (0/2) are implicitly 4:2:0
487 s->ss_h = s->ss_v = 1;
488 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
// Parse a complete VP9 frame header: the raw-bit "uncompressed" part
// (frame marker, profile, frame type, sizes, references, loopfilter,
// quantization, segmentation, tiling) followed by the range-coded
// "compressed" part (probability updates). On success returns the total
// header size in bytes (offset where tile data starts); on error a negative
// AVERROR. For show-existing-frame packets, *ref is set and the function
// returns early (path not fully visible in this chunk).
495 static int decode_frame_header(AVCodecContext *ctx,
496 const uint8_t *data, int size, int *ref)
498 VP9Context *s = ctx->priv_data;
499 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
501 const uint8_t *data2;
// ---- uncompressed header (plain bit reader) ----
504 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
505 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
508 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
509 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
510 return AVERROR_INVALIDDATA;
// profile is 2 bits, plus a third bit only when the first two read 3
512 ctx->profile = get_bits1(&s->gb);
513 ctx->profile |= get_bits1(&s->gb) << 1;
514 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
515 if (ctx->profile > 3) {
516 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
517 return AVERROR_INVALIDDATA;
519 s->s.h.profile = ctx->profile;
// show-existing-frame: just output a previously decoded reference
520 if (get_bits1(&s->gb)) {
521 *ref = get_bits(&s->gb, 3);
524 s->last_keyframe = s->s.h.keyframe;
525 s->s.h.keyframe = !get_bits1(&s->gb);
526 last_invisible = s->s.h.invisible;
527 s->s.h.invisible = !get_bits1(&s->gb);
528 s->s.h.errorres = get_bits1(&s->gb);
529 s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
530 if (s->s.h.keyframe) {
531 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
532 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
533 return AVERROR_INVALIDDATA;
535 if ((res = read_colorspace_details(ctx)) < 0)
537 // for profile 1, here follows the subsampling bits
// keyframes refresh every reference slot
538 s->s.h.refreshrefmask = 0xff;
539 w = get_bits(&s->gb, 16) + 1;
540 h = get_bits(&s->gb, 16) + 1;
541 if (get_bits1(&s->gb)) // display size
542 skip_bits(&s->gb, 32);
// ---- inter (or intra-only) frame path ----
544 s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
545 s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
546 if (s->s.h.intraonly) {
547 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
548 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
549 return AVERROR_INVALIDDATA;
551 if (ctx->profile >= 1) {
552 if ((res = read_colorspace_details(ctx)) < 0)
// profile 0 intra-only frames are fixed 8-bit 4:2:0 BT.470BG full-range
555 s->ss_h = s->ss_v = 1;
558 s->bytesperpixel = 1;
559 s->pix_fmt = AV_PIX_FMT_YUV420P;
560 ctx->colorspace = AVCOL_SPC_BT470BG;
561 ctx->color_range = AVCOL_RANGE_JPEG;
563 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
564 w = get_bits(&s->gb, 16) + 1;
565 h = get_bits(&s->gb, 16) + 1;
566 if (get_bits1(&s->gb)) // display size
567 skip_bits(&s->gb, 32);
// regular inter frame: 3 active references with per-ref sign bias
569 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
570 s->s.h.refidx[0] = get_bits(&s->gb, 3);
571 s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
572 s->s.h.refidx[1] = get_bits(&s->gb, 3);
573 s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
574 s->s.h.refidx[2] = get_bits(&s->gb, 3);
575 s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
576 if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
577 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
578 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
579 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
580 return AVERROR_INVALIDDATA;
// frame size: either copied from one of the references or coded explicitly
582 if (get_bits1(&s->gb)) {
583 w = s->s.refs[s->s.h.refidx[0]].f->width;
584 h = s->s.refs[s->s.h.refidx[0]].f->height;
585 } else if (get_bits1(&s->gb)) {
586 w = s->s.refs[s->s.h.refidx[1]].f->width;
587 h = s->s.refs[s->s.h.refidx[1]].f->height;
588 } else if (get_bits1(&s->gb)) {
589 w = s->s.refs[s->s.h.refidx[2]].f->width;
590 h = s->s.refs[s->s.h.refidx[2]].f->height;
592 w = get_bits(&s->gb, 16) + 1;
593 h = get_bits(&s->gb, 16) + 1;
595 // Note that in this code, "CUR_FRAME" is actually before we
596 // have formally allocated a frame, and thus actually represents
// last-frame MVs are only reusable if the size did not change
598 s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
599 s->s.frames[CUR_FRAME].tf.f->height == h;
600 if (get_bits1(&s->gb)) // display size
601 skip_bits(&s->gb, 32);
602 s->s.h.highprecisionmvs = get_bits1(&s->gb);
603 s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction requires at least one ref with opposite sign bias;
// the odd-one-out becomes the fixed ref, the other two the variable refs
605 s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
606 s->s.h.signbias[0] != s->s.h.signbias[2];
607 if (s->s.h.allowcompinter) {
608 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
609 s->s.h.fixcompref = 2;
610 s->s.h.varcompref[0] = 0;
611 s->s.h.varcompref[1] = 1;
612 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
613 s->s.h.fixcompref = 1;
614 s->s.h.varcompref[0] = 0;
615 s->s.h.varcompref[1] = 2;
617 s->s.h.fixcompref = 0;
618 s->s.h.varcompref[0] = 1;
619 s->s.h.varcompref[1] = 2;
624 s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
625 s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
626 s->s.h.framectxid = c = get_bits(&s->gb, 2);
628 /* loopfilter header data */
629 if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
630 // reset loopfilter defaults
631 s->s.h.lf_delta.ref[0] = 1;
632 s->s.h.lf_delta.ref[1] = 0;
633 s->s.h.lf_delta.ref[2] = -1;
634 s->s.h.lf_delta.ref[3] = -1;
635 s->s.h.lf_delta.mode[0] = 0;
636 s->s.h.lf_delta.mode[1] = 0;
637 memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
639 s->s.h.filter.level = get_bits(&s->gb, 6);
640 sharp = get_bits(&s->gb, 3);
641 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
642 // the old cache values since they are still valid
643 if (s->s.h.filter.sharpness != sharp)
644 memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
645 s->s.h.filter.sharpness = sharp;
646 if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
647 if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
648 for (i = 0; i < 4; i++)
649 if (get_bits1(&s->gb))
650 s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
651 for (i = 0; i < 2; i++)
652 if (get_bits1(&s->gb))
653 s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
657 /* quantization header data */
658 s->s.h.yac_qi = get_bits(&s->gb, 8);
659 s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
660 s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
661 s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
662 s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
663 s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
665 ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
667 /* segmentation header info */
668 if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
669 if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
670 for (i = 0; i < 7; i++)
671 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
672 get_bits(&s->gb, 8) : 255;
673 if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
674 for (i = 0; i < 3; i++)
675 s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
676 get_bits(&s->gb, 8) : 255;
// per-segment feature data (quantizer, loopfilter, ref, skip)
680 if (get_bits1(&s->gb)) {
681 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
682 for (i = 0; i < 8; i++) {
683 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
684 s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
685 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
686 s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
687 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
688 s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
689 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
694 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
695 for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
696 int qyac, qydc, quvac, quvdc, lflvl, sh;
698 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
699 if (s->s.h.segmentation.absolute_vals)
700 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
702 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
704 qyac = s->s.h.yac_qi;
706 qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
707 quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
708 quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
709 qyac = av_clip_uintp2(qyac, 8);
711 s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
712 s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
713 s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
714 s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
// per-segment loopfilter levels, with optional ref/mode deltas; sh scales
// the deltas when the base filter level is high
716 sh = s->s.h.filter.level >= 32;
717 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
718 if (s->s.h.segmentation.absolute_vals)
719 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
721 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
723 lflvl = s->s.h.filter.level;
725 if (s->s.h.lf_delta.enabled) {
726 s->s.h.segmentation.feat[i].lflvl[0][0] =
727 s->s.h.segmentation.feat[i].lflvl[0][1] =
728 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] << sh), 6);
729 for (j = 1; j < 4; j++) {
730 s->s.h.segmentation.feat[i].lflvl[j][0] =
731 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
732 s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
733 s->s.h.segmentation.feat[i].lflvl[j][1] =
734 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
735 s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
738 memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
739 sizeof(s->s.h.segmentation.feat[i].lflvl));
// allocate/resize per-frame buffers now that w/h are known
744 if ((res = update_size(ctx, w, h)) < 0) {
745 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
// tiling: log2_tile_cols is bounded below by the minimum implied by sb_cols
// and raised bit-by-bit up to the maximum allowed
749 for (s->s.h.tiling.log2_tile_cols = 0;
750 s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
751 s->s.h.tiling.log2_tile_cols++) ;
752 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
753 max = FFMAX(0, max - 1);
754 while (max > s->s.h.tiling.log2_tile_cols) {
755 if (get_bits1(&s->gb))
756 s->s.h.tiling.log2_tile_cols++;
760 s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
761 s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
762 if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
763 s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
// one range coder per tile column
764 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
765 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
767 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
768 return AVERROR(ENOMEM);
772 /* check reference frames */
773 if (!s->s.h.keyframe && !s->s.h.intraonly) {
774 for (i = 0; i < 3; i++) {
775 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
776 int refw = ref->width, refh = ref->height;
778 if (ref->format != ctx->pix_fmt) {
779 av_log(ctx, AV_LOG_ERROR,
780 "Ref pixfmt (%s) did not match current frame (%s)",
781 av_get_pix_fmt_name(ref->format),
782 av_get_pix_fmt_name(ctx->pix_fmt));
783 return AVERROR_INVALIDDATA;
784 } else if (refw == w && refh == h) {
785 s->mvscale[i][0] = s->mvscale[i][1] = 0;
// scaled reference: spec limits scaling to [1/2, 16x] in each dimension
787 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
788 av_log(ctx, AV_LOG_ERROR,
789 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
791 return AVERROR_INVALIDDATA;
// 14-bit fixed-point scale factors and the per-16px step they imply
793 s->mvscale[i][0] = (refw << 14) / w;
794 s->mvscale[i][1] = (refh << 14) / h;
795 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
796 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
// reset probability contexts to defaults where the spec requires it
801 if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
802 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
803 s->prob_ctx[3].p = vp9_default_probs;
804 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
805 sizeof(vp9_default_coef_probs));
806 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
807 sizeof(vp9_default_coef_probs));
808 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
809 sizeof(vp9_default_coef_probs));
810 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
811 sizeof(vp9_default_coef_probs));
812 } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
813 s->prob_ctx[c].p = vp9_default_probs;
814 memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
815 sizeof(vp9_default_coef_probs));
818 // next 16 bits is size of the rest of the header (arith-coded)
819 s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
820 s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
822 data2 = align_get_bits(&s->gb);
823 if (size2 > size - (data2 - data)) {
824 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
825 return AVERROR_INVALIDDATA;
// ---- compressed header (range coder) ----
827 ff_vp56_init_range_decoder(&s->c, data2, size2);
828 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
829 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
830 return AVERROR_INVALIDDATA;
// reset adaptation counters (all of them for inter frames)
833 if (s->s.h.keyframe || s->s.h.intraonly) {
834 memset(s->counts.coef, 0, sizeof(s->counts.coef));
835 memset(s->counts.eob, 0, sizeof(s->counts.eob));
837 memset(&s->counts, 0, sizeof(s->counts));
839 // FIXME is it faster to not copy here, but do it down in the fw updates
840 // as explicit copies if the fw update is missing (and skip the copy upon
842 s->prob.p = s->prob_ctx[c].p;
// transform mode
845 if (s->s.h.lossless) {
846 s->s.h.txfmmode = TX_4X4;
848 s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
849 if (s->s.h.txfmmode == 3)
850 s->s.h.txfmmode += vp8_rac_get(&s->c);
852 if (s->s.h.txfmmode == TX_SWITCHABLE) {
853 for (i = 0; i < 2; i++)
854 if (vp56_rac_get_prob_branchy(&s->c, 252))
855 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
856 for (i = 0; i < 2; i++)
857 for (j = 0; j < 2; j++)
858 if (vp56_rac_get_prob_branchy(&s->c, 252))
859 s->prob.p.tx16p[i][j] =
860 update_prob(&s->c, s->prob.p.tx16p[i][j]);
861 for (i = 0; i < 2; i++)
862 for (j = 0; j < 3; j++)
863 if (vp56_rac_get_prob_branchy(&s->c, 252))
864 s->prob.p.tx32p[i][j] =
865 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// coefficient probability updates, per transform size
870 for (i = 0; i < 4; i++) {
871 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
872 if (vp8_rac_get(&s->c)) {
873 for (j = 0; j < 2; j++)
874 for (k = 0; k < 2; k++)
875 for (l = 0; l < 6; l++)
876 for (m = 0; m < 6; m++) {
877 uint8_t *p = s->prob.coef[i][j][k][l][m];
878 uint8_t *r = ref[j][k][l][m];
879 if (m >= 3 && l == 0) // dc only has 3 pt
881 for (n = 0; n < 3; n++) {
882 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
883 p[n] = update_prob(&s->c, r[n]);
// no update for this tx size: copy the reference probabilities
891 for (j = 0; j < 2; j++)
892 for (k = 0; k < 2; k++)
893 for (l = 0; l < 6; l++)
894 for (m = 0; m < 6; m++) {
895 uint8_t *p = s->prob.coef[i][j][k][l][m];
896 uint8_t *r = ref[j][k][l][m];
897 if (m > 3 && l == 0) // dc only has 3 pt
903 if (s->s.h.txfmmode == i)
// skip-flag probabilities
908 for (i = 0; i < 3; i++)
909 if (vp56_rac_get_prob_branchy(&s->c, 252))
910 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
// the remaining updates only apply to inter frames
911 if (!s->s.h.keyframe && !s->s.h.intraonly) {
912 for (i = 0; i < 7; i++)
913 for (j = 0; j < 3; j++)
914 if (vp56_rac_get_prob_branchy(&s->c, 252))
915 s->prob.p.mv_mode[i][j] =
916 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
918 if (s->s.h.filtermode == FILTER_SWITCHABLE)
919 for (i = 0; i < 4; i++)
920 for (j = 0; j < 2; j++)
921 if (vp56_rac_get_prob_branchy(&s->c, 252))
922 s->prob.p.filter[i][j] =
923 update_prob(&s->c, s->prob.p.filter[i][j]);
925 for (i = 0; i < 4; i++)
926 if (vp56_rac_get_prob_branchy(&s->c, 252))
927 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
929 if (s->s.h.allowcompinter) {
930 s->s.h.comppredmode = vp8_rac_get(&s->c);
931 if (s->s.h.comppredmode)
932 s->s.h.comppredmode += vp8_rac_get(&s->c);
933 if (s->s.h.comppredmode == PRED_SWITCHABLE)
934 for (i = 0; i < 5; i++)
935 if (vp56_rac_get_prob_branchy(&s->c, 252))
937 update_prob(&s->c, s->prob.p.comp[i]);
939 s->s.h.comppredmode = PRED_SINGLEREF;
942 if (s->s.h.comppredmode != PRED_COMPREF) {
943 for (i = 0; i < 5; i++) {
944 if (vp56_rac_get_prob_branchy(&s->c, 252))
945 s->prob.p.single_ref[i][0] =
946 update_prob(&s->c, s->prob.p.single_ref[i][0]);
947 if (vp56_rac_get_prob_branchy(&s->c, 252))
948 s->prob.p.single_ref[i][1] =
949 update_prob(&s->c, s->prob.p.single_ref[i][1]);
953 if (s->s.h.comppredmode != PRED_SINGLEREF) {
954 for (i = 0; i < 5; i++)
955 if (vp56_rac_get_prob_branchy(&s->c, 252))
956 s->prob.p.comp_ref[i] =
957 update_prob(&s->c, s->prob.p.comp_ref[i]);
960 for (i = 0; i < 4; i++)
961 for (j = 0; j < 9; j++)
962 if (vp56_rac_get_prob_branchy(&s->c, 252))
963 s->prob.p.y_mode[i][j] =
964 update_prob(&s->c, s->prob.p.y_mode[i][j]);
966 for (i = 0; i < 4; i++)
967 for (j = 0; j < 4; j++)
968 for (k = 0; k < 3; k++)
969 if (vp56_rac_get_prob_branchy(&s->c, 252))
970 s->prob.p.partition[3 - i][j][k] =
971 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
973 // mv fields don't use the update_prob subexp model for some reason
974 for (i = 0; i < 3; i++)
975 if (vp56_rac_get_prob_branchy(&s->c, 252))
976 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
978 for (i = 0; i < 2; i++) {
979 if (vp56_rac_get_prob_branchy(&s->c, 252))
980 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
982 for (j = 0; j < 10; j++)
983 if (vp56_rac_get_prob_branchy(&s->c, 252))
984 s->prob.p.mv_comp[i].classes[j] =
985 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
987 if (vp56_rac_get_prob_branchy(&s->c, 252))
988 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
990 for (j = 0; j < 10; j++)
991 if (vp56_rac_get_prob_branchy(&s->c, 252))
992 s->prob.p.mv_comp[i].bits[j] =
993 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
996 for (i = 0; i < 2; i++) {
997 for (j = 0; j < 2; j++)
998 for (k = 0; k < 3; k++)
999 if (vp56_rac_get_prob_branchy(&s->c, 252))
1000 s->prob.p.mv_comp[i].class0_fp[j][k] =
1001 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1003 for (j = 0; j < 3; j++)
1004 if (vp56_rac_get_prob_branchy(&s->c, 252))
1005 s->prob.p.mv_comp[i].fp[j] =
1006 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1009 if (s->s.h.highprecisionmvs) {
1010 for (i = 0; i < 2; i++) {
1011 if (vp56_rac_get_prob_branchy(&s->c, 252))
1012 s->prob.p.mv_comp[i].class0_hp =
1013 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1015 if (vp56_rac_get_prob_branchy(&s->c, 252))
1016 s->prob.p.mv_comp[i].hp =
1017 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total header size in bytes = uncompressed part + compressed part
1022 return (data2 - data) + size2;
1025 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1028 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1029 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/**
 * Build the motion-vector prediction (pmv) for reference @p ref of the
 * current block, scanning — in priority order — sub-block MVs already
 * decoded in this block (sb >= 0), spatial neighbours in the current
 * frame, the co-located MV in the previous frame, and finally neighbours
 * using a *different* reference (sign-flipped when the reference sign
 * biases differ).
 *
 * @param pmv  output: predicted MV (clamped to the legal window)
 * @param ref  reference-frame index being predicted
 * @param z    which of the (up to) two MVs of a neighbour pair to read
 * @param idx  0 = nearest candidate, 1 = second ("near") candidate
 * @param sb   sub-block index within an 8x8 (-1/0..3); selects the
 *             per-block-size neighbour offset table behaviour
 */
static void find_ref_mvs(VP9Context *s,
                         VP56mv *pmv, int ref, int z, int idx, int sb)
{
    /* Per-block-size list of (col,row) offsets of the neighbours to probe,
     * in decreasing priority order. */
    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
        [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
                      { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
        [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
        [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
                      { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
        [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
                      { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
                      {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
        [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
                      {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
        [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
                      { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
        [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
    int row = s->row, col = s->col, row7 = s->row7;
    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
    /* Sentinel that cannot occur as a real packed MV. */
#define INVALID_MV 0x80008000U
    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;

    /* Return an already-decoded sub-block MV verbatim (no clamping);
     * for idx==1 remember the first candidate and only return a second,
     * different one. */
#define RETURN_DIRECT_MV(mv) \
        uint32_t m = AV_RN32A(&mv); \
        } else if (mem == INVALID_MV) { \
        } else if (m != mem) { \

    /* sub-block candidates within the current 8x8 block */
    if (sb == 2 || sb == 1) {
        RETURN_DIRECT_MV(b->mv[0][z]);
    } else if (sb == 3) {
        RETURN_DIRECT_MV(b->mv[2][z]);
        RETURN_DIRECT_MV(b->mv[1][z]);
        RETURN_DIRECT_MV(b->mv[0][z]);

    /* Return a clamped neighbour MV; for the sub8x8 second-candidate case
     * the comparison is done on the *unclamped* value (mem_sub8x8),
     * mirroring libvpx behaviour. */
#define RETURN_MV(mv) \
            av_assert2(idx == 1); \
            av_assert2(mem != INVALID_MV); \
            if (mem_sub8x8 == INVALID_MV) { \
                clamp_mv(&tmp, &mv, s); \
                m = AV_RN32A(&tmp); \
                mem_sub8x8 = AV_RN32A(&mv); \
            } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
                clamp_mv(&tmp, &mv, s); \
                m = AV_RN32A(&tmp); \
                    /* BUG I'm pretty sure this isn't the intention */ \
            uint32_t m = AV_RN32A(&mv); \
                clamp_mv(pmv, &mv, s); \
            } else if (mem == INVALID_MV) { \
            } else if (m != mem) { \
                clamp_mv(pmv, &mv, s); \

        /* immediate above/left neighbours, same reference frame */
        struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
        if (mv->ref[0] == ref) {
            RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
    if (col > s->tile_col_start) {
        struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
        if (mv->ref[0] == ref) {
            RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);

    // previously coded MVs in this neighbourhood, using same reference frame
    for (; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;
        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
            if (mv->ref[0] == ref) {
                RETURN_MV(mv->mv[0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(mv->mv[1]);

    // MV at this position in previous frame, using same reference frame
    if (s->s.h.use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
        /* frame-threading: wait until the reference row has been decoded */
        if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
            ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
        if (mv->ref[0] == ref) {
            RETURN_MV(mv->mv[0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(mv->mv[1]);

    /* As RETURN_MV, but optionally negate the MV first when the two
     * references have opposite sign bias. */
#define RETURN_SCALE_MV(mv, scale) \
            VP56mv mv_temp = { -mv.x, -mv.y }; \
            RETURN_MV(mv_temp); \

    // previously coded MVs in this neighbourhood, using different reference frame
    for (i = 0; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;
        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
                RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                // BUG - libvpx has this condition regardless of whether
                // we used the first ref MV and pre-scaling
                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
                RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);

    // MV at this position in previous frame, using different reference frame
    if (s->s.h.use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
        // no need to await_progress, because we already did that above
        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
            RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
            // BUG - libvpx has this condition regardless of whether
            // we used the first ref MV and pre-scaling
            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
            RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);

    /* no second distinct candidate found: clamp whatever is in pmv */
    clamp_mv(pmv, pmv, s);
#undef RETURN_SCALE_MV
/**
 * Decode one MV component (row when idx==0, column when idx==1) from the
 * range coder and update the per-frame entropy counts used for backward
 * probability adaptation.
 *
 * @param idx  0 = vertical component, 1 = horizontal component
 * @param hp   nonzero when the high-precision (1/8-pel) bit is coded
 * @return the signed component delta, always nonzero (magnitude n+1)
 */
static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
{
    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
    /* magnitude class selects how many raw bits follow */
    int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
                                s->prob.p.mv_comp[idx].classes);

    s->counts.mv_comp[idx].sign[sign]++;
    s->counts.mv_comp[idx].classes[c]++;
        /* class > 0: c integer bits, then 2 fractional bits, then hp bit */
        for (n = 0, m = 0; m < c; m++) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
            s->counts.mv_comp[idx].bits[m][bit]++;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
        s->counts.mv_comp[idx].fp[bit]++;
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
            s->counts.mv_comp[idx].hp[bit]++;
            // bug in libvpx - we count for bw entropy purposes even if the
            // hp bit was not actually coded
            s->counts.mv_comp[idx].hp[1]++;
        /* class 0: single bit + 2 fractional bits (+ optional hp bit) */
        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
        s->counts.mv_comp[idx].class0[n]++;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
                               s->prob.p.mv_comp[idx].class0_fp[n]);
        s->counts.mv_comp[idx].class0_fp[n][bit]++;
        n = (n << 3) | (bit << 1);
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
            s->counts.mv_comp[idx].class0_hp[bit]++;
            // bug in libvpx - we count for bw entropy purposes even if the
            // class0_hp bit was not actually coded
            s->counts.mv_comp[idx].class0_hp[1]++;

    return sign ? -(n + 1) : (n + 1);
/**
 * Produce the final motion vector(s) for a (sub-)block: predict via
 * find_ref_mvs(), optionally truncate precision when high-precision MVs
 * are disallowed for large vectors, and for NEWMV read the coded residual
 * from the bitstream.
 *
 * @param mv    output array: mv[0] for ref[0], mv[1] for ref[1] (if comp)
 * @param mode  inter mode (ZEROMV/NEARESTMV/NEARMV/NEWMV)
 * @param sb    sub-block index, or -1 for whole-block
 */
static void fill_mv(VP9Context *s,
                    VP56mv *mv, int mode, int sb)
{
    if (mode == ZEROMV) {
        // FIXME cache this value and reuse for other subblocks
        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
                     mode == NEWMV ? -1 : sb);
        // FIXME maybe move this code into find_ref_mvs()
        /* drop the 1/8-pel bit when hp is off or the vector is large */
        if ((mode == NEWMV || sb == -1) &&
            !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
        if (mode == NEWMV) {
            /* joint decides which components carry a residual */
            enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                             s->prob.p.mv_joint);
            s->counts.mv_joint[j]++;
            if (j >= MV_JOINT_V)
                mv[0].y += read_mv_component(s, 0, hp);
                mv[0].x += read_mv_component(s, 1, hp);

        /* second reference (compound prediction) — same procedure */
            // FIXME cache this value and reuse for other subblocks
            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
                         mode == NEWMV ? -1 : sb);
            if ((mode == NEWMV || sb == -1) &&
                !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
            if (mode == NEWMV) {
                enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                                  s->prob.p.mv_joint);
                s->counts.mv_joint[j]++;
                if (j >= MV_JOINT_V)
                    mv[1].y += read_mv_component(s, 0, hp);
                    mv[1].x += read_mv_component(s, 1, hp);
/**
 * Fill a w x h rectangle of bytes at @p ptr (row stride @p stride) with
 * the value @p v, using the widest store available for each width.
 */
static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
                                       ptrdiff_t stride, int v)
{
        /* w == 2: replicate v into both bytes */
        int v16 = v * 0x0101;
        /* w == 4 */
        uint32_t v32 = v * 0x01010101;
        /* w == 8 (64-bit store path) */
        uint64_t v64 = v * 0x0101010101010101ULL;
        /* w == 8 fallback: two 32-bit stores */
        uint32_t v32 = v * 0x01010101;
            AV_WN32A(ptr + 4, v32);
/**
 * Decode the per-block mode information for the current block: segment id,
 * skip flag, intra/inter decision, transform size, prediction modes,
 * reference frames, interpolation filter and motion vectors; then splat
 * the decoded values into the above/left context arrays and the per-frame
 * MV buffer used by future blocks and the next frame.
 *
 * Reads from the range coder in a fixed order — do not reorder any of the
 * vp56_rac_*/vp8_rac_* calls below.
 */
static void decode_mode(AVCodecContext *ctx)
{
    /* partition context bitmasks per block size (left/above) */
    static const uint8_t left_ctx[N_BS_SIZES] = {
        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
    static const uint8_t above_ctx[N_BS_SIZES] = {
        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
    /* largest transform size allowed for each block size */
    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
    VP9Context *s = ctx->priv_data;
    int row = s->row, col = s->col, row7 = s->row7;
    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
    /* w4/h4: block size in 8x8 units, clipped to the frame edge */
    int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
    int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
    int have_a = row > 0, have_l = col > s->tile_col_start;
    int vref, filter_id;

    /* ---- segment id ---- */
    if (!s->s.h.segmentation.enabled) {
    } else if (s->s.h.keyframe || s->s.h.intraonly) {
        b->seg_id = !s->s.h.segmentation.update_map ? 0 :
                    vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
    } else if (!s->s.h.segmentation.update_map ||
               (s->s.h.segmentation.temporal &&
                vp56_rac_get_prob_branchy(&s->c,
                    s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
                                                  s->left_segpred_ctx[row7]]))) {
        /* temporally predicted seg id: take the minimum over the co-located
         * area of the reference segmentation map */
        if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
            uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
            /* frame-threading: make sure the ref rows are decoded */
            if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
                ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
            for (y = 0; y < h4; y++) {
                int idx_base = (y + row) * 8 * s->sb_cols + col;
                for (x = 0; x < w4; x++)
                    pred = FFMIN(pred, refsegmap[idx_base + x]);
            av_assert1(pred < 8);
        memset(&s->above_segpred_ctx[col], 1, w4);
        memset(&s->left_segpred_ctx[row7], 1, h4);
        /* explicitly coded seg id */
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
                                     s->s.h.segmentation.prob);
        memset(&s->above_segpred_ctx[col], 0, w4);
        memset(&s->left_segpred_ctx[row7], 0, h4);
    if (s->s.h.segmentation.enabled &&
        (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
        setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
                  bw4, bh4, 8 * s->sb_cols, b->seg_id);

    /* ---- skip flag ---- */
    b->skip = s->s.h.segmentation.enabled &&
        s->s.h.segmentation.feat[b->seg_id].skip_enabled;
        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
        s->counts.skip[c][b->skip]++;

    /* ---- intra/inter decision ---- */
    if (s->s.h.keyframe || s->s.h.intraonly) {
    } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
        b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
        if (have_a && have_l) {
            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
            c = have_a ? 2 * s->above_intra_ctx[col] :
                have_l ? 2 * s->left_intra_ctx[row7] : 0;
        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
        s->counts.intra[c][bit]++;

    /* ---- transform size ---- */
    if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
            /* context from neighbours' tx sizes; skipped neighbours count
             * as using max_tx */
            c = (s->above_skip_ctx[col] ? max_tx :
                 s->above_txfm_ctx[col]) +
                (s->left_skip_ctx[row7] ? max_tx :
                 s->left_txfm_ctx[row7]) > max_tx;
            c = s->above_skip_ctx[col] ? 1 :
                (s->above_txfm_ctx[col] * 2 > max_tx);
        } else if (have_l) {
            c = s->left_skip_ctx[row7] ? 1 :
                (s->left_txfm_ctx[row7] * 2 > max_tx);
            /* up to three bits depending on max_tx */
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
            s->counts.tx32p[c][b->tx]++;
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
            s->counts.tx16p[c][b->tx]++;
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
            s->counts.tx8p[c][b->tx]++;
        b->tx = FFMIN(max_tx, s->s.h.txfmmode);

    /* ---- intra modes on key/intra-only frames (fixed default probs) ---- */
    if (s->s.h.keyframe || s->s.h.intraonly) {
        uint8_t *a = &s->above_mode_ctx[col * 2];
        uint8_t *l = &s->left_mode_ctx[(row7) << 1];

        if (b->bs > BS_8x8) {
            // FIXME the memory storage intermediates here aren't really
            // necessary, they're just there to make the code slightly
            // simpler
            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                 vp9_default_kf_ymode_probs[a[0]][l[0]]);
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                l[0] = a[1] = b->mode[1];
                l[0] = a[1] = b->mode[1] = b->mode[0];
            if (b->bs != BS_4x8) {
                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                     vp9_default_kf_ymode_probs[a[0]][l[1]]);
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                    l[1] = a[1] = b->mode[3];
                    l[1] = a[1] = b->mode[3] = b->mode[2];
                b->mode[2] = b->mode[0];
                l[1] = a[1] = b->mode[3] = b->mode[1];
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          vp9_default_kf_ymode_probs[*a][*l]);
            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
            // FIXME this can probably be optimized
            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_uvmode_probs[b->mode[3]]);

    /* ---- intra modes on inter frames (adaptive probs + counts) ---- */
    } else if (b->intra) {
        if (b->bs > BS_8x8) {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[0]);
            s->counts.y_mode[0][b->mode[0]]++;
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[1]]++;
                b->mode[1] = b->mode[0];
            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[2]]++;
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                  s->prob.p.y_mode[0]);
                    s->counts.y_mode[0][b->mode[3]]++;
                    b->mode[3] = b->mode[2];
                b->mode[2] = b->mode[0];
                b->mode[3] = b->mode[1];
            /* y_mode probability set is chosen by block-size group */
            static const uint8_t size_group[10] = {
                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
            int sz = size_group[b->bs];
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[sz]);
            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
            s->counts.y_mode[sz][b->mode[3]]++;
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     s->prob.p.uv_mode[b->mode[3]]);
        s->counts.uv_mode[b->mode[3]][b->uvmode]++;

    /* ---- inter block: references, modes, filter, MVs ---- */
        /* maps (above_mode, left_mode) -> inter-mode context; indices
         * 0..9 are intra modes, 10..13 the four inter modes */
        static const uint8_t inter_mode_ctx_lut[14][14] = {
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },

        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
            av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
            b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
            // read comp_pred flag
            if (s->s.h.comppredmode != PRED_SWITCHABLE) {
                b->comp = s->s.h.comppredmode == PRED_COMPREF;
                // FIXME add intra as ref=0xff (or -1) to make these easier?
                    if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
                    } else if (s->above_comp_ctx[col]) {
                        c = 2 + (s->left_intra_ctx[row7] ||
                                 s->left_ref_ctx[row7] == s->s.h.fixcompref);
                    } else if (s->left_comp_ctx[row7]) {
                        c = 2 + (s->above_intra_ctx[col] ||
                                 s->above_ref_ctx[col] == s->s.h.fixcompref);
                        c = (!s->above_intra_ctx[col] &&
                             s->above_ref_ctx[col] == s->s.h.fixcompref) ^
                            (!s->left_intra_ctx[row7] &&
                             /* NOTE(review): 'row & 7' — appears equivalent to
                              * row7 used everywhere else; confirm row7 == row & 7 */
                             s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
                    c = s->above_comp_ctx[col] ? 3 :
                        (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
                } else if (have_l) {
                    c = s->left_comp_ctx[row7] ? 3 :
                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
                s->counts.comp[c][b->comp]++;

            // read actual references
            // FIXME probably cache a few variables here to prevent repetitive
            // memory accesses below
            if (b->comp) /* two references */ {
                int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;

                b->ref[fix_idx] = s->s.h.fixcompref;
                // FIXME can this codeblob be replaced by some sort of LUT?
                    if (s->above_intra_ctx[col]) {
                        if (s->left_intra_ctx[row7]) {
                            c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                    } else if (s->left_intra_ctx[row7]) {
                        c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                        int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];

                        if (refl == refa && refa == s->s.h.varcompref[1]) {
                        } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
                            if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
                                (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
                                c = (refa == refl) ? 3 : 1;
                        } else if (!s->left_comp_ctx[row7]) {
                            if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
                                c = (refl == s->s.h.varcompref[1] &&
                                     refa != s->s.h.varcompref[1]) ? 2 : 4;
                        } else if (!s->above_comp_ctx[col]) {
                            if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
                                c = (refa == s->s.h.varcompref[1] &&
                                     refl != s->s.h.varcompref[1]) ? 2 : 4;
                            c = (refl == refa) ? 4 : 2;
                    if (s->above_intra_ctx[col]) {
                    } else if (s->above_comp_ctx[col]) {
                        c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                        c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                } else if (have_l) {
                    if (s->left_intra_ctx[row7]) {
                    } else if (s->left_comp_ctx[row7]) {
                        c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                        c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
                b->ref[var_idx] = s->s.h.varcompref[bit];
                s->counts.comp_ref[c][bit]++;
            } else /* single reference */ {
                /* first bit: LAST vs. GOLDEN/ALTREF */
                if (have_a && !s->above_intra_ctx[col]) {
                    if (have_l && !s->left_intra_ctx[row7]) {
                        if (s->left_comp_ctx[row7]) {
                            if (s->above_comp_ctx[col]) {
                                c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
                                         !s->above_ref_ctx[col]);
                                c = (3 * !s->above_ref_ctx[col]) +
                                    (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
                        } else if (s->above_comp_ctx[col]) {
                            c = (3 * !s->left_ref_ctx[row7]) +
                                (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                    /* NOTE(review): branch below looks unreachable (guard
                     * already requires !above_intra); kept from libvpx */
                    } else if (s->above_intra_ctx[col]) {
                    } else if (s->above_comp_ctx[col]) {
                        c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                        c = 4 * (!s->above_ref_ctx[col]);
                } else if (have_l && !s->left_intra_ctx[row7]) {
                    /* NOTE(review): inner left_intra test looks unreachable
                     * under the guard above; kept from libvpx */
                    if (s->left_intra_ctx[row7]) {
                    } else if (s->left_comp_ctx[row7]) {
                        c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
                        c = 4 * (!s->left_ref_ctx[row7]);
                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
                s->counts.single_ref[c][0][bit]++;
                    /* second bit: GOLDEN vs. ALTREF */
                    // FIXME can this codeblob be replaced by some sort of LUT?
                        if (s->left_intra_ctx[row7]) {
                            if (s->above_intra_ctx[col]) {
                            } else if (s->above_comp_ctx[col]) {
                                c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                             s->above_ref_ctx[col] == 1);
                            } else if (!s->above_ref_ctx[col]) {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                        } else if (s->above_intra_ctx[col]) {
                            if (s->left_intra_ctx[row7]) {
                            } else if (s->left_comp_ctx[row7]) {
                                c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                             s->left_ref_ctx[row7] == 1);
                            } else if (!s->left_ref_ctx[row7]) {
                                c = 4 * (s->left_ref_ctx[row7] == 1);
                        } else if (s->above_comp_ctx[col]) {
                            if (s->left_comp_ctx[row7]) {
                                if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
                                    c = 3 * (s->s.h.fixcompref == 1 ||
                                             s->left_ref_ctx[row7] == 1);
                            } else if (!s->left_ref_ctx[row7]) {
                                c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                             s->above_ref_ctx[col] == 1);
                                c = 3 * (s->left_ref_ctx[row7] == 1) +
                                    (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                        } else if (s->left_comp_ctx[row7]) {
                            if (!s->above_ref_ctx[col]) {
                                c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                             s->left_ref_ctx[row7] == 1);
                                c = 3 * (s->above_ref_ctx[col] == 1) +
                                    (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                        } else if (!s->above_ref_ctx[col]) {
                            if (!s->left_ref_ctx[row7]) {
                                c = 4 * (s->left_ref_ctx[row7] == 1);
                        } else if (!s->left_ref_ctx[row7]) {
                            c = 4 * (s->above_ref_ctx[col] == 1);
                            c = 2 * (s->left_ref_ctx[row7] == 1) +
                                2 * (s->above_ref_ctx[col] == 1);
                        if (s->above_intra_ctx[col] ||
                            (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                        } else if (s->above_comp_ctx[col]) {
                            c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                            c = 4 * (s->above_ref_ctx[col] == 1);
                    } else if (have_l) {
                        if (s->left_intra_ctx[row7] ||
                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
                        } else if (s->left_comp_ctx[row7]) {
                            c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                            c = 4 * (s->left_ref_ctx[row7] == 1);
                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
                    s->counts.single_ref[c][1][bit]++;
                    b->ref[0] = 1 + bit;

        /* ---- inter mode ---- */
        if (b->bs <= BS_8x8) {
            if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
                /* offset into the mode-context arrays per block size */
                static const uint8_t off[10] = {
                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0

                // FIXME this needs to use the LUT tables from find_ref_mvs
                // because not all are -1,0/0,-1
                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
                                          [s->left_mode_ctx[row7 + off[b->bs]]];

                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
                /* -10: inter modes start at NEARESTMV==10 in the enum */
                s->counts.mv_mode[c][b->mode[0] - 10]++;

        /* ---- interpolation filter ---- */
        if (s->s.h.filtermode == FILTER_SWITCHABLE) {
            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
                        s->left_filter_ctx[row7] : 3;
                    c = s->above_filter_ctx[col];
            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                c = s->left_filter_ctx[row7];
            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
                                         s->prob.p.filter[c]);
            s->counts.filter[c][filter_id]++;
            b->filter = vp9_filter_lut[filter_id];
            b->filter = s->s.h.filtermode;

        /* ---- motion vectors (per sub-block for sizes < 8x8) ---- */
        if (b->bs > BS_8x8) {
            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                          s->prob.p.mv_mode[c]);
            s->counts.mv_mode[c][b->mode[0] - 10]++;
            fill_mv(s, b->mv[0], b->mode[0], 0);

            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[1] - 10]++;
                fill_mv(s, b->mv[1], b->mode[1], 1);
                b->mode[1] = b->mode[0];
                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);

            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[2] - 10]++;
                fill_mv(s, b->mv[2], b->mode[2], 2);

                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                                  s->prob.p.mv_mode[c]);
                    s->counts.mv_mode[c][b->mode[3] - 10]++;
                    fill_mv(s, b->mv[3], b->mode[3], 3);
                    b->mode[3] = b->mode[2];
                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
                b->mode[2] = b->mode[0];
                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
                b->mode[3] = b->mode[1];
                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
            fill_mv(s, b->mv[0], b->mode[0], -1);
            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);

    vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];

    /* ---- splat decoded values into above/left context arrays ---- */
    /* 64-bit-store variant of the context splatter */
#define SPLAT_CTX(var, val, n) \
    case 1:  var = val;                                    break; \
    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
        uint64_t v64 = val * 0x0101010101010101ULL; \
        AV_WN64A(              &var,     v64); \
        AV_WN64A(&((uint8_t *) &var)[8], v64); \
    /* 32-bit-store fallback variant */
#define SPLAT_CTX(var, val, n) \
    case 1:  var = val;                         break; \
    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,     v32); \
        AV_WN32A(&((uint8_t *) &var)[4], v32); \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,      v32); \
        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
        AV_WN32A(&((uint8_t *) &var)[12], v32); \

    switch (bwh_tab[1][b->bs][0]) {
#define SET_CTXS(dir, off, n) \
        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
        if (!s->s.h.keyframe && !s->s.h.intraonly) { \
            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
                if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
    case 1: SET_CTXS(above, col, 1); break;
    case 2: SET_CTXS(above, col, 2); break;
    case 4: SET_CTXS(above, col, 4); break;
    case 8: SET_CTXS(above, col, 8); break;
    switch (bwh_tab[1][b->bs][1]) {
    case 1: SET_CTXS(left, row7, 1); break;
    case 2: SET_CTXS(left, row7, 2); break;
    case 4: SET_CTXS(left, row7, 4); break;
    case 8: SET_CTXS(left, row7, 8); break;

    /* ---- store MVs into the above/left MV context arrays ---- */
    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        if (b->bs > BS_8x8) {
            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            for (n = 0; n < w4 * 2; n++) {
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
            for (n = 0; n < h4 * 2; n++) {
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);

    /* ---- store refs/MVs into the per-frame MV buffer ---- */
    for (y = 0; y < h4; y++) {
        int x, o = (row + y) * s->sb_cols * 8 + col;
        struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];

            for (x = 0; x < w4; x++) {
        } else if (b->comp) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = b->ref[1];
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2110 // FIXME merge cnt/eob arguments?
/**
 * Decode one block of transform coefficients from the range coder.
 *
 * Template worker shared by the 8bpp/hbd and 4x4..16x16 vs. 32x32 entry
 * points (is8bitsperpixel / is_tx32x32 are compile-time constants at each
 * instantiation site). Fills @p coef in scan order, updates the entropy
 * counters (cnt/eob) used for backward adaptation, and adapts the token
 * probability model in-place (p[3..10] via the pareto table).
 *
 * @param nnz          nonzero context from neighbouring blocks
 * @param scan         zig-zag scan table mapping index -> raster coeff pos
 * @param nb           per-position neighbour pairs used for the nnz context
 * @param band_counts  number of coefficients per probability band
 * @param qmul         {DC quantizer, AC quantizer}
 * @return number of decoded coefficients (end-of-block position)
 */
static av_always_inline int
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, const int16_t *qmul)
{
    int i = 0, band = 0, band_left = band_counts[band];
    uint8_t *tp = p[0][nnz];
    /* per-position token magnitude cache, feeds the nnz context of
     * later coefficients via the nb[] neighbour table */
    uint8_t cache[1024];

        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
        eob[band][nnz][val]++;

        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
            cnt[band][nnz][0]++;
                band_left = band_counts[++band];
            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
            if (++i == n_coeffs)
                break;  //invalid input; blocks should end with EOB

        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
            cnt[band][nnz][1]++;
            // fill in p[3-10] (model fill) - only once per frame for each pos
                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
            cnt[band][nnz][2]++;
            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
                    cache[rc] = val = 2;
                    val = 3 + vp56_rac_get_prob(c, tp[5]);
            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                    val = 5 + vp56_rac_get_prob(c, 159);
                    val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
                    val +=      vp56_rac_get_prob(c, 145);
            } else { // cat 3-6
                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
                        val +=      (vp56_rac_get_prob(c, 148) << 1);
                        val +=       vp56_rac_get_prob(c, 140);
                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
                        val +=      (vp56_rac_get_prob(c, 155) << 2);
                        val +=      (vp56_rac_get_prob(c, 140) << 1);
                        val +=       vp56_rac_get_prob(c, 135);
                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                    val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
                    val +=      (vp56_rac_get_prob(c, 157) << 3);
                    val +=      (vp56_rac_get_prob(c, 141) << 2);
                    val +=      (vp56_rac_get_prob(c, 134) << 1);
                    val +=       vp56_rac_get_prob(c, 130);
                } else { // cat6: high-bitdepth streams carry two extra bits
                    if (!is8bitsperpixel) {
                            val += vp56_rac_get_prob(c, 255) << 17;
                        val += vp56_rac_get_prob(c, 255) << 16;
                        val += (vp56_rac_get_prob(c, 255) << 15);
                        val += (vp56_rac_get_prob(c, 255) << 14);
                    val += (vp56_rac_get_prob(c, 254) << 13);
                    val += (vp56_rac_get_prob(c, 254) << 12);
                    val += (vp56_rac_get_prob(c, 254) << 11);
                    val += (vp56_rac_get_prob(c, 252) << 10);
                    val += (vp56_rac_get_prob(c, 249) << 9);
                    val += (vp56_rac_get_prob(c, 243) << 8);
                    val += (vp56_rac_get_prob(c, 230) << 7);
                    val += (vp56_rac_get_prob(c, 196) << 6);
                    val += (vp56_rac_get_prob(c, 177) << 5);
                    val += (vp56_rac_get_prob(c, 153) << 4);
                    val += (vp56_rac_get_prob(c, 140) << 3);
                    val += (vp56_rac_get_prob(c, 133) << 2);
                    val += (vp56_rac_get_prob(c, 130) << 1);
                    val +=  vp56_rac_get_prob(c, 129);

        /* high-bitdepth coefficients are stored as 32-bit */
#define STORE_COEF(c, i, v) do { \
    if (is8bitsperpixel) { \
        AV_WN32A(&c[i * 2], v); \
            band_left = band_counts[++band];
        /* sign bit last; 32x32 blocks halve the dequantized value */
            STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
    } while (++i < n_coeffs);
// Thin wrapper: non-32x32 transform, 8 bits per pixel.
2235 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2236 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2237 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2238 const int16_t (*nb)[2], const int16_t *band_counts,
2239 const int16_t *qmul)
2241 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2242 nnz, scan, nb, band_counts, qmul);
// Thin wrapper: 32x32 transform (is_tx32x32 = 1), 8 bits per pixel.
2245 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2246 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2247 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2248 const int16_t (*nb)[2], const int16_t *band_counts,
2249 const int16_t *qmul)
2251 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2252 nnz, scan, nb, band_counts, qmul);
// Thin wrapper: non-32x32 transform, >8 bpp content (actual depth taken
// from s->bpp).
2255 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2256 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2257 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2258 const int16_t (*nb)[2], const int16_t *band_counts,
2259 const int16_t *qmul)
2261 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2262 nnz, scan, nb, band_counts, qmul);
// Thin wrapper: 32x32 transform, >8 bpp content.
2265 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2266 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2267 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2268 const int16_t (*nb)[2], const int16_t *band_counts,
2269 const int16_t *qmul)
2271 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2272 nnz, scan, nb, band_counts, qmul);
// Decode all residual coefficients for the current block (luma then both
// chroma planes), maintaining the above/left non-zero context arrays that
// feed the entropy contexts of subsequent blocks. Returns whether any
// plane produced coefficients (total_coeff) — return statement is on
// elided lines; TODO confirm against full source.
// NOTE(review): several source lines are elided in this excerpt.
2275 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2277 VP9Context *s = ctx->priv_data;
2279 int row = s->row, col = s->col;
// luma probability / counter tables, selected by tx size and intra/inter
2280 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2281 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2282 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2283 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
// clip the coefficient area to the visible frame edge (in 4px units)
2284 int end_x = FFMIN(2 * (s->cols - col), w4);
2285 int end_y = FFMIN(2 * (s->rows - row), h4);
2286 int n, pl, x, y, res;
2287 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2288 int tx = 4 * s->s.h.lossless + b->tx;
2289 const int16_t * const *yscans = vp9_scans[tx];
2290 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
// chroma always uses DCT_DCT scan order
2291 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2292 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2293 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2294 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
// per-tx-size band sizes; last entry is "remaining coefficients"
2295 static const int16_t band_counts[4][8] = {
2296 { 1, 2, 3, 4, 3, 16 - 13 },
2297 { 1, 2, 3, 4, 11, 64 - 21 },
2298 { 1, 2, 3, 4, 11, 256 - 21 },
2299 { 1, 2, 3, 4, 11, 1024 - 21 },
2301 const int16_t *y_band_counts = band_counts[b->tx];
2302 const int16_t *uv_band_counts = band_counts[b->uvtx];
2303 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2304 int total_coeff = 0;
// MERGE/MERGE_CTX collapse several 4px nnz-context entries into one
// before decoding with a larger transform
2306 #define MERGE(la, end, step, rd) \
2307 for (n = 0; n < end; n += step) \
2308 la[n] = !!rd(&la[n])
2309 #define MERGE_CTX(step, rd) \
2311 MERGE(l, end_y, step, rd); \
2312 MERGE(a, end_x, step, rd); \
// decode one luma plane's worth of transform blocks; "v" selects the
// 32x32 wrapper variant via token pasting
2315 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2316 for (n = 0, y = 0; y < end_y; y += step) { \
2317 for (x = 0; x < end_x; x += step, n += step * step) { \
2318 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2319 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2320 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2321 c, e, p, a[x] + l[y], yscans[txtp], \
2322 ynbs[txtp], y_band_counts, qmul[0]); \
2323 a[x] = l[y] = !!res; \
2324 total_coeff |= !!res; \
2326 AV_WN16A(&s->eob[n], res); \
// SPLAT/SPLAT_CTX fan one nnz-context value back out over the 4px slots
// after decoding with a larger transform
2333 #define SPLAT(la, end, step, cond) \
2335 for (n = 1; n < end; n += step) \
2336 la[n] = la[n - 1]; \
2337 } else if (step == 4) { \
2339 for (n = 0; n < end; n += step) \
2340 AV_WN32A(&la[n], la[n] * 0x01010101); \
2342 for (n = 0; n < end; n += step) \
2343 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2345 } else /* step == 8 */ { \
2347 if (HAVE_FAST_64BIT) { \
2348 for (n = 0; n < end; n += step) \
2349 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2351 for (n = 0; n < end; n += step) { \
2352 uint32_t v32 = la[n] * 0x01010101; \
2353 AV_WN32A(&la[n], v32); \
2354 AV_WN32A(&la[n + 4], v32); \
2358 for (n = 0; n < end; n += step) \
2359 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2362 #define SPLAT_CTX(step) \
2364 SPLAT(a, end_x, step, end_x == w4); \
2365 SPLAT(l, end_y, step, end_y == h4); \
// y tokens — dispatch on b->tx (switch itself on elided lines)
2371 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2374 MERGE_CTX(2, AV_RN16A);
2375 DECODE_Y_COEF_LOOP(2, 0,);
2379 MERGE_CTX(4, AV_RN32A);
2380 DECODE_Y_COEF_LOOP(4, 0,);
2384 MERGE_CTX(8, AV_RN64A);
2385 DECODE_Y_COEF_LOOP(8, 0, 32);
2390 #define DECODE_UV_COEF_LOOP(step, v) \
2391 for (n = 0, y = 0; y < end_y; y += step) { \
2392 for (x = 0; x < end_x; x += step, n += step * step) { \
2393 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2394 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2395 16 * step * step, c, e, p, a[x] + l[y], \
2396 uvscan, uvnb, uv_band_counts, qmul[1]); \
2397 a[x] = l[y] = !!res; \
2398 total_coeff |= !!res; \
2400 AV_WN16A(&s->uveob[pl][n], res); \
2402 s->uveob[pl][n] = res; \
// switch probability/counter tables to the chroma set
2407 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2408 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2409 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2414 for (pl = 0; pl < 2; pl++) {
2415 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2416 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2419 DECODE_UV_COEF_LOOP(1,);
2422 MERGE_CTX(2, AV_RN16A);
2423 DECODE_UV_COEF_LOOP(2,);
2427 MERGE_CTX(4, AV_RN32A);
2428 DECODE_UV_COEF_LOOP(4,);
2432 MERGE_CTX(8, AV_RN64A);
2433 DECODE_UV_COEF_LOOP(8, 32);
// Monomorphized entry point: coefficient decoding for 8 bpp content.
2442 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2444 return decode_coeffs(ctx, 1);
// Monomorphized entry point: coefficient decoding for >8 bpp content.
2447 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2449 return decode_coeffs(ctx, 0);
// Fix up the intra prediction mode for edge conditions and fill the
// above (*a) and left (l) edge-pixel buffers used by the prediction
// functions. Modes that need unavailable neighbours are remapped (e.g.
// VERT_PRED without a top row becomes DC_127_PRED); missing edge pixels
// are synthesized by extension or constant fill. Returns the (possibly
// remapped) mode.
// NOTE(review): several source lines are elided in this excerpt.
2452 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2453 uint8_t *dst_edge, ptrdiff_t stride_edge,
2454 uint8_t *dst_inner, ptrdiff_t stride_inner,
2455 uint8_t *l, int col, int x, int w,
2456 int row, int y, enum TxfmMode tx,
2457 int p, int ss_h, int ss_v, int bytesperpixel)
2459 int have_top = row > 0 || y > 0;
// left neighbours from another tile column are not available
2460 int have_left = col > s->tile_col_start || x > 0;
2461 int have_right = x < w - 1;
// mode remap table indexed by [mode][have_left][have_top]
2463 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2464 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2465 { DC_127_PRED, VERT_PRED } },
2466 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2467 { HOR_PRED, HOR_PRED } },
2468 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2469 { LEFT_DC_PRED, DC_PRED } },
2470 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2471 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2472 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2473 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2474 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2475 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2476 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2477 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2478 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2479 { DC_127_PRED, VERT_LEFT_PRED } },
2480 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2481 { HOR_UP_PRED, HOR_UP_PRED } },
2482 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2483 { HOR_PRED, TM_VP8_PRED } },
// which edge pixels each (remapped) mode needs
2485 static const struct {
2486 uint8_t needs_left:1;
2487 uint8_t needs_top:1;
2488 uint8_t needs_topleft:1;
2489 uint8_t needs_topright:1;
2490 uint8_t invert_left:1;
2491 } edges[N_INTRA_PRED_MODES] = {
2492 [VERT_PRED] = { .needs_top = 1 },
2493 [HOR_PRED] = { .needs_left = 1 },
2494 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2495 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2496 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2497 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2498 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2499 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2500 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2501 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2502 [LEFT_DC_PRED] = { .needs_left = 1 },
2503 [TOP_DC_PRED] = { .needs_top = 1 },
2504 [DC_128_PRED] = { 0 },
2505 [DC_127_PRED] = { 0 },
2506 [DC_129_PRED] = { 0 }
2509 av_assert2(mode >= 0 && mode < 10);
2510 mode = mode_conv[mode][have_left][have_top];
2511 if (edges[mode].needs_top) {
2512 uint8_t *top, *topleft;
2513 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2514 int n_px_need_tr = 0;
2516 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2519 // if top of sb64-row, use s->intra_pred_data[] instead of
2520 // dst[-stride] for intra prediction (it contains pre- instead of
2521 // post-loopfilter data)
2523 top = !(row & 7) && !y ?
2524 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2525 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2527 topleft = !(row & 7) && !y ?
2528 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2529 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2530 &dst_inner[-stride_inner];
// fast path (condition partly elided): enough real pixels available
2534 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2535 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2536 n_px_need + n_px_need_tr <= n_px_have) {
2540 if (n_px_need <= n_px_have) {
2541 memcpy(*a, top, n_px_need * bytesperpixel);
// bpp-agnostic pixel fill: replicate pixel (v)[(i2)] num times into (c)
2543 #define memset_bpp(c, i1, v, i2, num) do { \
2544 if (bytesperpixel == 1) { \
2545 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2547 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2548 for (n = 0; n < (num); n++) { \
2549 AV_WN16A(&(c)[((i1) + n) * 2], val); \
// extend the last available top pixel to the right
2553 memcpy(*a, top, n_px_have * bytesperpixel);
2554 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2557 #define memset_val(c, val, num) do { \
2558 if (bytesperpixel == 1) { \
2559 memset((c), (val), (num)); \
2562 for (n = 0; n < (num); n++) { \
2563 AV_WN16A(&(c)[n * 2], (val)); \
// no top row at all: fill with mid-grey minus one (bit-depth scaled)
2567 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2569 if (edges[mode].needs_topleft) {
2570 if (have_left && have_top) {
2571 #define assign_bpp(c, i1, v, i2) do { \
2572 if (bytesperpixel == 1) { \
2573 (c)[(i1)] = (v)[(i2)]; \
2575 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2578 assign_bpp(*a, -1, topleft, -1);
2580 #define assign_val(c, i, v) do { \
2581 if (bytesperpixel == 1) { \
2584 AV_WN16A(&(c)[(i) * 2], (v)); \
2587 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2590 if (tx == TX_4X4 && edges[mode].needs_topright) {
2591 if (have_top && have_right &&
2592 n_px_need + n_px_need_tr <= n_px_have) {
2593 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2595 memset_bpp(*a, 4, *a, 3, 4);
2600 if (edges[mode].needs_left) {
2602 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2603 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2604 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
// HOR_UP stores the left edge in reverse order (invert_left)
2606 if (edges[mode].invert_left) {
2607 if (n_px_need <= n_px_have) {
2608 for (i = 0; i < n_px_need; i++)
2609 assign_bpp(l, i, &dst[i * stride], -1);
2611 for (i = 0; i < n_px_have; i++)
2612 assign_bpp(l, i, &dst[i * stride], -1);
2613 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2616 if (n_px_need <= n_px_have) {
2617 for (i = 0; i < n_px_need; i++)
2618 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2620 for (i = 0; i < n_px_have; i++)
2621 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2622 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
// no left column: fill with mid-grey plus one
2626 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
// Reconstruct an intra-coded block: for every transform sub-block of the
// luma plane and then both chroma planes, build prediction edges
// (check_intra_mode), run the intra predictor, and add the inverse
// transform of the decoded residual when the block is not skipped.
// NOTE(review): some source lines are elided in this excerpt.
2633 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2634 ptrdiff_t uv_off, int bytesperpixel)
2636 VP9Context *s = ctx->priv_data;
2638 int row = s->row, col = s->col;
2639 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2640 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2641 int end_x = FFMIN(2 * (s->cols - col), w4);
2642 int end_y = FFMIN(2 * (s->rows - row), h4);
2643 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2644 int uvstep1d = 1 << b->uvtx, p;
// dst walks the (edge-emulated) working buffer, dst_r the real frame
2645 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2646 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2647 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2649 for (n = 0, y = 0; y < end_y; y += step1d) {
2650 uint8_t *ptr = dst, *ptr_r = dst_r;
2651 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2652 ptr_r += 4 * step1d * bytesperpixel, n += step) {
// sub-8x8 blocks carry a per-4x4 mode; larger blocks a single mode
2653 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2655 uint8_t *a = &a_buf[32];
2656 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2657 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2659 mode = check_intra_mode(s, mode, &a, ptr_r,
2660 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2661 ptr, s->y_stride, l,
2662 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2663 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2665 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2666 s->block + 16 * n * bytesperpixel, eob);
2668 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2669 dst += 4 * step1d * s->y_stride;
// chroma planes (end_x/end_y rescaling for subsampling is on elided lines)
2676 step = 1 << (b->uvtx * 2);
2677 for (p = 0; p < 2; p++) {
2678 dst = s->dst[1 + p];
2679 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2680 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2681 uint8_t *ptr = dst, *ptr_r = dst_r;
2682 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2683 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2684 int mode = b->uvmode;
2685 uint8_t *a = &a_buf[32];
2686 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2688 mode = check_intra_mode(s, mode, &a, ptr_r,
2689 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2690 ptr, s->uv_stride, l, col, x, w4, row, y,
2691 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2692 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2694 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2695 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2697 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2698 dst += 4 * uvstep1d * s->uv_stride;
// Monomorphized entry point: intra reconstruction, 1 byte per pixel.
2703 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2705 intra_recon(ctx, y_off, uv_off, 1);
// Monomorphized entry point: intra reconstruction, 2 bytes per pixel.
2708 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2710 intra_recon(ctx, y_off, uv_off, 2);
// Luma motion compensation when the reference frame has the same
// dimensions as the current frame (no scaling). Waits on the reference
// frame's decode progress for frame threading, then either filters
// directly from the reference or via the edge-emulation buffer when the
// filter footprint would read outside the picture.
2713 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2714 uint8_t *dst, ptrdiff_t dst_stride,
2715 const uint8_t *ref, ptrdiff_t ref_stride,
2716 ThreadFrame *ref_frame,
2717 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2718 int bw, int bh, int w, int h, int bytesperpixel)
2720 int mx = mv->x, my = mv->y, th;
// advance to the integer-pel reference position (x/y adjusted by the
// integer part of the mv on elided lines)
2724 ref += y * ref_stride + x * bytesperpixel;
2727 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2728 // we use +7 because the last 7 pixels of each sbrow can be changed in
2729 // the longest loopfilter of the next sbrow
// th = last 64px reference row the 8-tap filter (4 extra rows when
// subpel) may touch; wait until the ref-frame thread has decoded it
2730 th = (y + bh + 4 * !!my + 7) >> 6;
2731 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
// 8-tap filter needs 3 pixels before / 4 after each subpel position
2732 if (x < !!mx * 3 || y < !!my * 3 ||
2733 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2734 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2735 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2737 bw + !!mx * 7, bh + !!my * 7,
2738 x - !!mx * 3, y - !!my * 3, w, h);
// 160 matches the edge_emu_buffer line stride used here — the stride
// argument itself is on an elided line; TODO confirm
2739 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2742 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
// Chroma motion compensation, unscaled reference. Same structure as
// mc_luma_unscaled but operates on both chroma planes at once and
// upshifts the mv by the subsampling factors.
2745 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2746 uint8_t *dst_u, uint8_t *dst_v,
2747 ptrdiff_t dst_stride,
2748 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2749 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2750 ThreadFrame *ref_frame,
2751 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2752 int bw, int bh, int w, int h, int bytesperpixel)
// mv is in luma units; scale up when the plane is not subsampled
2754 int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2758 ref_u += y * src_stride_u + x * bytesperpixel;
2759 ref_v += y * src_stride_v + x * bytesperpixel;
2762 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2763 // we use +7 because the last 7 pixels of each sbrow can be changed in
2764 // the longest loopfilter of the next sbrow
// progress is tracked in luma rows, hence the ss_v adjustment
2765 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2766 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2767 if (x < !!mx * 3 || y < !!my * 3 ||
2768 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
// U plane through the edge-emulation buffer
2769 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2770 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2772 bw + !!mx * 7, bh + !!my * 7,
2773 x - !!mx * 3, y - !!my * 3, w, h);
2774 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2775 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
// V plane through the same buffer (reused after the U filter ran)
2777 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2778 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2780 bw + !!mx * 7, bh + !!my * 7,
2781 x - !!mx * 3, y - !!my * 3, w, h);
2782 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2783 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
// fast path: filter straight from the reference planes
2785 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2786 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
// Instantiate the shared MC template (vp9_mc_template.c) for the
// unscaled path, once per bit-depth class. The mc_luma_dir/
// mc_chroma_dir macros bind the template's generic call sites to the
// unscaled helpers above; FN() renames the generated functions.
2790 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2791 px, py, pw, ph, bw, bh, w, h, i) \
2792 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2793 mv, bw, bh, w, h, bytesperpixel)
2794 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2795 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2796 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2797 row, col, mv, bw, bh, w, h, bytesperpixel)
2799 #define FN(x) x##_8bpp
2800 #define BYTES_PER_PIXEL 1
2801 #include "vp9_mc_template.c"
2803 #undef BYTES_PER_PIXEL
2804 #define FN(x) x##_16bpp
2805 #define BYTES_PER_PIXEL 2
2806 #include "vp9_mc_template.c"
2808 #undef mc_chroma_dir
2810 #undef BYTES_PER_PIXEL
// Luma motion compensation against a reference frame of different
// dimensions: falls back to the unscaled path when sizes match,
// otherwise scales mv and position through the precomputed scale/step
// factors and uses the scaled-MC DSP function (smc).
2813 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2814 vp9_mc_func (*mc)[2],
2815 uint8_t *dst, ptrdiff_t dst_stride,
2816 const uint8_t *ref, ptrdiff_t ref_stride,
2817 ThreadFrame *ref_frame,
2818 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2819 int px, int py, int pw, int ph,
2820 int bw, int bh, int w, int h, int bytesperpixel,
2821 const uint16_t *scale, const uint8_t *step)
2823 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2824 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2825 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2826 y, x, in_mv, bw, bh, w, h, bytesperpixel);
// 14-bit fixed-point scale factor, per VP9 inter-frame scaling
2828 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2830 int refbw_m1, refbh_m1;
// clamp the mv so the prediction block stays near the visible area
2834 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2835 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2836 // BUG libvpx seems to scale the two components separately. This introduces
2837 // rounding errors but we have to reproduce them to be exactly compatible
2838 // with the output from libvpx...
2839 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2840 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2844 ref += y * ref_stride + x * bytesperpixel;
// last reference row/col the stepped filter will touch
2847 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2848 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2849 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2850 // we use +7 because the last 7 pixels of each sbrow can be changed in
2851 // the longest loopfilter of the next sbrow
2852 th = (y + refbh_m1 + 4 + 7) >> 6;
2853 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2854 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2855 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2856 ref - 3 * ref_stride - 3 * bytesperpixel,
2858 refbw_m1 + 8, refbh_m1 + 8,
2859 x - 3, y - 3, w, h);
// scaled path uses a 288-byte edge-emu line stride (vs 160 unscaled)
2860 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2863 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
// Chroma motion compensation against a scaled reference; mirrors
// mc_luma_scaled for both chroma planes, with per-axis handling that
// depends on chroma subsampling (including a reproduced libvpx rounding
// bug, see webm issue 820 below).
2867 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2868 vp9_mc_func (*mc)[2],
2869 uint8_t *dst_u, uint8_t *dst_v,
2870 ptrdiff_t dst_stride,
2871 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2872 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2873 ThreadFrame *ref_frame,
2874 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2875 int px, int py, int pw, int ph,
2876 int bw, int bh, int w, int h, int bytesperpixel,
2877 const uint16_t *scale, const uint8_t *step)
2879 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2880 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2881 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2882 ref_v, src_stride_v, ref_frame,
2883 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2886 int refbw_m1, refbh_m1;
// subsampled-x branch (the if itself is on an elided line):
2891 // BUG https://code.google.com/p/webm/issues/detail?id=820
2892 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2893 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2895 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2896 mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
// same split for the vertical axis:
2899 // BUG https://code.google.com/p/webm/issues/detail?id=820
2900 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2901 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2903 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2904 my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2909 ref_u += y * src_stride_u + x * bytesperpixel;
2910 ref_v += y * src_stride_v + x * bytesperpixel;
2913 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2914 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2915 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2916 // we use +7 because the last 7 pixels of each sbrow can be changed in
2917 // the longest loopfilter of the next sbrow
2918 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2919 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2920 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
// U plane via the edge-emulation buffer (288-byte line stride)
2921 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2922 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2924 refbw_m1 + 8, refbh_m1 + 8,
2925 x - 3, y - 3, w, h);
2926 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2927 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
// V plane via the same buffer
2929 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2930 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2932 refbw_m1 + 8, refbh_m1 + 8,
2933 x - 3, y - 3, w, h);
2934 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2935 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2937 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2938 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
// Second instantiation of the MC template, now bound to the scaled
// helpers (per-reference scale/step factors are looked up by ref index).
2943 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2944 px, py, pw, ph, bw, bh, w, h, i) \
2945 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2946 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2947 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2948 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2949 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2950 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2951 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2952 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2954 #define FN(x) x##_scaled_8bpp
2955 #define BYTES_PER_PIXEL 1
2956 #include "vp9_mc_template.c"
2958 #undef BYTES_PER_PIXEL
2959 #define FN(x) x##_scaled_16bpp
2960 #define BYTES_PER_PIXEL 2
2961 #include "vp9_mc_template.c"
2963 #undef mc_chroma_dir
2965 #undef BYTES_PER_PIXEL
// Reconstruct an inter-coded block: run motion-compensated prediction
// (scaled or unscaled variant, selected by whether any used reference
// has a non-zero mvscale), then — unless the block is skipped (check on
// elided lines) — add the inverse-transformed residuals for luma and
// both chroma planes.
2968 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2970 VP9Context *s = ctx->priv_data;
2972 int row = s->row, col = s->col;
// a non-zero mvscale[ref][0] marks a reference of different dimensions
2974 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2975 if (bytesperpixel == 1) {
2976 inter_pred_scaled_8bpp(ctx);
2978 inter_pred_scaled_16bpp(ctx);
2981 if (bytesperpixel == 1) {
2982 inter_pred_8bpp(ctx);
2984 inter_pred_16bpp(ctx);
2988 /* mostly copied intra_recon() */
2990 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2991 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2992 int end_x = FFMIN(2 * (s->cols - col), w4);
2993 int end_y = FFMIN(2 * (s->rows - row), h4);
2994 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2995 int uvstep1d = 1 << b->uvtx, p;
2996 uint8_t *dst = s->dst[0];
// luma residuals: inter blocks always use the DCT_DCT transform type
2999 for (n = 0, y = 0; y < end_y; y += step1d) {
3001 for (x = 0; x < end_x; x += step1d,
3002 ptr += 4 * step1d * bytesperpixel, n += step) {
3003 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3006 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3007 s->block + 16 * n * bytesperpixel, eob);
3009 dst += 4 * s->y_stride * step1d;
// chroma residuals
3015 step = 1 << (b->uvtx * 2);
3016 for (p = 0; p < 2; p++) {
3017 dst = s->dst[p + 1];
3018 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3020 for (x = 0; x < end_x; x += uvstep1d,
3021 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3022 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3025 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3026 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3028 dst += 4 * uvstep1d * s->uv_stride;
// Monomorphized entry point: inter reconstruction, 1 byte per pixel.
3034 static void inter_recon_8bpp(AVCodecContext *ctx)
3036 inter_recon(ctx, 1);
// Monomorphized entry point: inter reconstruction, 2 bytes per pixel.
3039 static void inter_recon_16bpp(AVCodecContext *ctx)
3041 inter_recon(ctx, 2);
// Accumulate loopfilter edge masks for one block into lflvl->mask.
// mask[0] holds column (vertical-edge) masks, mask[1] row (horizontal-
// edge) masks; the last index selects filter width (0=16px, 1=8px,
// 2=4px, 3=inner 4px). Bits within a mask word correspond to 8px
// columns, rows are addressed by the y index.
// NOTE(review): several source lines are elided in this excerpt.
3044 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3045 int row_and_7, int col_and_7,
3046 int w, int h, int col_end, int row_end,
3047 enum TxfmMode tx, int skip_inter)
3049 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3050 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3052 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3053 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3054 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3055 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3057 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3058 // edges. This means that for UV, we work on two subsampled blocks at
3059 // a time, and we only use the topleft block's mode information to set
3060 // things like block strength. Thus, for any block size smaller than
3061 // 16x16, ignore the odd portion of the block.
3062 if (tx == TX_4X4 && (ss_v | ss_h)) {
3077 if (tx == TX_4X4 && !skip_inter) {
// t = bit for this block's first column; m_col covers its w columns
3078 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3079 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3080 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3082 for (y = row_and_7; y < h + row_and_7; y++) {
3083 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3085 mask[0][y][1] |= m_row_8;
3086 mask[0][y][2] |= m_row_4;
3087 // for odd lines, if the odd col is not being filtered,
3088 // skip odd row also:
3095 // if a/c are even row/col and b/d are odd, and d is skipped,
3096 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3097 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3098 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3100 mask[1][y][col_mask_id] |= m_col;
3103 mask[0][y][3] |= m_col;
3105 if (ss_h && (col_end & 1))
3106 mask[1][y][3] |= (t << (w - 1)) - t;
3108 mask[1][y][3] |= m_col;
// non-4x4 / skip_inter path (outer condition partly elided)
3112 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3115 int mask_id = (tx == TX_8X8);
3116 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3117 int l2 = tx + ss_h - 1, step1d;
3118 int m_row = m_col & masks[l2];
3120 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3121 // 8wd loopfilter to prevent going off the visible edge.
3122 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3123 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3124 int m_row_8 = m_row - m_row_16;
3126 for (y = row_and_7; y < h + row_and_7; y++) {
3127 mask[0][y][0] |= m_row_16;
3128 mask[0][y][1] |= m_row_8;
3131 for (y = row_and_7; y < h + row_and_7; y++)
3132 mask[0][y][mask_id] |= m_row;
3137 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3138 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3139 mask[1][y][0] |= m_col;
3140 if (y - row_and_7 == h - 1)
3141 mask[1][y][1] |= m_col;
3143 for (y = row_and_7; y < h + row_and_7; y += step1d)
3144 mask[1][y][mask_id] |= m_col;
3146 } else if (tx != TX_4X4) {
3149 mask_id = (tx == TX_8X8) || (h == ss_v);
3150 mask[1][row_and_7][mask_id] |= m_col;
3151 mask_id = (tx == TX_8X8) || (w == ss_h);
3152 for (y = row_and_7; y < h + row_and_7; y++)
3153 mask[0][y][mask_id] |= t;
// remaining TX_4X4 case: split the leading column bit into 8px/4px parts
3155 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3157 for (y = row_and_7; y < h + row_and_7; y++) {
3158 mask[0][y][2] |= t4;
3159 mask[0][y][1] |= t8;
3161 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
// Decode and reconstruct a single block at (row, col): clamps the MV search
// range, decodes residual coefficients (8/16 bpp paths), runs intra or inter
// reconstruction with edge emulation for blocks overhanging the frame, and
// records per-block loopfilter levels/edge masks in *lflvl.
// NOTE(review): this listing has interior lines elided (source line numbers
// jump); comments below only describe the statements that are visible.
3166 static void decode_b(AVCodecContext *ctx, int row, int col,
3167 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3168 enum BlockLevel bl, enum BlockPartition bp)
3170     VP9Context *s = ctx->priv_data;
3172     enum BlockSize bs = bl * 3 + bp;
3173     int bytesperpixel = s->bytesperpixel;
3174     int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3176     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
// Clamp allowed motion vectors to the frame area plus a 128-unit border,
// expressed in 1/8-pel units relative to this block's position.
3182     s->min_mv.x = -(128 + col * 64);
3183     s->min_mv.y = -(128 + row * 64);
3184     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3185     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// Chroma tx size: one step smaller than luma tx when subsampling makes the
// luma tx span the whole (halved) chroma block.
3191     b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3192                        (s->ss_v && h4 * 2 == (1 << b->tx)));
// Coefficient decoding is specialized per bit depth.
3197     if (bytesperpixel == 1) {
3198         has_coeffs = decode_coeffs_8bpp(ctx);
3200         has_coeffs = decode_coeffs_16bpp(ctx);
// No coefficients in a non-intra sub-8x8 block: mark it as skipped in the
// above/left skip contexts.
3202     if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3204         memset(&s->above_skip_ctx[col], 1, w4);
3205         memset(&s->left_skip_ctx[s->row7], 1, h4);
// Helper macros: zero 1..16 bytes of non-zero-coefficient context in one
// aligned store, for luma and (subsampling-aware) chroma.
3210 #define SPLAT_ZERO_CTX(v, n) \
3212     case 1:  v = 0;          break; \
3213     case 2:  AV_ZERO16(&v);  break; \
3214     case 4:  AV_ZERO32(&v);  break; \
3215     case 8:  AV_ZERO64(&v);  break; \
3216     case 16: AV_ZERO128(&v); break; \
3218 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3220         SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3221         if (s->ss_##dir2) { \
3222             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3223             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3225             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3226             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
// Clear the above/left nnz contexts covered by this block (dispatch on width
// resp. height in 4-sample units).
3231     case 1:  SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3232     case 2:  SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3233     case 4:  SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3234     case 8:  SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3237     case 1:  SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3238     case 2:  SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3239     case 4:  SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3240     case 8:  SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// Advance the per-block coefficient/EOB buffer pointers past this block
// (presumably the early-exit path of a multi-pass decode — the surrounding
// branch is elided in this listing; TODO confirm against full source).
3246         s->block += w4 * h4 * 64 * bytesperpixel;
3247         s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3248         s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3249         s->eob += 4 * w4 * h4;
3250         s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3251         s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3257     // emulated overhangs if the stride of the target buffer can't hold. This
3258     // makes it possible to support emu-edge and so on even if we have large block
// emu[plane] is set when the block extends past the buffer stride or below
// the visible rows, forcing reconstruction into temporary buffers.
3260     emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3261              (row + h4) > s->rows;
3262     emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3263              (row + h4) > s->rows;
// Select destination: temporary luma buffer when emulating, otherwise the
// frame itself.
3265         s->dst[0] = s->tmp_y;
3268         s->dst[0] = f->data[0] + yoff;
3269         s->y_stride = f->linesize[0];
// Same selection for the two chroma planes.
3272         s->dst[1] = s->tmp_uv[0];
3273         s->dst[2] = s->tmp_uv[1];
3276         s->dst[1] = f->data[1] + uvoff;
3277         s->dst[2] = f->data[2] + uvoff;
3278         s->uv_stride = f->linesize[1];
// Reconstruction, specialized for intra/inter and bit depth.
3282             intra_recon_16bpp(ctx, yoff, uvoff);
3284             intra_recon_8bpp(ctx, yoff, uvoff);
3288             inter_recon_16bpp(ctx);
3290             inter_recon_8bpp(ctx);
// Copy emulated luma reconstruction back into the frame, in power-of-two
// wide slices selected via the mc function table.
3294         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3296         for (n = 0; o < w; n++) {
3301                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3302                                          s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// Same copy-back for the two chroma planes (subsampling-adjusted sizes).
3308         int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3309         int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3311         for (n = s->ss_h; o < w; n++) {
3316                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3317                                          s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3318                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3319                                          s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3325     // pick filter level and find edges to apply filter to
3326     if (s->s.h.filter.level &&
3327         (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3328                                                         [b->mode[3] != ZEROMV]) > 0) {
3329         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3330         int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
// Record the level for each 4x4 unit of this block and compute luma (and,
// if subsampled, chroma) edge masks for the loopfilter.
3332         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3333         mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3334         if (s->ss_h || s->ss_v)
3335             mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3336                        s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3337                        s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3338                        b->uvtx, skip_inter);
// Lazily fill the filter-limit LUT entries for this level (sharpness-derived).
3340         if (!s->filter_lut.lim_lut[lvl]) {
3341             int sharp = s->s.h.filter.sharpness;
3345                 limit >>= (sharp + 3) >> 2;
3346                 limit = FFMIN(limit, 9 - sharp);
3348             limit = FFMAX(limit, 1);
3350             s->filter_lut.lim_lut[lvl] = limit;
3351             s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// Advance the per-block buffer pointers after reconstruction (second
// occurrence; the branch separating it from the one above is elided here).
3357         s->block += w4 * h4 * 64 * bytesperpixel;
3358         s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3359         s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3360         s->eob += 4 * w4 * h4;
3361         s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3362         s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
// Recursively decode a superblock subtree: read the partition symbol from the
// range coder (context c built from above/left partition contexts), then
// dispatch to decode_b for leaves or recurse for PARTITION_SPLIT. At frame
// edges where only half the block is visible, the partition choice is
// restricted (single prob branch) instead of reading a full tree symbol.
// NOTE(review): listing has elided lines; case labels for H/V are not visible.
3366 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3367 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3369     VP9Context *s = ctx->priv_data;
// 2-bit context from above/left partition state at this block level.
3370     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3371             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// Keyframes/intra-only frames use the fixed default partition probabilities.
3372     const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3373                                                      s->prob.p.partition[bl][c];
3374     enum BlockPartition bp;
3375     ptrdiff_t hbs = 4 >> bl;   // half block size in 8-pixel units
3376     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3377     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3378     int bytesperpixel = s->bytesperpixel;
// Smallest level: read partition and decode directly.
3381         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3382         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3383     } else if (col + hbs < s->cols) { // FIXME why not <=?
3384         if (row + hbs < s->rows) { // FIXME why not <=?
// Fully inside the frame: full partition tree symbol.
3385             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3387             case PARTITION_NONE:
3388                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// (PARTITION_H path) top half, then bottom half hbs rows below.
3391                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3392                 yoff  += hbs * 8 * y_stride;
3393                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3394                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// (PARTITION_V path) left half, then right half hbs cols to the right.
3397                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3398                 yoff  += hbs * 8 * bytesperpixel;
3399                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3400                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3402             case PARTITION_SPLIT:
// Recurse into the four quadrants at the next block level.
3403                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3404                 decode_sb(ctx, row, col + hbs, lflvl,
3405                           yoff + 8 * hbs * bytesperpixel,
3406                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3407                 yoff  += hbs * 8 * y_stride;
3408                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3409                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3410                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3411                           yoff + 8 * hbs * bytesperpixel,
3412                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only SPLIT vs horizontal-ish partition is signalled (p[1]).
3417         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3418             bp = PARTITION_SPLIT;
3419             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3420             decode_sb(ctx, row, col + hbs, lflvl,
3421                       yoff + 8 * hbs * bytesperpixel,
3422                       uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3425             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Right edge: only SPLIT vs vertical-ish partition is signalled (p[2]).
3427     } else if (row + hbs < s->rows) { // FIXME why not <=?
3428         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3429             bp = PARTITION_SPLIT;
3430             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3431             yoff  += hbs * 8 * y_stride;
3432             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3433             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3436             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Corner: only the visible quadrant exists, forced SPLIT.
3439         bp = PARTITION_SPLIT;
3440         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// Count the chosen partition for backward probability adaptation.
3442     s->counts.partition[bl][c][bp]++;
// Second-pass variant of decode_sb: instead of reading partition symbols from
// the bitstream, it replays the block structure already stored in s->b
// (b->bl / b->bp recorded during pass 1) and re-runs decode_b for each block.
// NOTE(review): listing has elided lines (opening braces, some conditions).
3445 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3446 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3448     VP9Context *s = ctx->priv_data;
3450     ptrdiff_t hbs = 4 >> bl;   // half block size in 8-pixel units
3451     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3452     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3453     int bytesperpixel = s->bytesperpixel;
// Smallest level: must be an 8x8 leaf.
3456         av_assert2(b->bl == BL_8X8);
3457         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3458     } else if (s->b->bl == bl) {
// Leaf at this level; H/V partitions decode the second half when visible.
3459         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3460         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3461             yoff  += hbs * 8 * y_stride;
3462             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3463             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3464         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3465             yoff  += hbs * 8 * bytesperpixel;
3466             uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3467             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// Split: recurse into the visible quadrants, mirroring decode_sb's layout.
3470         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3471         if (col + hbs < s->cols) { // FIXME why not <=?
3472             if (row + hbs < s->rows) {
3473                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3474                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3475                 yoff  += hbs * 8 * y_stride;
3476                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3477                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3478                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3479                               yoff + 8 * hbs * bytesperpixel,
3480                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom row clipped: only the right neighbor exists.
3482                 yoff  += hbs * 8 * bytesperpixel;
3483                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3484                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3486         } else if (row + hbs < s->rows) {
// Right column clipped: only the bottom neighbor exists.
3487             yoff  += hbs * 8 * y_stride;
3488             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3489             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Loopfilter all vertical edges (edges between horizontally adjacent blocks)
// of one plane within a 64x64 superblock. mask[y][i] is a column bitmask per
// 8-pixel row band: i=0 selects 16-wide, i=1 8-wide, i=2 4-wide filters and
// i=3 the inner 4-tap edges (see VP9Filter in the header). lvl holds the
// per-4x4 filter levels; E/I/H are derived from the precomputed LUTs.
// NOTE(review): several else/brace lines are elided in this listing.
3494 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3495 uint8_t *lvl, uint8_t (*mask)[4],
3496 uint8_t *dst, ptrdiff_t ls)
3498     int y, x, bytesperpixel = s->bytesperpixel;
3500     // filter edges between columns (e.g. block1 | block2)
3501     for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
// hmask1/hmask2 are the masks of the two 8-row bands processed per iteration.
3502         uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3503         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3504         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3505         unsigned hm = hm1 | hm2 | hm13 | hm23;
// Walk columns left to right; stop once no mask bit at or above x remains.
3507         for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3510                 int L = *l, H = L >> 4;
3511                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3513                 if (hmask1[0] & x) {
3514                     if (hmask2[0] & x) {
// Both bands want the 16-wide filter: one combined call.
3515                         av_assert2(l[8 << ss_v] == L);
3516                         s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3518                         s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3520                 } else if (hm2 & x) {
// Two different levels stacked vertically: pack both E/I into 16 bits for
// the mixed two-edge filter.
3523                     E |= s->filter_lut.mblim_lut[L] << 8;
3524                     I |= s->filter_lut.lim_lut[L] << 8;
3525                     s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3527                                            [0](ptr, ls, E, I, H);
3529                     s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3530                                         [0](ptr, ls, E, I, H);
3532             } else if (hm2 & x) {
// Only the lower band has an edge at this column.
3533                 int L = l[8 << ss_v], H = L >> 4;
3534                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3536                 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3537                                     [0](ptr + 8 * ls, ls, E, I, H);
// Inner (mask index 3) edges, offset 4 pixels into the block.
3545                 int L = *l, H = L >> 4;
3546                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3551                     E |= s->filter_lut.mblim_lut[L] << 8;
3552                     I |= s->filter_lut.lim_lut[L] << 8;
3553                     s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3555                     s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3557             } else if (hm23 & x) {
// Inner edge only in the lower band.
3558                 int L = l[8 << ss_v], H = L >> 4;
3559                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3561                 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
// Loopfilter all horizontal edges (edges between vertically adjacent blocks)
// of one plane within a 64x64 superblock — the row-direction counterpart of
// filter_plane_cols, using mask[...][1] row masks. Adjacent column bits are
// paired (x and x << (1 + ss_h)) so two horizontally neighboring edges with
// matching widths can share one wider dsp call.
// NOTE(review): several else/brace lines are elided in this listing.
3569 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3570 uint8_t *lvl, uint8_t (*mask)[4],
3571 uint8_t *dst, ptrdiff_t ls)
3573     int y, x, bytesperpixel = s->bytesperpixel;
3576     // filter edges between rows (e.g. ------)
3578     for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3579         uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3580         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// Step two columns (or four with chroma subsampling) per iteration.
3582         for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3585                 int L = *l, H = L >> 4;
3586                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3589                     if (vmask[0] & (x << (1 + ss_h))) {
// Both neighboring columns want the widest filter: one 16-wide call.
3590                         av_assert2(l[1 + ss_h] == L);
3591                         s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3593                         s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3595                 } else if (vm & (x << (1 + ss_h))) {
// Two side-by-side edges with different levels: packed E/I, mixed filter.
3598                     E |= s->filter_lut.mblim_lut[L] << 8;
3599                     I |= s->filter_lut.lim_lut[L] << 8;
3600                     s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3601                                            [!!(vmask[1] & (x << (1 + ss_h)))]
3602                                            [1](ptr, ls, E, I, H);
3604                     s->dsp.loop_filter_8[!!(vmask[1] & x)]
3605                                         [1](ptr, ls, E, I, H);
3607             } else if (vm & (x << (1 + ss_h))) {
// Edge only in the right-hand column of the pair.
3608                 int L = l[1 + ss_h], H = L >> 4;
3609                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3611                 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3612                                     [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// Inner (vm3) edges, offset 4 rows into the block.
3617                 int L = *l, H = L >> 4;
3618                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3620                 if (vm3 & (x << (1 + ss_h))) {
3623                     E |= s->filter_lut.mblim_lut[L] << 8;
3624                     I |= s->filter_lut.lim_lut[L] << 8;
3625                     s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3627                     s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3629             } else if (vm3 & (x << (1 + ss_h))) {
// Inner edge only in the right-hand column.
3630                 int L = l[1 + ss_h], H = L >> 4;
3631                 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3633                 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
// Run the in-loop deblocking filter over one 64x64 superblock: column edges
// then row edges for luma, then the same for both chroma planes (which share
// one set of masks, selected by whether any subsampling is in effect).
3646 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3647 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3649     VP9Context *s = ctx->priv_data;
3650     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3651     uint8_t *dst = f->data[0] + yoff;
3652     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
// Chroma uses mask set [1] when either dimension is subsampled, else [0].
3653     uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3656     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3657     // if you think of them as acting on a 8x8 block max, we can interleave
3658     // each v/h within the single x loop, but that only works if we work on
3659     // 8 pixel blocks, and we won't always do that (we want at least 16px
3660     // to use SSE2 optimizations, perhaps 32 for AVX2)
3662     filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3663     filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
// Both chroma planes share levels and masks; only the data pointer changes.
3665     for (p = 0; p < 2; p++) {
3666         dst = f->data[1 + p] + uvoff;
3667         filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3668         filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
// Compute the [start, end) pixel-row/col range of tile `idx` out of 2^log2_n
// tiles covering n superblocks; results are clamped to n and scaled by 8
// (superblock units -> 8-pixel block units).
3672 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3674     int sb_start = ( idx      * n) >> log2_n;
3675     int sb_end   = ((idx + 1) * n) >> log2_n;
3676     *start = FFMIN(sb_start, n) << 3;
3677     *end   = FFMIN(sb_end,   n) << 3;
// Blend the stored probability *p toward the empirical probability implied by
// the event counts ct0 (event happened) / ct1 (did not), with the blend
// strength scaled by how many events were observed (capped at max_count).
// NOTE(review): the early-out for ct == 0 is elided from this listing.
3680 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3681 int max_count, int update_factor)
3683     unsigned ct = ct0 + ct1, p2, p1;
// p2 = rounded empirical probability of the event, clipped to a valid
// range-coder probability (1..255).
3689     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3690     p2 = av_clip(p2, 1, 255);
3691     ct = FFMIN(ct, max_count);
3692     update_factor = FASTDIV(update_factor * ct, max_count);
3694     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3695     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
// Backward probability adaptation: after a frame is decoded, fold the symbol
// counts gathered in s->counts into the frame-context probabilities
// (s->prob_ctx[framectxid]) so the next frame starts from better estimates.
// Coefficient probs use a (possibly reduced) update factor uf; everything
// else uses the fixed 20/128 parameters.
3698 static void adapt_probs(VP9Context *s)
3701     prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
// Weaker update (112) right after a keyframe/intra-only/reset context.
3702     int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
// Coefficient probabilities: [tx size][plane type][ref][band][coef context].
3705     for (i = 0; i < 4; i++)
3706         for (j = 0; j < 2; j++)
3707             for (k = 0; k < 2; k++)
3708                 for (l = 0; l < 6; l++)
3709                     for (m = 0; m < 6; m++) {
3710                         uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3711                         unsigned *e = s->counts.eob[i][j][k][l][m];
3712                         unsigned *c = s->counts.coef[i][j][k][l][m];
3714                         if (l == 0 && m >= 3) // dc only has 3 pt
3717                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3718                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3719                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
// Keyframes/intra-only: copy current probs for skip/tx verbatim and stop —
// the inter-related probabilities below don't apply.
3722     if (s->s.h.keyframe || s->s.h.intraonly) {
3723         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3724         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3725         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3726         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
// Skip flag probabilities (3 contexts).
3731     for (i = 0; i < 3; i++)
3732         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// Intra/inter flag probabilities (4 contexts).
3735     for (i = 0; i < 4; i++)
3736         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// Compound-prediction mode selection (only when switchable per block).
3739     if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3740         for (i = 0; i < 5; i++)
3741             adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// Compound reference-frame selection.
3745     if (s->s.h.comppredmode != PRED_SINGLEREF) {
3746         for (i = 0; i < 5; i++)
3747             adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3748                        s->counts.comp_ref[i][1], 20, 128);
// Single reference-frame selection (two binary decisions per context).
3751     if (s->s.h.comppredmode != PRED_COMPREF) {
3752         for (i = 0; i < 5; i++) {
3753             uint8_t *pp = p->single_ref[i];
3754             unsigned (*c)[2] = s->counts.single_ref[i];
3756             adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3757             adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3761     // block partitioning
3762     for (i = 0; i < 4; i++)
3763         for (j = 0; j < 4; j++) {
3764             uint8_t *pp = p->partition[i][j];
3765             unsigned *c = s->counts.partition[i][j];
3767             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3768             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3769             adapt_prob(&pp[2], c[2], c[3], 20, 128);
// Transform-size selection trees (8/16/32), only in switchable mode.
3773     if (s->s.h.txfmmode == TX_SWITCHABLE) {
3774         for (i = 0; i < 2; i++) {
3775             unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3777             adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3778             adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3779             adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3780             adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3781             adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3782             adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3786     // interpolation filter
3787     if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3788         for (i = 0; i < 4; i++) {
3789             uint8_t *pp = p->filter[i];
3790             unsigned *c = s->counts.filter[i];
3792             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3793             adapt_prob(&pp[1], c[1], c[2], 20, 128);
// Inter mode tree (ZEROMV/NEARESTMV/NEARMV/NEWMV) per context.
3798     for (i = 0; i < 7; i++) {
3799         uint8_t *pp = p->mv_mode[i];
3800         unsigned *c = s->counts.mv_mode[i];
3802         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3803         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3804         adapt_prob(&pp[2], c[1], c[3], 20, 128);
// MV joint (which of x/y components are non-zero).
3809         uint8_t *pp = p->mv_joint;
3810         unsigned *c = s->counts.mv_joint;
3812         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3813         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3814         adapt_prob(&pp[2], c[2], c[3], 20, 128);
// Per-component (x, y) MV probabilities.
3818     for (i = 0; i < 2; i++) {
3820         unsigned *c, (*c2)[2], sum;
3822         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3823                    s->counts.mv_comp[i].sign[1], 20, 128);
// Magnitude class tree: each node's probability is adapted against the sum
// of counts of the remaining classes, peeled off step by step.
3825         pp = p->mv_comp[i].classes;
3826         c = s->counts.mv_comp[i].classes;
3827         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3828         adapt_prob(&pp[0], c[0], sum, 20, 128);
3830         adapt_prob(&pp[1], c[1], sum, 20, 128);
3832         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3833         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3835         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3836         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3838         adapt_prob(&pp[6], c[6], sum, 20, 128);
3839         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3840         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3841         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3843         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3844                    s->counts.mv_comp[i].class0[1], 20, 128);
// Integer offset bits for classes > 0.
3845         pp = p->mv_comp[i].bits;
3846         c2 = s->counts.mv_comp[i].bits;
3847         for (j = 0; j < 10; j++)
3848             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// Fractional-pel trees, for class0 and for the general case.
3850         for (j = 0; j < 2; j++) {
3851             pp = p->mv_comp[i].class0_fp[j];
3852             c = s->counts.mv_comp[i].class0_fp[j];
3853             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3854             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3855             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3857         pp = p->mv_comp[i].fp;
3858         c = s->counts.mv_comp[i].fp;
3859         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3860         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3861         adapt_prob(&pp[2], c[2], c[3], 20, 128);
// High-precision (1/8-pel) bits only when the header enabled them.
3863         if (s->s.h.highprecisionmvs) {
3864             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3865                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3866             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3867                        s->counts.mv_comp[i].hp[1], 20, 128);
// Luma intra mode tree, per block-size group; sum is progressively reduced
// as each mode's count is peeled off the remaining-probability mass.
3872     for (i = 0; i < 4; i++) {
3873         uint8_t *pp = p->y_mode[i];
3874         unsigned *c = s->counts.y_mode[i], sum, s2;
3876         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3877         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3878         sum -= c[TM_VP8_PRED];
3879         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3880         sum -= c[VERT_PRED];
3881         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3882         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3884         adapt_prob(&pp[3], s2, sum, 20, 128);
3886         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3887         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3888         sum -= c[DIAG_DOWN_LEFT_PRED];
3889         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3890         sum -= c[VERT_LEFT_PRED];
3891         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3892         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Chroma intra mode tree, per luma mode — same tree shape as above.
3896     for (i = 0; i < 10; i++) {
3897         uint8_t *pp = p->uv_mode[i];
3898         unsigned *c = s->counts.uv_mode[i], sum, s2;
3900         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3901         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3902         sum -= c[TM_VP8_PRED];
3903         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3904         sum -= c[VERT_PRED];
3905         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3906         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3908         adapt_prob(&pp[3], s2, sum, 20, 128);
3910         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3911         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3912         sum -= c[DIAG_DOWN_LEFT_PRED];
3913         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3914         sum -= c[VERT_LEFT_PRED];
3915         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3916         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Release the decoder's dynamically allocated working buffers (intra
// prediction row, block-info array, coefficient block storage).
// av_freep also NULLs the pointers, so this is safe to call repeatedly.
3920 static void free_buffers(VP9Context *s)
3922     av_freep(&s->intra_pred_data[0]);
3923     av_freep(&s->b_base);
3924     av_freep(&s->block_base);
// Codec close callback: unreference and free the three internal frames
// (current / mvpair / segmap) and all 8 reference slots, both the active
// refs and the staged next_refs.
3927 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3929     VP9Context *s = ctx->priv_data;
3932     for (i = 0; i < 3; i++) {
3933         if (s->s.frames[i].tf.f->buf[0])
3934             vp9_unref_frame(ctx, &s->s.frames[i]);
3935         av_frame_free(&s->s.frames[i].tf.f);
3937     for (i = 0; i < 8; i++) {
3938         if (s->s.refs[i].f->buf[0])
3939             ff_thread_release_buffer(ctx, &s->s.refs[i]);
3940         av_frame_free(&s->s.refs[i].f);
3941         if (s->next_refs[i].f->buf[0])
3942             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3943         av_frame_free(&s->next_refs[i].f);
// Top-level per-packet decode: parse the frame header, manage the internal
// frame pool and reference rotation, then run the tile/superblock decode
// loop (optionally two passes for frame-threading), the loopfilter, and
// finally publish the frame. "show existing frame" packets short-circuit to
// returning a previously decoded reference.
// NOTE(review): this listing has elided lines; error-handling gotos, some
// declarations and several closing braces are not visible.
3953 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3954 int *got_frame, AVPacket *pkt)
3956     const uint8_t *data = pkt->data;
3957     int size = pkt->size;
3958     VP9Context *s = ctx->priv_data;
3959     int res, tile_row, tile_col, i, ref, row, col;
// Keep the previous segmentation map when segmentation is off or not updated.
3960     int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3961                             (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3962     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3966     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3968     } else if (res == 0) {
// res == 0: "show existing frame" — output reference `ref` directly.
3969         if (!s->s.refs[ref].f->buf[0]) {
3970             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3971             return AVERROR_INVALIDDATA;
3973         if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3975         ((AVFrame *)frame)->pkt_pts = pkt->pts;
3976         ((AVFrame *)frame)->pkt_dts = pkt->dts;
// Re-stage the current refs as next_refs so state stays consistent.
3977         for (i = 0; i < 8; i++) {
3978             if (s->next_refs[i].f->buf[0])
3979                 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3980             if (s->s.refs[i].f->buf[0] &&
3981                 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
// Rotate internal frames: previous CUR becomes SEGMAP (unless retained)
// and MVPAIR source for the new frame, then allocate the new CUR frame.
3990     if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
3991         if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
3992             vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
3993         if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
3994             (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
3997     if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
3998         vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
3999     if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4000         (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4002     if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4003         vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4004     if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4006     f = s->s.frames[CUR_FRAME].tf.f;
4007     f->key_frame = s->s.h.keyframe;
4008     f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4009     ls_y = f->linesize[0];
4010     ls_uv =f->linesize[1];
// Drop a retained segmap if the frame size changed between frames.
4012     if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4013         (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
4014          s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4015         vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
// Stage next_refs per the refresh mask: refreshed slots point at the new
// frame, others carry over the existing reference.
4019     for (i = 0; i < 8; i++) {
4020         if (s->next_refs[i].f->buf[0])
4021             ff_thread_release_buffer(ctx, &s->next_refs[i]);
4022         if (s->s.h.refreshrefmask & (1 << i)) {
4023             res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4024         } else if (s->s.refs[i].f->buf[0]) {
4025             res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
// Hardware-accelerated path: hand the whole packet to the hwaccel.
4032         res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4035         res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4038         res = ctx->hwaccel->end_frame(ctx);
4044     // main tile decode loop
4045     bytesperpixel = s->bytesperpixel;
// Reset the "above" contexts for the whole frame width.
4046     memset(s->above_partition_ctx, 0, s->cols);
4047     memset(s->above_skip_ctx, 0, s->cols);
4048     if (s->s.h.keyframe || s->s.h.intraonly) {
4049         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4051         memset(s->above_mode_ctx, NEARESTMV, s->cols);
4053     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4054     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4055     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4056     memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass decode only with frame threading when this frame updates the
// probability context non-parallel (entropy state must be finalized early).
4057     s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4058         ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4059     if ((res = update_block_buffers(ctx)) < 0) {
4060         av_log(ctx, AV_LOG_ERROR,
4061                "Failed to allocate block buffers\n");
// Parallel-mode context refresh happens up front (forward update only);
// frame threading can then be unblocked immediately.
4064     if (s->s.h.refreshctx && s->s.h.parallelmode) {
4067         for (i = 0; i < 4; i++) {
4068             for (j = 0; j < 2; j++)
4069                 for (k = 0; k < 2; k++)
4070                     for (l = 0; l < 6; l++)
4071                         for (m = 0; m < 6; m++)
4072                             memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4073                                    s->prob.coef[i][j][k][l][m], 3);
4074             if (s->s.h.txfmmode == i)
4077         s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4078         ff_thread_finish_setup(ctx);
4079     } else if (!s->s.h.refreshctx) {
4080         ff_thread_finish_setup(ctx);
// Reset coefficient/EOB write pointers for this pass.
4086         s->block = s->block_base;
4087         s->uvblock[0] = s->uvblock_base[0];
4088         s->uvblock[1] = s->uvblock_base[1];
4089         s->eob = s->eob_base;
4090         s->uveob[0] = s->uveob_base[0];
4091         s->uveob[1] = s->uveob_base[1];
// Set up one range decoder per tile column; tile sizes are 32-bit BE
// prefixes except for the last tile, which takes the remaining bytes.
4093         for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4094             set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4095                             tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4097                 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4100                     if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4101                         tile_row == s->s.h.tiling.tile_rows - 1) {
4104                         tile_size = AV_RB32(data);
4108                     if (tile_size > size) {
4109                         ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4110                         return AVERROR_INVALIDDATA;
4112                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4113                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4114                         ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4115                         return AVERROR_INVALIDDATA;
// Decode superblock rows (64 pixels tall) across all tile columns.
4116             for (row = s->tile_row_start; row < s->tile_row_end;
4123                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4124                 struct VP9Filter *lflvl_ptr = s->lflvl;
4125                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4127                 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4128                     set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4129                                     tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
// Reset "left" contexts at the start of each tile column.
4132                         memset(s->left_partition_ctx, 0, 8);
4133                         memset(s->left_skip_ctx, 0, 8);
4134                         if (s->s.h.keyframe || s->s.h.intraonly) {
4135                             memset(s->left_mode_ctx, DC_PRED, 16);
4137                             memset(s->left_mode_ctx, NEARESTMV, 8);
4139                         memset(s->left_y_nnz_ctx, 0, 16);
4140                         memset(s->left_uv_nnz_ctx, 0, 32);
4141                         memset(s->left_segpred_ctx, 0, 8);
// Swap in this tile's range decoder state.
4143                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4146                     for (col = s->tile_col_start;
4147                          col < s->tile_col_end;
4148                          col += 8, yoff2 += 64 * bytesperpixel,
4149                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4150                         // FIXME integrate with lf code (i.e. zero after each
4151                         // use, similar to invtxfm coefficients, or similar)
4153                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// Pass 2 replays stored block structure; pass 0/1 parses the bitstream.
4157                             decode_sb_mem(ctx, row, col, lflvl_ptr,
4158                                           yoff2, uvoff2, BL_64X64);
4160                             decode_sb(ctx, row, col, lflvl_ptr,
4161                                       yoff2, uvoff2, BL_64X64);
// Save the range decoder state back for the next superblock row.
4165                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4173                 // backup pre-loopfilter reconstruction data for intra
4174                 // prediction of next row of sb64s
4175                 if (row + 8 < s->rows) {
4176                     memcpy(s->intra_pred_data[0],
4177                            f->data[0] + yoff + 63 * ls_y,
4178                            8 * s->cols * bytesperpixel);
4179                     memcpy(s->intra_pred_data[1],
4180                            f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4181                            8 * s->cols * bytesperpixel >> s->ss_h);
4182                     memcpy(s->intra_pred_data[2],
4183                            f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4184                            8 * s->cols * bytesperpixel >> s->ss_h);
4187                 // loopfilter one row
4188                 if (s->s.h.filter.level) {
4191                     lflvl_ptr = s->lflvl;
4192                     for (col = 0; col < s->cols;
4193                          col += 8, yoff2 += 64 * bytesperpixel,
4194                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4195                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4199                 // FIXME maybe we can make this more finegrained by running the
4200                 // loopfilter per-block instead of after each sbrow
4201                 // In fact that would also make intra pred left preparation easier?
4202                 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
// After pass 1 of a two-pass decode, adapt probs and unblock other threads.
4206         if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4208             ff_thread_finish_setup(ctx);
4210     } while (s->pass++ == 1);
4211     ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
// Commit the staged reference rotation into the active reference slots.
4215     for (i = 0; i < 8; i++) {
4216         if (s->s.refs[i].f->buf[0])
4217             ff_thread_release_buffer(ctx, &s->s.refs[i]);
4218         if (s->next_refs[i].f->buf[0] &&
4219             (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
// Output the frame unless the bitstream marked it invisible.
4223     if (!s->s.h.invisible) {
4224         if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
// Flush callback: drop all internal frames and reference buffers so decoding
// can restart cleanly (e.g. after a seek).
4232 static void vp9_decode_flush(AVCodecContext *ctx)
4234     VP9Context *s = ctx->priv_data;
4237     for (i = 0; i < 3; i++)
4238         vp9_unref_frame(ctx, &s->s.frames[i]);
4239     for (i = 0; i < 8; i++)
4240         ff_thread_release_buffer(ctx, &s->s.refs[i]);
// Allocate the AVFrame shells for the 3 internal frames and the 8 reference
// slots (both active refs and staged next_refs). On any allocation failure,
// tears everything down via vp9_decode_free and returns AVERROR(ENOMEM).
4243 static int init_frames(AVCodecContext *ctx)
4245     VP9Context *s = ctx->priv_data;
4248     for (i = 0; i < 3; i++) {
4249         s->s.frames[i].tf.f = av_frame_alloc();
4250         if (!s->s.frames[i].tf.f) {
4251             vp9_decode_free(ctx);
4252             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4253             return AVERROR(ENOMEM);
4256     for (i = 0; i < 8; i++) {
4257         s->s.refs[i].f = av_frame_alloc();
4258         s->next_refs[i].f = av_frame_alloc();
4259         if (!s->s.refs[i].f || !s->next_refs[i].f) {
4260             vp9_decode_free(ctx);
4261             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4262             return AVERROR(ENOMEM);
4269 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4271 VP9Context *s = ctx->priv_data;
4273 ctx->internal->allocate_progress = 1;
4275 s->s.h.filter.sharpness = -1;
4277 return init_frames(ctx);
// Frame-threading: initialize a per-thread decoder copy. Each worker only
// needs its own AVFrame containers; the remaining state is pulled in later
// through vp9_decode_update_thread_context().
4281 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4283 return init_frames(avctx);
// Frame-threading sync point: copy decode state from the source thread's
// context (ssrc) into this thread's context (s) so it can start on the
// next frame.
4286 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4289 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4291 // detect size changes in other threads
4292 if (s->intra_pred_data[0] &&
4293 (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
4294 s->rows != ssrc->rows || s->bpp != ssrc->bpp || s->pix_fmt != ssrc->pix_fmt)) {
// Re-reference the source thread's internal frames, dropping our stale ones.
4298 for (i = 0; i < 3; i++) {
4299 if (s->s.frames[i].tf.f->buf[0])
4300 vp9_unref_frame(dst, &s->s.frames[i]);
4301 if (ssrc->s.frames[i].tf.f->buf[0]) {
4302 if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
// Adopt the source thread's post-decode reference set (next_refs) as this
// thread's current dpb, releasing whatever we held before.
4306 for (i = 0; i < 8; i++) {
4307 if (s->s.refs[i].f->buf[0])
4308 ff_thread_release_buffer(dst, &s->s.refs[i]);
4309 if (ssrc->next_refs[i].f->buf[0]) {
4310 if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
// Copy the scalar header state the next frame's parsing depends on
// (frame flags, chroma subsampling, segmentation config, pixel layout).
4315 s->s.h.invisible = ssrc->s.h.invisible;
4316 s->s.h.keyframe = ssrc->s.h.keyframe;
4317 s->s.h.intraonly = ssrc->s.h.intraonly;
4318 s->ss_v = ssrc->ss_v;
4319 s->ss_h = ssrc->ss_h;
4320 s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4321 s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4322 s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4323 s->bytesperpixel = ssrc->bytesperpixel;
4325 s->bpp_index = ssrc->bpp_index;
4326 s->pix_fmt = ssrc->pix_fmt;
// Probability contexts, loopfilter deltas and segmentation features are
// carried over wholesale.
4327 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4328 memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4329 memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4330 sizeof(s->s.h.segmentation.feat));
// Profile descriptors exported through AVCodec.profiles; the
// FF_PROFILE_UNKNOWN entry is the list terminator.
4336 static const AVProfile profiles[] = {
4337 { FF_PROFILE_VP9_0, "Profile 0" },
4338 { FF_PROFILE_VP9_1, "Profile 1" },
4339 { FF_PROFILE_VP9_2, "Profile 2" },
4340 { FF_PROFILE_VP9_3, "Profile 3" },
4341 { FF_PROFILE_UNKNOWN },
4344 AVCodec ff_vp9_decoder = {
4346 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4347 .type = AVMEDIA_TYPE_VIDEO,
4348 .id = AV_CODEC_ID_VP9,
4349 .priv_data_size = sizeof(VP9Context),
4350 .init = vp9_decode_init,
4351 .close = vp9_decode_free,
4352 .decode = vp9_decode_frame,
4353 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4354 .flush = vp9_decode_flush,
4355 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4356 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4357 .profiles = NULL_IF_CONFIG_SMALL(profiles),