/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
33 #include "libavutil/avassert.h"
34 #include "libavutil/pixdesc.h"
36 #define VP9_SYNCCODE 0x498342
40 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
41 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
44 typedef struct VP9Block {
45 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
46 enum FilterMode filter;
47 VP56mv mv[4 /* b_idx */][2 /* ref */];
49 enum TxfmMode tx, uvtx;
51 enum BlockPartition bp;
54 typedef struct VP9Context {
65 int row, row7, col, col7;
67 ptrdiff_t y_stride, uv_stride;
70 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
71 uint8_t last_keyframe;
72 enum AVPixelFormat pix_fmt, last_fmt;
73 ThreadFrame next_refs[8];
77 uint8_t mblim_lut[64];
79 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
80 unsigned sb_cols, sb_rows, rows, cols;
83 uint8_t coef[4][2][2][6][6][3];
87 uint8_t coef[4][2][2][6][6][11];
90 unsigned y_mode[4][10];
91 unsigned uv_mode[10][10];
92 unsigned filter[4][3];
93 unsigned mv_mode[7][4];
96 unsigned single_ref[5][2][2];
97 unsigned comp_ref[5][2];
102 unsigned mv_joint[4];
105 unsigned classes[11];
107 unsigned bits[10][2];
108 unsigned class0_fp[2][4];
110 unsigned class0_hp[2];
113 unsigned partition[4][4][4];
114 unsigned coef[4][2][2][6][6][3];
115 unsigned eob[4][2][2][6][6][2];
118 // contextual (left/above) cache
119 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
120 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
121 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
122 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
123 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
124 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
125 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
126 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
127 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
128 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
129 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
130 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
131 uint8_t *above_partition_ctx;
132 uint8_t *above_mode_ctx;
133 // FIXME maybe merge some of the below in a flags field?
134 uint8_t *above_y_nnz_ctx;
135 uint8_t *above_uv_nnz_ctx[2];
136 uint8_t *above_skip_ctx; // 1bit
137 uint8_t *above_txfm_ctx; // 2bit
138 uint8_t *above_segpred_ctx; // 1bit
139 uint8_t *above_intra_ctx; // 1bit
140 uint8_t *above_comp_ctx; // 1bit
141 uint8_t *above_ref_ctx; // 2bit
142 uint8_t *above_filter_ctx;
143 VP56mv (*above_mv_ctx)[2];
146 uint8_t *intra_pred_data[3];
147 struct VP9Filter *lflvl;
148 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
150 // block reconstruction intermediates
151 int block_alloc_using_2pass;
152 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
153 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
154 struct { int x, y; } min_mv, max_mv;
155 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
156 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
157 uint16_t mvscale[3][2];
158 uint8_t mvstep[3][2];
161 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
163 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
164 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
166 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
167 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
171 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
173 ff_thread_release_buffer(ctx, &f->tf);
174 av_buffer_unref(&f->extradata);
175 av_buffer_unref(&f->hwaccel_priv_buf);
176 f->segmentation_map = NULL;
177 f->hwaccel_picture_private = NULL;
180 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
182 VP9Context *s = ctx->priv_data;
185 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
187 sz = 64 * s->sb_cols * s->sb_rows;
188 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
192 f->segmentation_map = f->extradata->data;
193 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
196 const AVHWAccel *hwaccel = ctx->hwaccel;
197 av_assert0(!f->hwaccel_picture_private);
198 if (hwaccel->frame_priv_data_size) {
199 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
200 if (!f->hwaccel_priv_buf)
202 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
209 vp9_unref_frame(ctx, f);
210 return AVERROR(ENOMEM);
213 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
217 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
219 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
223 dst->segmentation_map = src->segmentation_map;
225 dst->uses_2pass = src->uses_2pass;
227 if (src->hwaccel_picture_private) {
228 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
229 if (!dst->hwaccel_priv_buf)
231 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
237 vp9_unref_frame(ctx, dst);
238 return AVERROR(ENOMEM);
241 static int update_size(AVCodecContext *ctx, int w, int h)
243 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
244 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
245 VP9Context *s = ctx->priv_data;
247 int bytesperpixel = s->bytesperpixel, res;
249 av_assert0(w > 0 && h > 0);
251 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && s->pix_fmt == s->last_fmt)
254 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
257 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
258 #if CONFIG_VP9_DXVA2_HWACCEL
259 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
261 #if CONFIG_VP9_D3D11VA_HWACCEL
262 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
264 #if CONFIG_VP9_VAAPI_HWACCEL
265 *fmtp++ = AV_PIX_FMT_VAAPI;
269 *fmtp++ = s->pix_fmt;
270 *fmtp = AV_PIX_FMT_NONE;
272 res = ff_thread_get_format(ctx, pix_fmts);
277 s->last_fmt = s->pix_fmt;
278 s->sb_cols = (w + 63) >> 6;
279 s->sb_rows = (h + 63) >> 6;
280 s->cols = (w + 7) >> 3;
281 s->rows = (h + 7) >> 3;
283 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
284 av_freep(&s->intra_pred_data[0]);
285 // FIXME we slightly over-allocate here for subsampled chroma, but a little
286 // bit of padding shouldn't affect performance...
287 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
288 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
290 return AVERROR(ENOMEM);
291 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
292 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
293 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
294 assign(s->above_y_nnz_ctx, uint8_t *, 16);
295 assign(s->above_mode_ctx, uint8_t *, 16);
296 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
297 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
298 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
299 assign(s->above_partition_ctx, uint8_t *, 8);
300 assign(s->above_skip_ctx, uint8_t *, 8);
301 assign(s->above_txfm_ctx, uint8_t *, 8);
302 assign(s->above_segpred_ctx, uint8_t *, 8);
303 assign(s->above_intra_ctx, uint8_t *, 8);
304 assign(s->above_comp_ctx, uint8_t *, 8);
305 assign(s->above_ref_ctx, uint8_t *, 8);
306 assign(s->above_filter_ctx, uint8_t *, 8);
307 assign(s->lflvl, struct VP9Filter *, 1);
310 // these will be re-allocated a little later
311 av_freep(&s->b_base);
312 av_freep(&s->block_base);
314 if (s->bpp != s->last_bpp) {
315 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
316 ff_videodsp_init(&s->vdsp, s->bpp);
317 s->last_bpp = s->bpp;
323 static int update_block_buffers(AVCodecContext *ctx)
325 VP9Context *s = ctx->priv_data;
326 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
328 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
332 av_free(s->block_base);
333 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
334 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
335 if (s->s.frames[CUR_FRAME].uses_2pass) {
336 int sbs = s->sb_cols * s->sb_rows;
338 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
339 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
340 16 * 16 + 2 * chroma_eobs) * sbs);
341 if (!s->b_base || !s->block_base)
342 return AVERROR(ENOMEM);
343 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
344 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
345 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
346 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
347 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
349 s->b_base = av_malloc(sizeof(VP9Block));
350 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
351 16 * 16 + 2 * chroma_eobs);
352 if (!s->b_base || !s->block_base)
353 return AVERROR(ENOMEM);
354 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
355 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
356 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
357 s->uveob_base[0] = s->eob_base + 16 * 16;
358 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
360 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
365 // for some reason the sign bit is at the end, not the start, of a bit sequence
366 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
368 int v = get_bits(gb, n);
369 return get_bits1(gb) ? -v : v;
372 static av_always_inline int inv_recenter_nonneg(int v, int m)
374 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
377 // differential forward probability updates
378 static int update_prob(VP56RangeCoder *c, int p)
380 static const int inv_map_table[255] = {
381 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
382 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
383 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
384 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
385 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
386 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
387 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
388 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
389 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
390 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
391 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
392 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
393 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
394 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
395 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
396 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
397 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
398 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
403 /* This code is trying to do a differential probability update. For a
404 * current probability A in the range [1, 255], the difference to a new
405 * probability of any value can be expressed differentially as 1-A,255-A
406 * where some part of this (absolute range) exists both in positive as
407 * well as the negative part, whereas another part only exists in one
408 * half. We're trying to code this shared part differentially, i.e.
409 * times two where the value of the lowest bit specifies the sign, and
410 * the single part is then coded on top of this. This absolute difference
411 * then again has a value of [0,254], but a bigger value in this range
412 * indicates that we're further away from the original value A, so we
413 * can code this as a VLC code, since higher values are increasingly
414 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
415 * updates vs. the 'fine, exact' updates further down the range, which
416 * adds one extra dimension to this differential update model. */
418 if (!vp8_rac_get(c)) {
419 d = vp8_rac_get_uint(c, 4) + 0;
420 } else if (!vp8_rac_get(c)) {
421 d = vp8_rac_get_uint(c, 4) + 16;
422 } else if (!vp8_rac_get(c)) {
423 d = vp8_rac_get_uint(c, 5) + 32;
425 d = vp8_rac_get_uint(c, 7);
427 d = (d << 1) - 65 + vp8_rac_get(c);
429 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
432 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
433 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
436 static int read_colorspace_details(AVCodecContext *ctx)
438 static const enum AVColorSpace colorspaces[8] = {
439 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
440 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
442 VP9Context *s = ctx->priv_data;
443 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
446 s->bpp = 8 + bits * 2;
447 s->bytesperpixel = (7 + s->bpp) >> 3;
448 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
449 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
450 static const enum AVPixelFormat pix_fmt_rgb[3] = {
451 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
453 s->ss_h = s->ss_v = 0;
454 ctx->color_range = AVCOL_RANGE_JPEG;
455 s->pix_fmt = pix_fmt_rgb[bits];
456 if (ctx->profile & 1) {
457 if (get_bits1(&s->gb)) {
458 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
459 return AVERROR_INVALIDDATA;
462 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
464 return AVERROR_INVALIDDATA;
467 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
468 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
469 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
470 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
471 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
472 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
473 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
475 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
476 if (ctx->profile & 1) {
477 s->ss_h = get_bits1(&s->gb);
478 s->ss_v = get_bits1(&s->gb);
479 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
480 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
481 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
483 return AVERROR_INVALIDDATA;
484 } else if (get_bits1(&s->gb)) {
485 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
487 return AVERROR_INVALIDDATA;
490 s->ss_h = s->ss_v = 1;
491 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
498 static int decode_frame_header(AVCodecContext *ctx,
499 const uint8_t *data, int size, int *ref)
501 VP9Context *s = ctx->priv_data;
502 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
504 const uint8_t *data2;
507 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
508 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
511 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
512 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
513 return AVERROR_INVALIDDATA;
515 ctx->profile = get_bits1(&s->gb);
516 ctx->profile |= get_bits1(&s->gb) << 1;
517 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
518 if (ctx->profile > 3) {
519 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
520 return AVERROR_INVALIDDATA;
522 s->s.h.profile = ctx->profile;
523 if (get_bits1(&s->gb)) {
524 *ref = get_bits(&s->gb, 3);
527 s->last_keyframe = s->s.h.keyframe;
528 s->s.h.keyframe = !get_bits1(&s->gb);
529 last_invisible = s->s.h.invisible;
530 s->s.h.invisible = !get_bits1(&s->gb);
531 s->s.h.errorres = get_bits1(&s->gb);
532 s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
533 if (s->s.h.keyframe) {
534 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
535 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
536 return AVERROR_INVALIDDATA;
538 if ((res = read_colorspace_details(ctx)) < 0)
540 // for profile 1, here follows the subsampling bits
541 s->s.h.refreshrefmask = 0xff;
542 w = get_bits(&s->gb, 16) + 1;
543 h = get_bits(&s->gb, 16) + 1;
544 if (get_bits1(&s->gb)) // display size
545 skip_bits(&s->gb, 32);
547 s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
548 s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
549 if (s->s.h.intraonly) {
550 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
551 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
552 return AVERROR_INVALIDDATA;
554 if (ctx->profile >= 1) {
555 if ((res = read_colorspace_details(ctx)) < 0)
558 s->ss_h = s->ss_v = 1;
561 s->bytesperpixel = 1;
562 s->pix_fmt = AV_PIX_FMT_YUV420P;
563 ctx->colorspace = AVCOL_SPC_BT470BG;
564 ctx->color_range = AVCOL_RANGE_JPEG;
566 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
567 w = get_bits(&s->gb, 16) + 1;
568 h = get_bits(&s->gb, 16) + 1;
569 if (get_bits1(&s->gb)) // display size
570 skip_bits(&s->gb, 32);
572 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
573 s->s.h.refidx[0] = get_bits(&s->gb, 3);
574 s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
575 s->s.h.refidx[1] = get_bits(&s->gb, 3);
576 s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
577 s->s.h.refidx[2] = get_bits(&s->gb, 3);
578 s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
579 if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
580 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
581 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
582 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
583 return AVERROR_INVALIDDATA;
585 if (get_bits1(&s->gb)) {
586 w = s->s.refs[s->s.h.refidx[0]].f->width;
587 h = s->s.refs[s->s.h.refidx[0]].f->height;
588 } else if (get_bits1(&s->gb)) {
589 w = s->s.refs[s->s.h.refidx[1]].f->width;
590 h = s->s.refs[s->s.h.refidx[1]].f->height;
591 } else if (get_bits1(&s->gb)) {
592 w = s->s.refs[s->s.h.refidx[2]].f->width;
593 h = s->s.refs[s->s.h.refidx[2]].f->height;
595 w = get_bits(&s->gb, 16) + 1;
596 h = get_bits(&s->gb, 16) + 1;
598 // Note that in this code, "CUR_FRAME" is actually before we
599 // have formally allocated a frame, and thus actually represents
601 s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
602 s->s.frames[CUR_FRAME].tf.f->height == h;
603 if (get_bits1(&s->gb)) // display size
604 skip_bits(&s->gb, 32);
605 s->s.h.highprecisionmvs = get_bits1(&s->gb);
606 s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
608 s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
609 s->s.h.signbias[0] != s->s.h.signbias[2];
610 if (s->s.h.allowcompinter) {
611 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
612 s->s.h.fixcompref = 2;
613 s->s.h.varcompref[0] = 0;
614 s->s.h.varcompref[1] = 1;
615 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
616 s->s.h.fixcompref = 1;
617 s->s.h.varcompref[0] = 0;
618 s->s.h.varcompref[1] = 2;
620 s->s.h.fixcompref = 0;
621 s->s.h.varcompref[0] = 1;
622 s->s.h.varcompref[1] = 2;
627 s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
628 s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
629 s->s.h.framectxid = c = get_bits(&s->gb, 2);
631 /* loopfilter header data */
632 if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
633 // reset loopfilter defaults
634 s->s.h.lf_delta.ref[0] = 1;
635 s->s.h.lf_delta.ref[1] = 0;
636 s->s.h.lf_delta.ref[2] = -1;
637 s->s.h.lf_delta.ref[3] = -1;
638 s->s.h.lf_delta.mode[0] = 0;
639 s->s.h.lf_delta.mode[1] = 0;
640 memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
642 s->s.h.filter.level = get_bits(&s->gb, 6);
643 sharp = get_bits(&s->gb, 3);
644 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
645 // the old cache values since they are still valid
646 if (s->s.h.filter.sharpness != sharp)
647 memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
648 s->s.h.filter.sharpness = sharp;
649 if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
650 if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
651 for (i = 0; i < 4; i++)
652 if (get_bits1(&s->gb))
653 s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
654 for (i = 0; i < 2; i++)
655 if (get_bits1(&s->gb))
656 s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
660 /* quantization header data */
661 s->s.h.yac_qi = get_bits(&s->gb, 8);
662 s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
663 s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
664 s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
665 s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
666 s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
668 ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
670 /* segmentation header info */
671 if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
672 if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
673 for (i = 0; i < 7; i++)
674 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
675 get_bits(&s->gb, 8) : 255;
676 if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
677 for (i = 0; i < 3; i++)
678 s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
679 get_bits(&s->gb, 8) : 255;
683 if (get_bits1(&s->gb)) {
684 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
685 for (i = 0; i < 8; i++) {
686 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
687 s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
688 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
689 s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
690 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
691 s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
692 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
697 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
698 for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
699 int qyac, qydc, quvac, quvdc, lflvl, sh;
701 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
702 if (s->s.h.segmentation.absolute_vals)
703 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
705 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
707 qyac = s->s.h.yac_qi;
709 qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
710 quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
711 quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
712 qyac = av_clip_uintp2(qyac, 8);
714 s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
715 s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
716 s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
717 s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
719 sh = s->s.h.filter.level >= 32;
720 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
721 if (s->s.h.segmentation.absolute_vals)
722 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
724 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
726 lflvl = s->s.h.filter.level;
728 if (s->s.h.lf_delta.enabled) {
729 s->s.h.segmentation.feat[i].lflvl[0][0] =
730 s->s.h.segmentation.feat[i].lflvl[0][1] =
731 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] << sh), 6);
732 for (j = 1; j < 4; j++) {
733 s->s.h.segmentation.feat[i].lflvl[j][0] =
734 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
735 s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
736 s->s.h.segmentation.feat[i].lflvl[j][1] =
737 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
738 s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
741 memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
742 sizeof(s->s.h.segmentation.feat[i].lflvl));
747 if ((res = update_size(ctx, w, h)) < 0) {
748 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
752 for (s->s.h.tiling.log2_tile_cols = 0;
753 s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
754 s->s.h.tiling.log2_tile_cols++) ;
755 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
756 max = FFMAX(0, max - 1);
757 while (max > s->s.h.tiling.log2_tile_cols) {
758 if (get_bits1(&s->gb))
759 s->s.h.tiling.log2_tile_cols++;
763 s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
764 s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
765 if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
766 s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
767 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
768 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
770 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
771 return AVERROR(ENOMEM);
775 /* check reference frames */
776 if (!s->s.h.keyframe && !s->s.h.intraonly) {
777 for (i = 0; i < 3; i++) {
778 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
779 int refw = ref->width, refh = ref->height;
781 if (ref->format != ctx->pix_fmt) {
782 av_log(ctx, AV_LOG_ERROR,
783 "Ref pixfmt (%s) did not match current frame (%s)",
784 av_get_pix_fmt_name(ref->format),
785 av_get_pix_fmt_name(ctx->pix_fmt));
786 return AVERROR_INVALIDDATA;
787 } else if (refw == w && refh == h) {
788 s->mvscale[i][0] = s->mvscale[i][1] = 0;
790 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
791 av_log(ctx, AV_LOG_ERROR,
792 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
794 return AVERROR_INVALIDDATA;
796 s->mvscale[i][0] = (refw << 14) / w;
797 s->mvscale[i][1] = (refh << 14) / h;
798 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
799 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
804 if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
805 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
806 s->prob_ctx[3].p = vp9_default_probs;
807 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
808 sizeof(vp9_default_coef_probs));
809 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
810 sizeof(vp9_default_coef_probs));
811 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
812 sizeof(vp9_default_coef_probs));
813 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
814 sizeof(vp9_default_coef_probs));
815 } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
816 s->prob_ctx[c].p = vp9_default_probs;
817 memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
818 sizeof(vp9_default_coef_probs));
821 // next 16 bits is size of the rest of the header (arith-coded)
822 s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
823 s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
825 data2 = align_get_bits(&s->gb);
826 if (size2 > size - (data2 - data)) {
827 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
828 return AVERROR_INVALIDDATA;
830 ff_vp56_init_range_decoder(&s->c, data2, size2);
831 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
832 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
833 return AVERROR_INVALIDDATA;
836 if (s->s.h.keyframe || s->s.h.intraonly) {
837 memset(s->counts.coef, 0, sizeof(s->counts.coef));
838 memset(s->counts.eob, 0, sizeof(s->counts.eob));
840 memset(&s->counts, 0, sizeof(s->counts));
842 // FIXME is it faster to not copy here, but do it down in the fw updates
843 // as explicit copies if the fw update is missing (and skip the copy upon
845 s->prob.p = s->prob_ctx[c].p;
848 if (s->s.h.lossless) {
849 s->s.h.txfmmode = TX_4X4;
851 s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
852 if (s->s.h.txfmmode == 3)
853 s->s.h.txfmmode += vp8_rac_get(&s->c);
855 if (s->s.h.txfmmode == TX_SWITCHABLE) {
856 for (i = 0; i < 2; i++)
857 if (vp56_rac_get_prob_branchy(&s->c, 252))
858 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
859 for (i = 0; i < 2; i++)
860 for (j = 0; j < 2; j++)
861 if (vp56_rac_get_prob_branchy(&s->c, 252))
862 s->prob.p.tx16p[i][j] =
863 update_prob(&s->c, s->prob.p.tx16p[i][j]);
864 for (i = 0; i < 2; i++)
865 for (j = 0; j < 3; j++)
866 if (vp56_rac_get_prob_branchy(&s->c, 252))
867 s->prob.p.tx32p[i][j] =
868 update_prob(&s->c, s->prob.p.tx32p[i][j]);
873 for (i = 0; i < 4; i++) {
874 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
875 if (vp8_rac_get(&s->c)) {
876 for (j = 0; j < 2; j++)
877 for (k = 0; k < 2; k++)
878 for (l = 0; l < 6; l++)
879 for (m = 0; m < 6; m++) {
880 uint8_t *p = s->prob.coef[i][j][k][l][m];
881 uint8_t *r = ref[j][k][l][m];
882 if (m >= 3 && l == 0) // dc only has 3 pt
884 for (n = 0; n < 3; n++) {
885 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
886 p[n] = update_prob(&s->c, r[n]);
894 for (j = 0; j < 2; j++)
895 for (k = 0; k < 2; k++)
896 for (l = 0; l < 6; l++)
897 for (m = 0; m < 6; m++) {
898 uint8_t *p = s->prob.coef[i][j][k][l][m];
899 uint8_t *r = ref[j][k][l][m];
900 if (m > 3 && l == 0) // dc only has 3 pt
906 if (s->s.h.txfmmode == i)
911 for (i = 0; i < 3; i++)
912 if (vp56_rac_get_prob_branchy(&s->c, 252))
913 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
914 if (!s->s.h.keyframe && !s->s.h.intraonly) {
915 for (i = 0; i < 7; i++)
916 for (j = 0; j < 3; j++)
917 if (vp56_rac_get_prob_branchy(&s->c, 252))
918 s->prob.p.mv_mode[i][j] =
919 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
921 if (s->s.h.filtermode == FILTER_SWITCHABLE)
922 for (i = 0; i < 4; i++)
923 for (j = 0; j < 2; j++)
924 if (vp56_rac_get_prob_branchy(&s->c, 252))
925 s->prob.p.filter[i][j] =
926 update_prob(&s->c, s->prob.p.filter[i][j]);
928 for (i = 0; i < 4; i++)
929 if (vp56_rac_get_prob_branchy(&s->c, 252))
930 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
932 if (s->s.h.allowcompinter) {
933 s->s.h.comppredmode = vp8_rac_get(&s->c);
934 if (s->s.h.comppredmode)
935 s->s.h.comppredmode += vp8_rac_get(&s->c);
936 if (s->s.h.comppredmode == PRED_SWITCHABLE)
937 for (i = 0; i < 5; i++)
938 if (vp56_rac_get_prob_branchy(&s->c, 252))
940 update_prob(&s->c, s->prob.p.comp[i]);
942 s->s.h.comppredmode = PRED_SINGLEREF;
945 if (s->s.h.comppredmode != PRED_COMPREF) {
946 for (i = 0; i < 5; i++) {
947 if (vp56_rac_get_prob_branchy(&s->c, 252))
948 s->prob.p.single_ref[i][0] =
949 update_prob(&s->c, s->prob.p.single_ref[i][0]);
950 if (vp56_rac_get_prob_branchy(&s->c, 252))
951 s->prob.p.single_ref[i][1] =
952 update_prob(&s->c, s->prob.p.single_ref[i][1]);
956 if (s->s.h.comppredmode != PRED_SINGLEREF) {
957 for (i = 0; i < 5; i++)
958 if (vp56_rac_get_prob_branchy(&s->c, 252))
959 s->prob.p.comp_ref[i] =
960 update_prob(&s->c, s->prob.p.comp_ref[i]);
963 for (i = 0; i < 4; i++)
964 for (j = 0; j < 9; j++)
965 if (vp56_rac_get_prob_branchy(&s->c, 252))
966 s->prob.p.y_mode[i][j] =
967 update_prob(&s->c, s->prob.p.y_mode[i][j]);
969 for (i = 0; i < 4; i++)
970 for (j = 0; j < 4; j++)
971 for (k = 0; k < 3; k++)
972 if (vp56_rac_get_prob_branchy(&s->c, 252))
973 s->prob.p.partition[3 - i][j][k] =
974 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
976 // mv fields don't use the update_prob subexp model for some reason
977 for (i = 0; i < 3; i++)
978 if (vp56_rac_get_prob_branchy(&s->c, 252))
979 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
981 for (i = 0; i < 2; i++) {
982 if (vp56_rac_get_prob_branchy(&s->c, 252))
983 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
985 for (j = 0; j < 10; j++)
986 if (vp56_rac_get_prob_branchy(&s->c, 252))
987 s->prob.p.mv_comp[i].classes[j] =
988 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
990 if (vp56_rac_get_prob_branchy(&s->c, 252))
991 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
993 for (j = 0; j < 10; j++)
994 if (vp56_rac_get_prob_branchy(&s->c, 252))
995 s->prob.p.mv_comp[i].bits[j] =
996 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
999 for (i = 0; i < 2; i++) {
1000 for (j = 0; j < 2; j++)
1001 for (k = 0; k < 3; k++)
1002 if (vp56_rac_get_prob_branchy(&s->c, 252))
1003 s->prob.p.mv_comp[i].class0_fp[j][k] =
1004 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1006 for (j = 0; j < 3; j++)
1007 if (vp56_rac_get_prob_branchy(&s->c, 252))
1008 s->prob.p.mv_comp[i].fp[j] =
1009 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1012 if (s->s.h.highprecisionmvs) {
1013 for (i = 0; i < 2; i++) {
1014 if (vp56_rac_get_prob_branchy(&s->c, 252))
1015 s->prob.p.mv_comp[i].class0_hp =
1016 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1018 if (vp56_rac_get_prob_branchy(&s->c, 252))
1019 s->prob.p.mv_comp[i].hp =
1020 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1025 return (data2 - data) + size2;
/* Clamp the motion vector *src component-wise into the legal range for the
 * current block (s->min_mv .. s->max_mv) and store the result in *dst.
 * src and dst may alias (see the clamp_mv(pmv, pmv, s) fallback in
 * find_ref_mvs()). */
1028 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1031     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1032     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/* Derive the predicted motion vector *pmv for reference frame 'ref' of the
 * current block, following the VP9 reference-MV search order:
 *   1) (sub-8x8 only) MVs of earlier sub-blocks in this block,
 *   2) spatial neighbours using the same reference frame,
 *   3) the co-located MV of the previous frame (same reference),
 *   4) neighbours / co-located MV with a *different* reference, sign-flipped
 *      when the two references' sign biases disagree.
 * z selects which distinct candidate to return (0 = nearest, 1 = near);
 * idx is the reference slot (0/1); sb is the sub-8x8 sub-block index, or a
 * block-level request when the caller passes -1/0 (see fill_mv()). */
1035 static void find_ref_mvs(VP9Context *s,
1036                          VP56mv *pmv, int ref, int z, int idx, int sb)
     /* Eight {col,row} neighbour offsets to probe, per block size. */
1038     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1039         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
1040                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
1041         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
1042                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
1043         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
1044                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
1045         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
1046                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1047         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
1048                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1049         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
1050                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
1051         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
1052                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1053         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
1054                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
1055         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
1056                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
1057         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1058                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1059         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1060                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1061         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1062                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1063         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1064                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1067     int row = s->row, col = s->col, row7 = s->row7;
1068     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
     /* 0x8000,0x8000 can never be a clamped MV, so it marks "no candidate". */
1069 #define INVALID_MV 0x80008000U
1070     uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1073 #define RETURN_DIRECT_MV(mv) \
1075         uint32_t m = AV_RN32A(&mv); \
1079         } else if (mem == INVALID_MV) { \
1081         } else if (m != mem) { \
     /* For sub-8x8 sub-blocks the MVs of previously decoded sub-blocks in
      * the same block are the very first candidates. */
1088     if (sb == 2 || sb == 1) {
1089         RETURN_DIRECT_MV(b->mv[0][z]);
1090     } else if (sb == 3) {
1091         RETURN_DIRECT_MV(b->mv[2][z]);
1092         RETURN_DIRECT_MV(b->mv[1][z]);
1093         RETURN_DIRECT_MV(b->mv[0][z]);
1096 #define RETURN_MV(mv) \
1101             av_assert2(idx == 1); \
1102             av_assert2(mem != INVALID_MV); \
1103             if (mem_sub8x8 == INVALID_MV) { \
1104                 clamp_mv(&tmp, &mv, s); \
1105                 m = AV_RN32A(&tmp); \
1110                 mem_sub8x8 = AV_RN32A(&mv); \
1111             } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1112                 clamp_mv(&tmp, &mv, s); \
1113                 m = AV_RN32A(&tmp); \
1117                     /* BUG I'm pretty sure this isn't the intention */ \
1123             uint32_t m = AV_RN32A(&mv); \
1125                 clamp_mv(pmv, &mv, s); \
1127             } else if (mem == INVALID_MV) { \
1129             } else if (m != mem) { \
1130                 clamp_mv(pmv, &mv, s); \
     /* Above neighbour (row - 1): use its cached MV context if it used the
      * same reference frame in either slot. */
1137         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1138         if (mv->ref[0] == ref) {
1139             RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1140         } else if (mv->ref[1] == ref) {
1141             RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
     /* Left neighbour (col - 1), only within the current tile. */
1144         if (col > s->tile_col_start) {
1145             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1146             if (mv->ref[0] == ref) {
1147                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1148             } else if (mv->ref[1] == ref) {
1149                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1157     // previously coded MVs in this neighbourhood, using same reference frame
1158     for (; i < 8; i++) {
1159         int c = p[i][0] + col, r = p[i][1] + row;
1161         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1162             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1164             if (mv->ref[0] == ref) {
1165                 RETURN_MV(mv->mv[0]);
1166             } else if (mv->ref[1] == ref) {
1167                 RETURN_MV(mv->mv[1]);
1172     // MV at this position in previous frame, using same reference frame
1173     if (s->s.h.use_last_frame_mvs) {
1174         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
         /* Only wait on the reference frame thread when it is not being
          * decoded in 2-pass mode (then progress reporting differs). */
1176         if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1177             ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1178         if (mv->ref[0] == ref) {
1179             RETURN_MV(mv->mv[0]);
1180         } else if (mv->ref[1] == ref) {
1181             RETURN_MV(mv->mv[1]);
     /* Like RETURN_MV(), but negates the MV first when 'scale' is set, i.e.
      * when the candidate's reference sign bias differs from ours. */
1185 #define RETURN_SCALE_MV(mv, scale) \
1188         VP56mv mv_temp = { -mv.x, -mv.y }; \
1189         RETURN_MV(mv_temp); \
1195     // previously coded MVs in this neighbourhood, using different reference frame
1196     for (i = 0; i < 8; i++) {
1197         int c = p[i][0] + col, r = p[i][1] + row;
1199         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1200             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1202             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1203                 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1205             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1206                 // BUG - libvpx has this condition regardless of whether
1207                 // we used the first ref MV and pre-scaling
1208                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1209                 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1214     // MV at this position in previous frame, using different reference frame
1215     if (s->s.h.use_last_frame_mvs) {
1216         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1218         // no need to await_progress, because we already did that above
1219         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1220             RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1222         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1223             // BUG - libvpx has this condition regardless of whether
1224             // we used the first ref MV and pre-scaling
1225             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1226             RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
     /* No (further) candidate found: clamp whatever *pmv currently holds. */
1231     clamp_mv(pmv, pmv, s);
1234 #undef RETURN_SCALE_MV
/* Decode one motion-vector component (idx: 0 = y/row, 1 = x/col) from the
 * range coder: sign, magnitude class, then class-dependent integer bits,
 * fractional (1/8-pel) bits and, when hp is set, the high-precision bit.
 * All decoded symbols are also tallied into s->counts.mv_comp[] for the
 * backward adaptation pass. Returns the signed component value. */
1237 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1239     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1240     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1241                                 s->prob.p.mv_comp[idx].classes);
1243     s->counts.mv_comp[idx].sign[sign]++;
1244     s->counts.mv_comp[idx].classes[c]++;
         /* Large classes: c integer magnitude bits, MSB-first. */
1248         for (n = 0, m = 0; m < c; m++) {
1249             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1251             s->counts.mv_comp[idx].bits[m][bit]++;
         /* Fractional (1/8-pel) part, decoded as a 4-ary tree symbol. */
1254         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1256         s->counts.mv_comp[idx].fp[bit]++;
1258             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1259             s->counts.mv_comp[idx].hp[bit]++;
1263             // bug in libvpx - we count for bw entropy purposes even if the
1265             s->counts.mv_comp[idx].hp[1]++;
         /* Class 0: one integer bit, its own fp tree and hp bit. */
1269         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1270         s->counts.mv_comp[idx].class0[n]++;
1271         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1272                                s->prob.p.mv_comp[idx].class0_fp[n]);
1273         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1274         n = (n << 3) | (bit << 1);
1276             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1277             s->counts.mv_comp[idx].class0_hp[bit]++;
1281             // bug in libvpx - we count for bw entropy purposes even if the
1283             s->counts.mv_comp[idx].class0_hp[1]++;
     /* Magnitude is stored biased by 1; apply the decoded sign. */
1287     return sign ? -(n + 1) : (n + 1);
/* Fill mv[0] (and mv[1] for compound prediction) for the current block/
 * sub-block: predict via find_ref_mvs(), then for NEWMV add the coded MV
 * residual (joint + per-component). sb is the sub-8x8 sub-block index.
 * ZEROMV short-circuits everything. */
1290 static void fill_mv(VP9Context *s,
1291                     VP56mv *mv, int mode, int sb)
1295     if (mode == ZEROMV) {
         // FIXME cache this value and reuse for other subblocks
1301         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1302                      mode == NEWMV ? -1 : sb);
1303         // FIXME maybe move this code into find_ref_mvs()
         /* hp is only honoured when the predicted MV is small (< 8 pel). */
1304         if ((mode == NEWMV || sb == -1) &&
1305             !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1319         if (mode == NEWMV) {
1320             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1321                                               s->prob.p.mv_joint);
1323             s->counts.mv_joint[j]++;
1324             if (j >= MV_JOINT_V)
1325                 mv[0].y += read_mv_component(s, 0, hp);
1327                 mv[0].x += read_mv_component(s, 1, hp);
         /* Second (compound) reference, same procedure as above. */
1331         // FIXME cache this value and reuse for other subblocks
1332         find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1333                      mode == NEWMV ? -1 : sb);
1334         if ((mode == NEWMV || sb == -1) &&
1335             !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1349         if (mode == NEWMV) {
1350             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1351                                               s->prob.p.mv_joint);
1353             s->counts.mv_joint[j]++;
1354             if (j >= MV_JOINT_V)
1355                 mv[1].y += read_mv_component(s, 0, hp);
1357                 mv[1].x += read_mv_component(s, 1, hp);
/* Fill a w-by-h 2D region of context bytes (stride 'stride') with value v,
 * using the widest aligned store available for the given width. */
1363 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1364                                        ptrdiff_t stride, int v)
1374         int v16 = v * 0x0101;
1382         uint32_t v32 = v * 0x01010101;
1391         uint64_t v64 = v * 0x0101010101010101ULL;
         /* Widths > 8 without native 64-bit stores: two 32-bit writes. */
1397         uint32_t v32 = v * 0x01010101;
1400             AV_WN32A(ptr + 4, v32);
/* Decode the per-block mode information for the current block (s->row/
 * s->col): segment id, skip flag, transform size, intra/inter decision,
 * reference frame(s), prediction mode(s), interpolation filter and motion
 * vectors; then propagate everything into the left/above context arrays
 * and the per-frame MV/reference storage used by later blocks and frames. */
1409 static void decode_mode(AVCodecContext *ctx)
     /* Partition context bitmasks written into left/above ctx per size. */
1411     static const uint8_t left_ctx[N_BS_SIZES] = {
1412         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1414     static const uint8_t above_ctx[N_BS_SIZES] = {
1415         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
     /* Largest allowed transform size per block size. */
1417     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1418         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1419         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1421     VP9Context *s = ctx->priv_data;
1423     int row = s->row, col = s->col, row7 = s->row7;
1424     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1425     int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1426     int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
     /* Neighbour availability: above row exists / left col is in this tile. */
1427     int have_a = row > 0, have_l = col > s->tile_col_start;
1428     int vref, filter_id;
     /* --- segment id ------------------------------------------------- */
1430     if (!s->s.h.segmentation.enabled) {
1432     } else if (s->s.h.keyframe || s->s.h.intraonly) {
1433         b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1434                     vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1435     } else if (!s->s.h.segmentation.update_map ||
1436                (s->s.h.segmentation.temporal &&
1437                 vp56_rac_get_prob_branchy(&s->c,
1438                     s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1439                                                   s->left_segpred_ctx[row7]]))) {
         /* Temporally predicted: take the minimum seg id of the co-located
          * area in the reference segmentation map. */
1440         if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1442             uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
1444             if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1445                 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1446             for (y = 0; y < h4; y++) {
1447                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1448                 for (x = 0; x < w4; x++)
1449                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1451             av_assert1(pred < 8);
1457         memset(&s->above_segpred_ctx[col], 1, w4);
1458         memset(&s->left_segpred_ctx[row7], 1, h4);
1460         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1461                                      s->s.h.segmentation.prob);
1463         memset(&s->above_segpred_ctx[col], 0, w4);
1464         memset(&s->left_segpred_ctx[row7], 0, h4);
1466     if (s->s.h.segmentation.enabled &&
1467         (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1468         setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1469                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
     /* --- skip flag --------------------------------------------------- */
1472     b->skip = s->s.h.segmentation.enabled &&
1473         s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1475         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1476         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1477         s->counts.skip[c][b->skip]++;
     /* --- intra/inter decision ---------------------------------------- */
1480     if (s->s.h.keyframe || s->s.h.intraonly) {
1482     } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1483         b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1487         if (have_a && have_l) {
1488             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1491             c = have_a ? 2 * s->above_intra_ctx[col] :
1492                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1494         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1495         s->counts.intra[c][bit]++;
     /* --- transform size ---------------------------------------------- */
1499     if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
             /* Context from neighbours' tx sizes (skipped blocks count as
              * max_tx). */
1503             c = (s->above_skip_ctx[col] ? max_tx :
1504                  s->above_txfm_ctx[col]) +
1505                 (s->left_skip_ctx[row7] ? max_tx :
1506                  s->left_txfm_ctx[row7]) > max_tx;
1508             c = s->above_skip_ctx[col] ? 1 :
1509                 (s->above_txfm_ctx[col] * 2 > max_tx);
1511         } else if (have_l) {
1512             c = s->left_skip_ctx[row7] ? 1 :
1513                 (s->left_txfm_ctx[row7] * 2 > max_tx);
             /* Unary-coded tx size, capped by max_tx for this block size. */
1519             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1521                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1523                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1525             s->counts.tx32p[c][b->tx]++;
1528             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1530                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1531             s->counts.tx16p[c][b->tx]++;
1534             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1535             s->counts.tx8p[c][b->tx]++;
1542         b->tx = FFMIN(max_tx, s->s.h.txfmmode);
     /* --- prediction modes -------------------------------------------- */
1545     if (s->s.h.keyframe || s->s.h.intraonly) {
         /* Keyframe intra: modes conditioned on above/left mode contexts. */
1546         uint8_t *a = &s->above_mode_ctx[col * 2];
1547         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1550         if (b->bs > BS_8x8) {
1551             // FIXME the memory storage intermediates here aren't really
1552             // necessary, they're just there to make the code slightly
1554             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1555                                                  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1556             if (b->bs != BS_8x4) {
1557                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1558                                               vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1559                 l[0] = a[1] = b->mode[1];
1561                 l[0] = a[1] = b->mode[1] = b->mode[0];
1563             if (b->bs != BS_4x8) {
1564                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1565                                                      vp9_default_kf_ymode_probs[a[0]][l[1]]);
1566                 if (b->bs != BS_8x4) {
1567                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1568                                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1569                     l[1] = a[1] = b->mode[3];
1571                     l[1] = a[1] = b->mode[3] = b->mode[2];
1574                 b->mode[2] = b->mode[0];
1575                 l[1] = a[1] = b->mode[3] = b->mode[1];
1578             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1579                                           vp9_default_kf_ymode_probs[*a][*l]);
1580             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1581             // FIXME this can probably be optimized
1582             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1583             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1585         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1586                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1587     } else if (b->intra) {
         /* Inter-frame intra block: modes use adapted y_mode probabilities. */
1589         if (b->bs > BS_8x8) {
1590             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1591                                           s->prob.p.y_mode[0]);
1592             s->counts.y_mode[0][b->mode[0]]++;
1593             if (b->bs != BS_8x4) {
1594                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1595                                               s->prob.p.y_mode[0]);
1596                 s->counts.y_mode[0][b->mode[1]]++;
1598                 b->mode[1] = b->mode[0];
1600             if (b->bs != BS_4x8) {
1601                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1602                                               s->prob.p.y_mode[0]);
1603                 s->counts.y_mode[0][b->mode[2]]++;
1604                 if (b->bs != BS_8x4) {
1605                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1606                                                   s->prob.p.y_mode[0]);
1607                     s->counts.y_mode[0][b->mode[3]]++;
1609                     b->mode[3] = b->mode[2];
1612                 b->mode[2] = b->mode[0];
1613                 b->mode[3] = b->mode[1];
1616             static const uint8_t size_group[10] = {
1617                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1619             int sz = size_group[b->bs];
1621             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1622                                           s->prob.p.y_mode[sz]);
1623             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1624             s->counts.y_mode[sz][b->mode[3]]++;
1626         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1627                                      s->prob.p.uv_mode[b->mode[3]]);
1628         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
         /* --- inter block: references, mode, filter, MVs --------------- */
1630         static const uint8_t inter_mode_ctx_lut[14][14] = {
1631             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1632             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1633             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1634             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1635             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1636             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1637             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1638             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1639             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1640             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1641             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1642             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1643             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1644             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1647         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1648             av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1650             b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1652             // read comp_pred flag
1653             if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1654                 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1658                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1661                     if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1663                     } else if (s->above_comp_ctx[col]) {
1664                         c = 2 + (s->left_intra_ctx[row7] ||
1665                                  s->left_ref_ctx[row7] == s->s.h.fixcompref);
1666                     } else if (s->left_comp_ctx[row7]) {
1667                         c = 2 + (s->above_intra_ctx[col] ||
1668                                  s->above_ref_ctx[col] == s->s.h.fixcompref);
1670                         c = (!s->above_intra_ctx[col] &&
1671                              s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1672                             (!s->left_intra_ctx[row7] &&
1673                              s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1676                     c = s->above_comp_ctx[col] ? 3 :
1677                     (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1679                 } else if (have_l) {
1680                     c = s->left_comp_ctx[row7] ? 3 :
1681                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1685                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1686                 s->counts.comp[c][b->comp]++;
1689             // read actual references
1690             // FIXME probably cache a few variables here to prevent repetitive
1691             // memory accesses below
1692             if (b->comp) /* two references */ {
1693                 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1695                 b->ref[fix_idx] = s->s.h.fixcompref;
1696                 // FIXME can this codeblob be replaced by some sort of LUT?
1699                     if (s->above_intra_ctx[col]) {
1700                         if (s->left_intra_ctx[row7]) {
1703                             c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1705                     } else if (s->left_intra_ctx[row7]) {
1706                         c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1708                         int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1710                         if (refl == refa && refa == s->s.h.varcompref[1]) {
1712                         } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1713                             if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1714                                 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1717                                 c = (refa == refl) ? 3 : 1;
1719                         } else if (!s->left_comp_ctx[row7]) {
1720                             if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1723                                 c = (refl == s->s.h.varcompref[1] &&
1724                                      refa != s->s.h.varcompref[1]) ? 2 : 4;
1726                         } else if (!s->above_comp_ctx[col]) {
1727                             if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1730                                 c = (refa == s->s.h.varcompref[1] &&
1731                                      refl != s->s.h.varcompref[1]) ? 2 : 4;
1734                             c = (refl == refa) ? 4 : 2;
1738                     if (s->above_intra_ctx[col]) {
1740                     } else if (s->above_comp_ctx[col]) {
1741                         c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1743                         c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1746                 } else if (have_l) {
1747                     if (s->left_intra_ctx[row7]) {
1749                     } else if (s->left_comp_ctx[row7]) {
1750                         c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1752                         c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1757                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1758                 b->ref[var_idx] = s->s.h.varcompref[bit];
1759                 s->counts.comp_ref[c][bit]++;
1760             } else /* single reference */ {
1763                 if (have_a && !s->above_intra_ctx[col]) {
1764                     if (have_l && !s->left_intra_ctx[row7]) {
1765                         if (s->left_comp_ctx[row7]) {
1766                             if (s->above_comp_ctx[col]) {
1767                                 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1768                                          !s->above_ref_ctx[col]);
1770                                 c = (3 * !s->above_ref_ctx[col]) +
1771                                     (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1773                         } else if (s->above_comp_ctx[col]) {
1774                             c = (3 * !s->left_ref_ctx[row7]) +
1775                                 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1777                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1779                     } else if (s->above_intra_ctx[col]) {
1781                     } else if (s->above_comp_ctx[col]) {
1782                         c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1784                         c = 4 * (!s->above_ref_ctx[col]);
1786                 } else if (have_l && !s->left_intra_ctx[row7]) {
1787                     if (s->left_intra_ctx[row7]) {
1789                     } else if (s->left_comp_ctx[row7]) {
1790                         c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1792                         c = 4 * (!s->left_ref_ctx[row7]);
1797                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1798                 s->counts.single_ref[c][0][bit]++;
1802                     // FIXME can this codeblob be replaced by some sort of LUT?
1805                     if (s->left_intra_ctx[row7]) {
1806                         if (s->above_intra_ctx[col]) {
1808                         } else if (s->above_comp_ctx[col]) {
1809                             c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1810                                          s->above_ref_ctx[col] == 1);
1811                         } else if (!s->above_ref_ctx[col]) {
1814                             c = 4 * (s->above_ref_ctx[col] == 1);
1816                     } else if (s->above_intra_ctx[col]) {
1817                         if (s->left_intra_ctx[row7]) {
1819                         } else if (s->left_comp_ctx[row7]) {
1820                             c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1821                                          s->left_ref_ctx[row7] == 1);
1822                         } else if (!s->left_ref_ctx[row7]) {
1825                             c = 4 * (s->left_ref_ctx[row7] == 1);
1827                     } else if (s->above_comp_ctx[col]) {
1828                         if (s->left_comp_ctx[row7]) {
1829                             if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1830                                 c = 3 * (s->s.h.fixcompref == 1 ||
1831                                          s->left_ref_ctx[row7] == 1);
1835                         } else if (!s->left_ref_ctx[row7]) {
1836                             c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1837                                          s->above_ref_ctx[col] == 1);
1839                             c = 3 * (s->left_ref_ctx[row7] == 1) +
1840                             (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1842                     } else if (s->left_comp_ctx[row7]) {
1843                         if (!s->above_ref_ctx[col]) {
1844                             c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1845                                          s->left_ref_ctx[row7] == 1);
1847                             c = 3 * (s->above_ref_ctx[col] == 1) +
1848                             (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1850                     } else if (!s->above_ref_ctx[col]) {
1851                         if (!s->left_ref_ctx[row7]) {
1854                             c = 4 * (s->left_ref_ctx[row7] == 1);
1856                     } else if (!s->left_ref_ctx[row7]) {
1857                         c = 4 * (s->above_ref_ctx[col] == 1);
1859                         c = 2 * (s->left_ref_ctx[row7] == 1) +
1860                         2 * (s->above_ref_ctx[col] == 1);
1863                     if (s->above_intra_ctx[col] ||
1864                         (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1866                     } else if (s->above_comp_ctx[col]) {
1867                         c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1869                         c = 4 * (s->above_ref_ctx[col] == 1);
1872                 } else if (have_l) {
1873                     if (s->left_intra_ctx[row7] ||
1874                         (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1876                     } else if (s->left_comp_ctx[row7]) {
1877                         c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1879                         c = 4 * (s->left_ref_ctx[row7] == 1);
1884                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1885                     s->counts.single_ref[c][1][bit]++;
1886                     b->ref[0] = 1 + bit;
         /* --- inter mode and interpolation filter ---------------------- */
1891         if (b->bs <= BS_8x8) {
1892             if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1893                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1895                 static const uint8_t off[10] = {
1896                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1899                 // FIXME this needs to use the LUT tables from find_ref_mvs
1900                 // because not all are -1,0/0,-1
1901                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1902                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1904                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1905                                               s->prob.p.mv_mode[c]);
1906                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1907                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1911         if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1914             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1915                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1916                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1917                         s->left_filter_ctx[row7] : 3;
1919                     c = s->above_filter_ctx[col];
1921             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1922                 c = s->left_filter_ctx[row7];
1927             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1928                                          s->prob.p.filter[c]);
1929             s->counts.filter[c][filter_id]++;
1930             b->filter = vp9_filter_lut[filter_id];
1932             b->filter = s->s.h.filtermode;
         /* --- motion vectors (per sub-block for sub-8x8 sizes) --------- */
1935         if (b->bs > BS_8x8) {
1936             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1938             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1939                                           s->prob.p.mv_mode[c]);
1940             s->counts.mv_mode[c][b->mode[0] - 10]++;
1941             fill_mv(s, b->mv[0], b->mode[0], 0);
1943             if (b->bs != BS_8x4) {
1944                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1945                                               s->prob.p.mv_mode[c]);
1946                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1947                 fill_mv(s, b->mv[1], b->mode[1], 1);
1949                 b->mode[1] = b->mode[0];
1950                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1951                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1954             if (b->bs != BS_4x8) {
1955                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1956                                               s->prob.p.mv_mode[c]);
1957                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1958                 fill_mv(s, b->mv[2], b->mode[2], 2);
1960                 if (b->bs != BS_8x4) {
1961                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1962                                                   s->prob.p.mv_mode[c]);
1963                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1964                     fill_mv(s, b->mv[3], b->mode[3], 3);
1966                     b->mode[3] = b->mode[2];
1967                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1968                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1971                 b->mode[2] = b->mode[0];
1972                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1973                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1974                 b->mode[3] = b->mode[1];
1975                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1976                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1979             fill_mv(s, b->mv[0], b->mode[0], -1);
1980             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1981             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1982             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1983             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1984             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1985             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1988     vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
     /* --- splat decoded info into left/above context arrays ------------ */
1992 #define SPLAT_CTX(var, val, n) \
1994     case 1:  var = val;                                    break; \
1995     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
1996     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
1997     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
1999         uint64_t v64 = val * 0x0101010101010101ULL; \
2000         AV_WN64A(              &var,     v64); \
2001         AV_WN64A(&((uint8_t *) &var)[8], v64); \
2006 #define SPLAT_CTX(var, val, n) \
2008     case 1:  var = val;                         break; \
2009     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
2010     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
2012         uint32_t v32 = val * 0x01010101; \
2013         AV_WN32A(              &var,     v32); \
2014         AV_WN32A(&((uint8_t *) &var)[4], v32); \
2018         uint32_t v32 = val * 0x01010101; \
2019         AV_WN32A(              &var,      v32); \
2020         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
2021         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
2022         AV_WN32A(&((uint8_t *) &var)[12], v32); \
2028     switch (bwh_tab[1][b->bs][0]) {
2029 #define SET_CTXS(dir, off, n) \
2031         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
2032         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
2033         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2034         if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2035             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
2036             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
2037             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
2039                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2040                 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2041                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2046     case 1: SET_CTXS(above, col, 1); break;
2047     case 2: SET_CTXS(above, col, 2); break;
2048     case 4: SET_CTXS(above, col, 4); break;
2049     case 8: SET_CTXS(above, col, 8); break;
2051     switch (bwh_tab[1][b->bs][1]) {
2052     case 1: SET_CTXS(left, row7, 1); break;
2053     case 2: SET_CTXS(left, row7, 2); break;
2054     case 4: SET_CTXS(left, row7, 4); break;
2055     case 8: SET_CTXS(left, row7, 8); break;
     /* --- store MVs into the left/above MV context -------------------- */
2060     if (!s->s.h.keyframe && !s->s.h.intraonly) {
2061         if (b->bs > BS_8x8) {
2062             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2064             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2065             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2066             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2067             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2068             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2069             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2070             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2071             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2073             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2075             for (n = 0; n < w4 * 2; n++) {
2076                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2077                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2079             for (n = 0; n < h4 * 2; n++) {
2080                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2081                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
     /* --- store refs/MVs into the per-frame MV grid -------------------- */
2087     for (y = 0; y < h4; y++) {
2088         int x, o = (row + y) * s->sb_cols * 8 + col;
2089         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2092             for (x = 0; x < w4; x++) {
2096         } else if (b->comp) {
2097             for (x = 0; x < w4; x++) {
2098                 mv[x].ref[0] = b->ref[0];
2099                 mv[x].ref[1] = b->ref[1];
2100                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2101                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2104             for (x = 0; x < w4; x++) {
2105                 mv[x].ref[0] = b->ref[0];
2107                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2113 // FIXME merge cnt/eob arguments?
/* Decode the residual coefficients of one transform block from range coder
 * 'c' into 'coef'. Walks the scan order, maintaining a per-position
 * non-zero context (nnz, updated from the 'nb' neighbour table and the
 * 'cache' of decoded magnitudes) that selects the probability set p[band][nnz].
 * Token counts (cnt) and end-of-block counts (eob) feed backward adaptation.
 * is_tx32x32 selects the extra /2 dequant of 32x32 blocks; is8bitsperpixel /
 * bpp select coefficient storage width and high-bitdepth extension bits.
 * band_counts gives the number of scan positions per band; qmul holds the
 * DC ([0]) and AC ([1]) dequantizers. */
2114 static av_always_inline int
2115 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2116                         int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2117                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2118                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
2119                         const int16_t *band_counts, const int16_t *qmul)
2121     int i = 0, band = 0, band_left = band_counts[band];
2122     uint8_t *tp = p[0][nnz];
2123     uint8_t cache[1024];
         /* Main token loop: eob check, zero token, then magnitude tokens. */
2128         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2129         eob[band][nnz][val]++;
2134         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2135             cnt[band][nnz][0]++;
2137                 band_left = band_counts[++band];
2139             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2141             if (++i == n_coeffs)
2142                 break;  //invalid input; blocks should end with EOB
2147         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2148             cnt[band][nnz][1]++;
2152             // fill in p[3-10] (model fill) - only once per frame for each pos
2154                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2156             cnt[band][nnz][2]++;
2157             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2158                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2159                     cache[rc] = val = 2;
2161                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2164             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2166                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2167                     val = 5 + vp56_rac_get_prob(c, 159);
2169                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2170                     val += vp56_rac_get_prob(c, 145);
             /* cat 3-6: progressively wider extra-bit ranges with fixed probs. */
2174                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2175                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2176                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2177                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2178                         val +=       vp56_rac_get_prob(c, 140);
2180                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2181                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2182                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2183                         val +=       vp56_rac_get_prob(c, 135);
2185                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2186                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2187                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2188                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2189                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2190                     val +=       vp56_rac_get_prob(c, 130);
                 /* cat6: high-bitdepth streams carry extra top bits. */
2193                     if (!is8bitsperpixel) {
2195                             val += vp56_rac_get_prob(c, 255) << 17;
2196                         val += vp56_rac_get_prob(c, 255) << 16;
2198                         val += (vp56_rac_get_prob(c, 255) << 15);
2199                         val += (vp56_rac_get_prob(c, 255) << 14);
2201                     val += (vp56_rac_get_prob(c, 254) << 13);
2202                     val += (vp56_rac_get_prob(c, 254) << 12);
2203                     val += (vp56_rac_get_prob(c, 254) << 11);
2204                     val += (vp56_rac_get_prob(c, 252) << 10);
2205                     val += (vp56_rac_get_prob(c, 249) << 9);
2206                     val += (vp56_rac_get_prob(c, 243) << 8);
2207                     val += (vp56_rac_get_prob(c, 230) << 7);
2208                     val += (vp56_rac_get_prob(c, 196) << 6);
2209                     val += (vp56_rac_get_prob(c, 177) << 5);
2210                     val += (vp56_rac_get_prob(c, 153) << 4);
2211                     val += (vp56_rac_get_prob(c, 140) << 3);
2212                     val += (vp56_rac_get_prob(c, 133) << 2);
2213                     val += (vp56_rac_get_prob(c, 130) << 1);
2214                     val += vp56_rac_get_prob(c, 129);
2218 #define STORE_COEF(c, i, v) do { \
2219     if (is8bitsperpixel) { \
2222         AV_WN32A(&c[i * 2], v); \
2226             band_left = band_counts[++band];
         /* Sign bit, dequantize (32x32 blocks use half-scale), store. */
2228             STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2230             STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2231         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2233     } while (++i < n_coeffs);
/* Thin wrapper: decode a non-32x32 transform block at 8 bits per pixel. */
2238 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2239 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2240 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2241 const int16_t (*nb)[2], const int16_t *band_counts,
2242 const int16_t *qmul)
2244 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2245 nnz, scan, nb, band_counts, qmul);
/* Thin wrapper: decode a 32x32 transform block at 8 bits per pixel
 * (is_tx32x32=1, so coefficients are stored halved). */
2248 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2249 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2250 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2251 const int16_t (*nb)[2], const int16_t *band_counts,
2252 const int16_t *qmul)
2254 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2255 nnz, scan, nb, band_counts, qmul);
/* Thin wrapper: decode a non-32x32 transform block at >8 bits per pixel;
 * passes the context's actual bit depth (s->bpp). */
2258 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2259 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2260 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2261 const int16_t (*nb)[2], const int16_t *band_counts,
2262 const int16_t *qmul)
2264 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2265 nnz, scan, nb, band_counts, qmul);
/* Thin wrapper: decode a 32x32 transform block at >8 bits per pixel. */
2268 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2269 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2270 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2271 const int16_t (*nb)[2], const int16_t *band_counts,
2272 const int16_t *qmul)
2274 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2275 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all residual coefficients of the current block (luma then both
 * chroma planes), dispatching to the per-depth decode_coeffs_b* helpers
 * per transform sub-block and maintaining the above/left nonzero-context
 * arrays (a[] / l[]) used for entropy context selection.
 *
 * ctx             - codec context (VP9Context in priv_data)
 * is8bitsperpixel - compile-time depth switch for the always-inline body
 *
 * Returns via `total_coeff` whether any sub-block produced coefficients
 * (the return statement itself falls outside this excerpt).
 */
2278 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2280 VP9Context *s = ctx->priv_data;
2282 int row = s->row, col = s->col;
/* probability / adaptation-count / eob-count tables for luma ([0]) */
2283 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2284 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2285 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2286 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
/* clip the iteration area to the visible part of the frame */
2287 int end_x = FFMIN(2 * (s->cols - col), w4);
2288 int end_y = FFMIN(2 * (s->rows - row), h4);
2289 int n, pl, x, y, res;
2290 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2291 int tx = 4 * s->s.h.lossless + b->tx;
2292 const int16_t * const *yscans = vp9_scans[tx];
2293 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2294 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2295 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2296 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2297 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
/* coefficients per band, indexed by transform size (4x4..32x32) */
2298 static const int16_t band_counts[4][8] = {
2299 { 1, 2, 3, 4, 3, 16 - 13 },
2300 { 1, 2, 3, 4, 11, 64 - 21 },
2301 { 1, 2, 3, 4, 11, 256 - 21 },
2302 { 1, 2, 3, 4, 11, 1024 - 21 },
2304 const int16_t *y_band_counts = band_counts[b->tx];
2305 const int16_t *uv_band_counts = band_counts[b->uvtx];
2306 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2307 int total_coeff = 0;
/* MERGE/MERGE_CTX: collapse `step` neighbouring context bytes into one
 * boolean per transform-sized unit before decoding at that size. */
2309 #define MERGE(la, end, step, rd) \
2310 for (n = 0; n < end; n += step) \
2311 la[n] = !!rd(&la[n])
2312 #define MERGE_CTX(step, rd) \
2314 MERGE(l, end_y, step, rd); \
2315 MERGE(a, end_x, step, rd); \
/* Decode the luma plane, one transform block per (x,y) step, selecting
 * the 8bpp/16bpp helper and the scan table from the block's intra mode. */
2318 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2319 for (n = 0, y = 0; y < end_y; y += step) { \
2320 for (x = 0; x < end_x; x += step, n += step * step) { \
2321 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2322 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2323 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2324 c, e, p, a[x] + l[y], yscans[txtp], \
2325 ynbs[txtp], y_band_counts, qmul[0]); \
2326 a[x] = l[y] = !!res; \
2327 total_coeff |= !!res; \
2329 AV_WN16A(&s->eob[n], res); \
/* SPLAT/SPLAT_CTX: fan the single merged context value back out across
 * the `step` bytes it represents, with fast aligned-store paths. */
2336 #define SPLAT(la, end, step, cond) \
2338 for (n = 1; n < end; n += step) \
2339 la[n] = la[n - 1]; \
2340 } else if (step == 4) { \
2342 for (n = 0; n < end; n += step) \
2343 AV_WN32A(&la[n], la[n] * 0x01010101); \
2345 for (n = 0; n < end; n += step) \
2346 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2348 } else /* step == 8 */ { \
2350 if (HAVE_FAST_64BIT) { \
2351 for (n = 0; n < end; n += step) \
2352 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2354 for (n = 0; n < end; n += step) { \
2355 uint32_t v32 = la[n] * 0x01010101; \
2356 AV_WN32A(&la[n], v32); \
2357 AV_WN32A(&la[n + 4], v32); \
2361 for (n = 0; n < end; n += step) \
2362 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2365 #define SPLAT_CTX(step) \
2367 SPLAT(a, end_x, step, end_x == w4); \
2368 SPLAT(l, end_y, step, end_y == h4); \
/* luma: dispatch by transform size (TX_4X4..TX_32X32; switch arms are
 * partially elided in this excerpt) */
2374 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2377 MERGE_CTX(2, AV_RN16A);
2378 DECODE_Y_COEF_LOOP(2, 0,);
2382 MERGE_CTX(4, AV_RN32A);
2383 DECODE_Y_COEF_LOOP(4, 0,);
2387 MERGE_CTX(8, AV_RN64A);
2388 DECODE_Y_COEF_LOOP(8, 0, 32);
/* chroma variant: DCT_DCT scan only, writes into s->uvblock/uveob */
2393 #define DECODE_UV_COEF_LOOP(step, v) \
2394 for (n = 0, y = 0; y < end_y; y += step) { \
2395 for (x = 0; x < end_x; x += step, n += step * step) { \
2396 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2397 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2398 16 * step * step, c, e, p, a[x] + l[y], \
2399 uvscan, uvnb, uv_band_counts, qmul[1]); \
2400 a[x] = l[y] = !!res; \
2401 total_coeff |= !!res; \
2403 AV_WN16A(&s->uveob[pl][n], res); \
2405 s->uveob[pl][n] = res; \
/* switch tables over to the chroma ([1]) models */
2410 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2411 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2412 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2417 for (pl = 0; pl < 2; pl++) {
2418 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2419 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2422 DECODE_UV_COEF_LOOP(1,);
2425 MERGE_CTX(2, AV_RN16A);
2426 DECODE_UV_COEF_LOOP(2,);
2430 MERGE_CTX(4, AV_RN32A);
2431 DECODE_UV_COEF_LOOP(4,);
2435 MERGE_CTX(8, AV_RN64A);
2436 DECODE_UV_COEF_LOOP(8, 32);
/* 8bpp instantiation of the always-inline decode_coeffs() body. */
2445 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2447 return decode_coeffs(ctx, 1);
/* High-bit-depth instantiation of the always-inline decode_coeffs() body. */
2450 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2452 return decode_coeffs(ctx, 0);
/*
 * Prepare the top (*a) and left (l) edge-pixel arrays for one intra-predicted
 * transform block and remap the prediction mode when required neighbours are
 * unavailable (frame/tile edges).
 *
 * a/l          - output edge buffers; *a may be repointed at existing data
 *                when the needed top pixels are directly usable
 * dst_edge/dst_inner + strides - destinations in the frame buffer vs. the
 *                per-superblock working buffer (x==0/y==0 picks dst_edge)
 * col,x,row,y  - block position; w = block width in tx units
 * tx           - transform size (edge length is 4 << tx)
 * p            - plane index; ss_h/ss_v - chroma subsampling; bytesperpixel
 *
 * Returns the (possibly remapped) prediction mode.
 */
2455 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2456 uint8_t *dst_edge, ptrdiff_t stride_edge,
2457 uint8_t *dst_inner, ptrdiff_t stride_inner,
2458 uint8_t *l, int col, int x, int w,
2459 int row, int y, enum TxfmMode tx,
2460 int p, int ss_h, int ss_v, int bytesperpixel)
2462 int have_top = row > 0 || y > 0;
/* left pixels exist unless we sit on the tile's left column edge */
2463 int have_left = col > s->tile_col_start || x > 0;
2464 int have_right = x < w - 1;
/* mode remap table: when top/left neighbours are missing, substitute a
 * DC variant that does not read the missing edge */
2466 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2467 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2468 { DC_127_PRED, VERT_PRED } },
2469 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2470 { HOR_PRED, HOR_PRED } },
2471 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2472 { LEFT_DC_PRED, DC_PRED } },
2473 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2474 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2475 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2476 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2477 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2478 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2479 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2480 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2481 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2482 { DC_127_PRED, VERT_LEFT_PRED } },
2483 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2484 { HOR_UP_PRED, HOR_UP_PRED } },
2485 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2486 { HOR_PRED, TM_VP8_PRED } },
/* per-mode edge requirements after remapping */
2488 static const struct {
2489 uint8_t needs_left:1;
2490 uint8_t needs_top:1;
2491 uint8_t needs_topleft:1;
2492 uint8_t needs_topright:1;
2493 uint8_t invert_left:1;
2494 } edges[N_INTRA_PRED_MODES] = {
2495 [VERT_PRED] = { .needs_top = 1 },
2496 [HOR_PRED] = { .needs_left = 1 },
2497 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2498 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2499 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2500 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2501 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2502 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2503 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2504 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2505 [LEFT_DC_PRED] = { .needs_left = 1 },
2506 [TOP_DC_PRED] = { .needs_top = 1 },
2507 [DC_128_PRED] = { 0 },
2508 [DC_127_PRED] = { 0 },
2509 [DC_129_PRED] = { 0 }
2512 av_assert2(mode >= 0 && mode < 10);
2513 mode = mode_conv[mode][have_left][have_top];
2514 if (edges[mode].needs_top) {
2515 uint8_t *top, *topleft;
2516 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2517 int n_px_need_tr = 0;
2519 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2522 // if top of sb64-row, use s->intra_pred_data[] instead of
2523 // dst[-stride] for intra prediction (it contains pre- instead of
2524 // post-loopfilter data)
2526 top = !(row & 7) && !y ?
2527 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2528 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2530 topleft = !(row & 7) && !y ?
2531 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2532 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2533 &dst_inner[-stride_inner];
/* fast path: reuse the existing row in place when all needed pixels
 * (incl. topleft/topright) are present and contiguous; otherwise the
 * edge is copied/extended into *a below. */
2537 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2538 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2539 n_px_need + n_px_need_tr <= n_px_have) {
2543 if (n_px_need <= n_px_have) {
2544 memcpy(*a, top, n_px_need * bytesperpixel);
/* memset_bpp: replicate one pixel value, honouring 1- vs 2-byte pixels */
2546 #define memset_bpp(c, i1, v, i2, num) do { \
2547 if (bytesperpixel == 1) { \
2548 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2550 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2551 for (n = 0; n < (num); n++) { \
2552 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2556 memcpy(*a, top, n_px_have * bytesperpixel);
2557 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2560 #define memset_val(c, val, num) do { \
2561 if (bytesperpixel == 1) { \
2562 memset((c), (val), (num)); \
2565 for (n = 0; n < (num); n++) { \
2566 AV_WN16A(&(c)[n * 2], (val)); \
/* no usable top row: fill with the mid-grey - 1 constant (127 at 8bpp) */
2570 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2572 if (edges[mode].needs_topleft) {
2573 if (have_left && have_top) {
2574 #define assign_bpp(c, i1, v, i2) do { \
2575 if (bytesperpixel == 1) { \
2576 (c)[(i1)] = (v)[(i2)]; \
2578 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2581 assign_bpp(*a, -1, topleft, -1);
2583 #define assign_val(c, i, v) do { \
2584 if (bytesperpixel == 1) { \
2587 AV_WN16A(&(c)[(i) * 2], (v)); \
/* synthesize topleft: 129 if we have a top row, 127 otherwise (8bpp) */
2590 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2593 if (tx == TX_4X4 && edges[mode].needs_topright) {
2594 if (have_top && have_right &&
2595 n_px_need + n_px_need_tr <= n_px_have) {
2596 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2598 memset_bpp(*a, 4, *a, 3, 4);
2603 if (edges[mode].needs_left) {
2605 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2606 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2607 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
/* HOR_UP_PRED stores the left column top-to-bottom (inverted order) */
2609 if (edges[mode].invert_left) {
2610 if (n_px_need <= n_px_have) {
2611 for (i = 0; i < n_px_need; i++)
2612 assign_bpp(l, i, &dst[i * stride], -1);
2614 for (i = 0; i < n_px_have; i++)
2615 assign_bpp(l, i, &dst[i * stride], -1);
2616 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2619 if (n_px_need <= n_px_have) {
2620 for (i = 0; i < n_px_need; i++)
2621 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2623 for (i = 0; i < n_px_have; i++)
2624 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2625 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
/* no usable left column: fill with mid-grey + 1 (129 at 8bpp) */
2629 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Reconstruct the current intra-coded block: per transform sub-block,
 * build prediction edges (check_intra_mode), run the directional
 * predictor, then add the inverse-transformed residual. Luma first,
 * then both chroma planes.
 *
 * y_off/uv_off   - byte offsets of this block inside the frame planes
 * bytesperpixel  - 1 (8bpp) or 2 (high bit depth)
 */
2636 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2637 ptrdiff_t uv_off, int bytesperpixel)
2639 VP9Context *s = ctx->priv_data;
2641 int row = s->row, col = s->col;
2642 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2643 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2644 int end_x = FFMIN(2 * (s->cols - col), w4);
2645 int end_y = FFMIN(2 * (s->rows - row), h4);
2646 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2647 int uvstep1d = 1 << b->uvtx, p;
/* dst walks the working buffer, dst_r the real frame (used for edges) */
2648 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2649 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2650 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2652 for (n = 0, y = 0; y < end_y; y += step1d) {
2653 uint8_t *ptr = dst, *ptr_r = dst_r;
2654 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2655 ptr_r += 4 * step1d * bytesperpixel, n += step) {
/* sub-8x8 blocks carry one mode per 4x4 unit; else use mode[0] */
2656 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2658 uint8_t *a = &a_buf[32];
2659 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2660 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2662 mode = check_intra_mode(s, mode, &a, ptr_r,
2663 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2664 ptr, s->y_stride, l,
2665 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2666 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2668 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2669 s->block + 16 * n * bytesperpixel, eob);
2671 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2672 dst += 4 * step1d * s->y_stride;
/* chroma: same walk at uvtx granularity, DCT_DCT transform only */
2679 step = 1 << (b->uvtx * 2);
2680 for (p = 0; p < 2; p++) {
2681 dst = s->dst[1 + p];
2682 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2683 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2684 uint8_t *ptr = dst, *ptr_r = dst_r;
2685 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2686 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2687 int mode = b->uvmode;
2688 uint8_t *a = &a_buf[32];
2689 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2691 mode = check_intra_mode(s, mode, &a, ptr_r,
2692 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2693 ptr, s->uv_stride, l, col, x, w4, row, y,
2694 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2695 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2697 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2698 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2700 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2701 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp instantiation of the always-inline intra_recon() body. */
2706 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2708 intra_recon(ctx, y_off, uv_off, 1);
/* High-bit-depth (2 bytes/pixel) instantiation of intra_recon(). */
2711 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2713 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation when the reference frame has the same size as
 * the current frame (no scaling). Waits on the reference thread's decode
 * progress, handles out-of-picture reads via the edge-emulation buffer,
 * then runs the subpel MC function selected by the mv fraction bits.
 *
 * mc       - [x-subpel][y-subpel] function table
 * y,x      - block position in the reference, in pixels
 * bw,bh    - block width/height; w,h - frame dimensions
 */
2716 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2717 uint8_t *dst, ptrdiff_t dst_stride,
2718 const uint8_t *ref, ptrdiff_t ref_stride,
2719 ThreadFrame *ref_frame,
2720 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2721 int bw, int bh, int w, int h, int bytesperpixel)
2723 int mx = mv->x, my = mv->y, th;
2727 ref += y * ref_stride + x * bytesperpixel;
2730 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2731 // we use +7 because the last 7 pixels of each sbrow can be changed in
2732 // the longest loopfilter of the next sbrow
/* wait until the reference row needed (incl. filter taps) is decoded */
2733 th = (y + bh + 4 * !!my + 7) >> 6;
2734 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* 8-tap subpel filters read 3 pixels before and 4 after the block;
 * if that area leaves the picture, build a padded copy first */
2735 if (x < !!mx * 3 || y < !!my * 3 ||
2736 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2737 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2738 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2740 bw + !!mx * 7, bh + !!my * 7,
2741 x - !!mx * 3, y - !!my * 3, w, h);
2742 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2745 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma motion compensation (unscaled reference), both U and V planes in
 * one call. The mv is shifted up by the subsampling factors so that
 * fractional precision matches the chroma grid.
 */
2748 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2749 uint8_t *dst_u, uint8_t *dst_v,
2750 ptrdiff_t dst_stride,
2751 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2752 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2753 ThreadFrame *ref_frame,
2754 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2755 int bw, int bh, int w, int h, int bytesperpixel)
/* scale mv to chroma resolution when the plane is not subsampled */
2757 int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2761 ref_u += y * src_stride_u + x * bytesperpixel;
2762 ref_v += y * src_stride_v + x * bytesperpixel;
2765 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2766 // we use +7 because the last 7 pixels of each sbrow can be changed in
2767 // the longest loopfilter of the next sbrow
2768 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2769 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* same edge-emulation criteria as luma; U and V are padded separately
 * into the shared edge buffer (the second plane's read happens after the
 * first plane's MC has consumed the buffer) */
2770 if (x < !!mx * 3 || y < !!my * 3 ||
2771 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2772 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2773 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2775 bw + !!mx * 7, bh + !!my * 7,
2776 x - !!mx * 3, y - !!my * 3, w, h);
2777 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2778 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2780 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2781 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2783 bw + !!mx * 7, bh + !!my * 7,
2784 x - !!mx * 3, y - !!my * 3, w, h);
2785 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2786 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2788 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2789 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
/* Bind the generic mc_luma_dir/mc_chroma_dir names (used inside
 * vp9_mc_template.c) to the *unscaled* MC helpers, then instantiate the
 * inter-prediction template twice: once per bytes-per-pixel variant.
 * The extra px/py/pw/ph/i macro arguments are only needed by the scaled
 * variants below and are ignored here. */
2793 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2794 px, py, pw, ph, bw, bh, w, h, i) \
2795 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2796 mv, bw, bh, w, h, bytesperpixel)
2797 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2798 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2799 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2800 row, col, mv, bw, bh, w, h, bytesperpixel)
/* generates inter_pred_8bpp() */
2802 #define FN(x) x##_8bpp
2803 #define BYTES_PER_PIXEL 1
2804 #include "vp9_mc_template.c"
2806 #undef BYTES_PER_PIXEL
/* generates inter_pred_16bpp() */
2807 #define FN(x) x##_16bpp
2808 #define BYTES_PER_PIXEL 2
2809 #include "vp9_mc_template.c"
2811 #undef mc_chroma_dir
2813 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation against a reference of a *different* size
 * (spatial scalability / resized reference). Falls back to the unscaled
 * path when dimensions actually match; otherwise clips the mv to the
 * allowed prediction area, scales position+mv into reference coordinates
 * (14-bit fixed-point `scale`, 4-bit fractional `step`), and runs the
 * scaled MC function, with edge emulation as needed.
 */
2816 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2817 vp9_mc_func (*mc)[2],
2818 uint8_t *dst, ptrdiff_t dst_stride,
2819 const uint8_t *ref, ptrdiff_t ref_stride,
2820 ThreadFrame *ref_frame,
2821 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2822 int px, int py, int pw, int ph,
2823 int bw, int bh, int w, int h, int bytesperpixel,
2824 const uint16_t *scale, const uint8_t *step)
2826 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2827 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2828 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2829 y, x, in_mv, bw, bh, w, h, bytesperpixel);
/* 14-bit fixed-point scale of a mv/position component */
2831 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2833 int refbw_m1, refbh_m1;
/* clip mv so prediction stays within the legal area around the block */
2837 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2838 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2839 // BUG libvpx seems to scale the two components separately. This introduces
2840 // rounding errors but we have to reproduce them to be exactly compatible
2841 // with the output from libvpx...
2842 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2843 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2847 ref += y * ref_stride + x * bytesperpixel;
/* rightmost/bottom reference sample index touched by the scaled walk */
2850 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2851 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2852 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2853 // we use +7 because the last 7 pixels of each sbrow can be changed in
2854 // the longest loopfilter of the next sbrow
2855 th = (y + refbh_m1 + 4 + 7) >> 6;
2856 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* scaled path always assumes 8-tap reach (3 before / 4 after) */
2857 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2858 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2859 ref - 3 * ref_stride - 3 * bytesperpixel,
2861 refbw_m1 + 8, refbh_m1 + 8,
2862 x - 3, y - 3, w, h);
2863 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2866 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma motion compensation against a scaled reference, both planes.
 * Mirrors mc_luma_scaled(); the subsampled axes use a different clip/
 * scale formula to reproduce a known libvpx rounding bug (see the webm
 * issue links below) bit-exactly.
 */
2870 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2871 vp9_mc_func (*mc)[2],
2872 uint8_t *dst_u, uint8_t *dst_v,
2873 ptrdiff_t dst_stride,
2874 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2875 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2876 ThreadFrame *ref_frame,
2877 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2878 int px, int py, int pw, int ph,
2879 int bw, int bh, int w, int h, int bytesperpixel,
2880 const uint16_t *scale, const uint8_t *step)
2882 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2883 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2884 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2885 ref_v, src_stride_v, ref_frame,
2886 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2889 int refbw_m1, refbh_m1;
/* horizontal: subsampled branch reproduces libvpx's rounding quirk */
2894 // BUG https://code.google.com/p/webm/issues/detail?id=820
2895 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2896 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2898 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2899 mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
/* vertical: same structure as horizontal */
2902 // BUG https://code.google.com/p/webm/issues/detail?id=820
2903 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2904 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2906 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2907 my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2912 ref_u += y * src_stride_u + x * bytesperpixel;
2913 ref_v += y * src_stride_v + x * bytesperpixel;
2916 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2917 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2918 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2919 // we use +7 because the last 7 pixels of each sbrow can be changed in
2920 // the longest loopfilter of the next sbrow
2921 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2922 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* edge-emulate and MC each plane in turn (shared edge buffer) */
2923 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2924 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2925 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2927 refbw_m1 + 8, refbh_m1 + 8,
2928 x - 3, y - 3, w, h);
2929 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2930 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2932 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2933 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2935 refbw_m1 + 8, refbh_m1 + 8,
2936 x - 3, y - 3, w, h);
2937 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2938 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2940 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2941 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
/* Re-bind mc_luma_dir/mc_chroma_dir to the *scaled* MC helpers (which
 * additionally need the per-reference mvscale/mvstep tables) and
 * instantiate the inter-prediction template again for both depths. */
2946 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2947 px, py, pw, ph, bw, bh, w, h, i) \
2948 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2949 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2950 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2951 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2952 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2953 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2954 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2955 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
/* generates inter_pred_scaled_8bpp() */
2957 #define FN(x) x##_scaled_8bpp
2958 #define BYTES_PER_PIXEL 1
2959 #include "vp9_mc_template.c"
2961 #undef BYTES_PER_PIXEL
/* generates inter_pred_scaled_16bpp() */
2962 #define FN(x) x##_scaled_16bpp
2963 #define BYTES_PER_PIXEL 2
2964 #include "vp9_mc_template.c"
2966 #undef mc_chroma_dir
2968 #undef BYTES_PER_PIXEL
/*
 * Reconstruct the current inter-coded block: run motion-compensated
 * prediction (scaled path if any used reference has a non-unity mvscale),
 * then add the inverse-transformed residuals per transform sub-block.
 */
2971 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2973 VP9Context *s = ctx->priv_data;
2975 int row = s->row, col = s->col;
/* pick the scaled inter-pred instantiation when a reference needs it */
2977 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2978 if (bytesperpixel == 1) {
2979 inter_pred_scaled_8bpp(ctx);
2981 inter_pred_scaled_16bpp(ctx);
2984 if (bytesperpixel == 1) {
2985 inter_pred_8bpp(ctx);
2987 inter_pred_16bpp(ctx);
2991 /* mostly copied intra_recon() */
2993 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2994 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2995 int end_x = FFMIN(2 * (s->cols - col), w4);
2996 int end_y = FFMIN(2 * (s->rows - row), h4);
2997 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2998 int uvstep1d = 1 << b->uvtx, p;
2999 uint8_t *dst = s->dst[0];
/* luma residual add; inter blocks always use DCT_DCT */
3002 for (n = 0, y = 0; y < end_y; y += step1d) {
3004 for (x = 0; x < end_x; x += step1d,
3005 ptr += 4 * step1d * bytesperpixel, n += step) {
3006 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3009 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3010 s->block + 16 * n * bytesperpixel, eob);
3012 dst += 4 * s->y_stride * step1d;
/* chroma residual add for both planes */
3018 step = 1 << (b->uvtx * 2);
3019 for (p = 0; p < 2; p++) {
3020 dst = s->dst[p + 1];
3021 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3023 for (x = 0; x < end_x; x += uvstep1d,
3024 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3025 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3028 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3029 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3031 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp instantiation of the always-inline inter_recon() body. */
3037 static void inter_recon_8bpp(AVCodecContext *ctx)
3039 inter_recon(ctx, 1);
/* High-bit-depth (2 bytes/pixel) instantiation of inter_recon(). */
3042 static void inter_recon_16bpp(AVCodecContext *ctx)
3044 inter_recon(ctx, 2);
/*
 * Accumulate loopfilter edge masks for one block into the per-superblock
 * VP9Filter.mask structure. mask[0] holds column (vertical-edge) bits,
 * mask[1] row (horizontal-edge) bits; the last index selects filter width
 * (0=16px, 1=8px, 2=4px, 3=inner-4px, see the struct declaration).
 *
 * row_and_7/col_and_7 - block position within its 64x64 superblock
 * w,h                 - block size in 8px (or subsampled) units
 * col_end/row_end     - whether the block ends at an odd subsampled edge
 * tx                  - transform size; skip_inter - inter block with no
 *                       coefficients (fewer inner edges need filtering)
 */
3047 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3048 int row_and_7, int col_and_7,
3049 int w, int h, int col_end, int row_end,
3050 enum TxfmMode tx, int skip_inter)
3052 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3053 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3055 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3056 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3057 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3058 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3060 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3061 // edges. This means that for UV, we work on two subsampled blocks at
3062 // a time, and we only use the topleft block's mode information to set
3063 // things like block strength. Thus, for any block size smaller than
3064 // 16x16, ignore the odd portion of the block.
3065 if (tx == TX_4X4 && (ss_v | ss_h)) {
/* non-skipped 4x4: every internal 4px edge is filtered */
3080 if (tx == TX_4X4 && !skip_inter) {
3081 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3082 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3083 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3085 for (y = row_and_7; y < h + row_and_7; y++) {
3086 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3088 mask[0][y][1] |= m_row_8;
3089 mask[0][y][2] |= m_row_4;
3090 // for odd lines, if the odd col is not being filtered,
3091 // skip odd row also:
3098 // if a/c are even row/col and b/d are odd, and d is skipped,
3099 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3100 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3101 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3103 mask[1][y][col_mask_id] |= m_col;
3106 mask[0][y][3] |= m_col;
3108 if (ss_h && (col_end & 1))
3109 mask[1][y][3] |= (t << (w - 1)) - t;
3111 mask[1][y][3] |= m_col;
/* larger transforms (or skipped 4x4): only block-boundary edges */
3115 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3118 int mask_id = (tx == TX_8X8);
3119 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3120 int l2 = tx + ss_h - 1, step1d;
3121 int m_row = m_col & masks[l2];
3123 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3124 // 8wd loopfilter to prevent going off the visible edge.
3125 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3126 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3127 int m_row_8 = m_row - m_row_16;
3129 for (y = row_and_7; y < h + row_and_7; y++) {
3130 mask[0][y][0] |= m_row_16;
3131 mask[0][y][1] |= m_row_8;
3134 for (y = row_and_7; y < h + row_and_7; y++)
3135 mask[0][y][mask_id] |= m_row;
/* horizontal edges, with the same odd-UV-edge special case */
3140 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3141 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3142 mask[1][y][0] |= m_col;
3143 if (y - row_and_7 == h - 1)
3144 mask[1][y][1] |= m_col;
3146 for (y = row_and_7; y < h + row_and_7; y += step1d)
3147 mask[1][y][mask_id] |= m_col;
3149 } else if (tx != TX_4X4) {
3152 mask_id = (tx == TX_8X8) || (h == ss_v);
3153 mask[1][row_and_7][mask_id] |= m_col;
3154 mask_id = (tx == TX_8X8) || (w == ss_h);
3155 for (y = row_and_7; y < h + row_and_7; y++)
3156 mask[0][y][mask_id] |= t;
/* skipped-inter 4x4: only the block's own outer edges */
3158 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3160 for (y = row_and_7; y < h + row_and_7; y++) {
3161 mask[0][y][2] |= t4;
3162 mask[0][y][1] |= t8;
3164 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/*
 * Decode and reconstruct one block at (row, col): residual coefficient
 * decoding, intra/inter reconstruction (with edge emulation into temporary
 * buffers for blocks that overhang the frame), context propagation, and
 * loop-filter level/mask bookkeeping for the covering superblock.
 * NOTE(review): several lines of this function are elided in this copy;
 * comments below describe only the visible statements.
 */
3169 static void decode_b(AVCodecContext *ctx, int row, int col,
3170 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3171 enum BlockLevel bl, enum BlockPartition bp)
3173 VP9Context *s = ctx->priv_data;
3175 enum BlockSize bs = bl * 3 + bp;
3176 int bytesperpixel = s->bytesperpixel;
// block width/height in units of 4x4 sub-blocks
3177 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3179 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
// allowed motion-vector window around this block: 64 units per 8px
// block row/col plus a 128-unit margin beyond the frame edge
// (presumably 1/8-pel units — confirm against the MV decoding code)
3185 s->min_mv.x = -(128 + col * 64);
3186 s->min_mv.y = -(128 + row * 64);
3187 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3188 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// chroma transform size: one step smaller than luma when subsampling
// would make the luma tx span twice the block dimension
3194 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3195 (s->ss_v && h4 * 2 == (1 << b->tx)));
// residual coefficients; separate parsers for 8- and 16-bit content
3200 if (bytesperpixel == 1) {
3201 has_coeffs = decode_coeffs_8bpp(ctx);
3203 has_coeffs = decode_coeffs_16bpp(ctx);
// small inter block that coded no coefficients: flag it as skipped in
// the above/left contexts so neighbours decode with the right state
3205 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3207 memset(&s->above_skip_ctx[col], 1, w4);
3208 memset(&s->left_skip_ctx[s->row7], 1, h4);
// SPLAT_ZERO_CTX: zero an n-byte context span with one wide store;
// SPLAT_ZERO_YUV: clear the luma and both chroma non-zero-coefficient
// contexts for this block (chroma span halved when subsampled)
3213 #define SPLAT_ZERO_CTX(v, n) \
3215 case 1: v = 0; break; \
3216 case 2: AV_ZERO16(&v); break; \
3217 case 4: AV_ZERO32(&v); break; \
3218 case 8: AV_ZERO64(&v); break; \
3219 case 16: AV_ZERO128(&v); break; \
3221 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3223 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3224 if (s->ss_##dir2) { \
3225 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3226 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3228 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3229 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3234 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3235 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3236 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3237 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3240 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3241 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3242 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3243 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// advance the per-block coefficient/EOB scratch pointers
// (64 coefs per 4x4 unit; chroma scaled down by the subsampling shifts)
3249 s->block += w4 * h4 * 64 * bytesperpixel;
3250 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3251 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3252 s->eob += 4 * w4 * h4;
3253 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3254 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3260 // emulated overhangs if the stride of the target buffer can't hold. This
3261 // makes it possible to support emu-edge and so on even if we have large block
// overhang check: block wider than the plane stride, or extending below
// the last row -> render into the tmp buffers instead of the frame
3263 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3264 (row + h4) > s->rows;
3265 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3266 (row + h4) > s->rows;
3268 s->dst[0] = s->tmp_y;
3271 s->dst[0] = f->data[0] + yoff;
3272 s->y_stride = f->linesize[0];
3275 s->dst[1] = s->tmp_uv[0];
3276 s->dst[2] = s->tmp_uv[1];
3279 s->dst[1] = f->data[1] + uvoff;
3280 s->dst[2] = f->data[2] + uvoff;
3281 s->uv_stride = f->linesize[1];
// actual reconstruction, split by prediction type and bit depth
3285 intra_recon_16bpp(ctx, yoff, uvoff);
3287 intra_recon_8bpp(ctx, yoff, uvoff);
3291 inter_recon_16bpp(ctx);
3293 inter_recon_8bpp(ctx);
// copy emulated luma back into the frame; mc[n][0][0][0][0] with zero
// subpel arguments acts as a plain block copy (tmp_y stride is 128)
3297 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3299 for (n = 0; o < w; n++) {
3304 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3305 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// same copy-back for the two chroma planes
3311 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3312 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3314 for (n = s->ss_h; o < w; n++) {
3319 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3320 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3321 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3322 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3328 // pick filter level and find edges to apply filter to
3329 if (s->s.h.filter.level &&
3330 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3331 [b->mode[3] != ZEROMV]) > 0) {
3332 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3333 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
// record the filter level for each 8x8 unit of this block, then compute
// which edges need filtering: mask[0] = luma, mask[1] = chroma (see
// the VP9Filter mask layout)
3335 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3336 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3337 if (s->ss_h || s->ss_v)
3338 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3339 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3340 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3341 b->uvtx, skip_inter);
// lazily fill the limit LUTs for this level; 0 means "not yet computed"
// (the FFMAX below guarantees a stored limit of at least 1)
3343 if (!s->filter_lut.lim_lut[lvl]) {
3344 int sharp = s->s.h.filter.sharpness;
3348 limit >>= (sharp + 3) >> 2;
3349 limit = FFMIN(limit, 9 - sharp);
3351 limit = FFMAX(limit, 1);
3353 s->filter_lut.lim_lut[lvl] = limit;
3354 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// advance the scratch pointers past this block (mirrors the skip path above)
3360 s->block += w4 * h4 * 64 * bytesperpixel;
3361 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3362 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3363 s->eob += 4 * w4 * h4;
3364 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3365 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
/*
 * Recursively parse the superblock partition tree (64x64 down to 8x8),
 * reading partition decisions from the range coder and dispatching each
 * leaf to decode_b(). Blocks on the right/bottom frame edge have fewer
 * legal partitions, hence the reduced-choice branches below.
 */
3369 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3370 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3372 VP9Context *s = ctx->priv_data;
// partition probability context from the above/left partition bits
3373 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3374 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// keyframes/intra-only frames use the fixed default partition probs
3375 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3376 s->prob.p.partition[bl][c];
3377 enum BlockPartition bp;
3378 ptrdiff_t hbs = 4 >> bl; // half block size, in 8px units
3379 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3380 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3381 int bytesperpixel = s->bytesperpixel;
3384 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3385 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// block fits horizontally ...
3386 } else if (col + hbs < s->cols) { // FIXME why not <=?
3387 if (row + hbs < s->rows) { // FIXME why not <=?
// ... and vertically: all four partition types are possible
3388 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3390 case PARTITION_NONE:
3391 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// horizontal split: top half, then bottom half
3394 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3395 yoff += hbs * 8 * y_stride;
3396 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3397 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// vertical split: left half, then right half
3400 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3401 yoff += hbs * 8 * bytesperpixel;
3402 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3403 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3405 case PARTITION_SPLIT:
// recurse into the four quadrants at the next (smaller) level
3406 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3407 decode_sb(ctx, row, col + hbs, lflvl,
3408 yoff + 8 * hbs * bytesperpixel,
3409 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3410 yoff += hbs * 8 * y_stride;
3411 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3412 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3413 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3414 yoff + 8 * hbs * bytesperpixel,
3415 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// bottom edge: only split vs. horizontal remains, coded as one bit (p[1])
3420 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3421 bp = PARTITION_SPLIT;
3422 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3423 decode_sb(ctx, row, col + hbs, lflvl,
3424 yoff + 8 * hbs * bytesperpixel,
3425 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3428 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// right edge: only split vs. vertical remains, coded as one bit (p[2])
3430 } else if (row + hbs < s->rows) { // FIXME why not <=?
3431 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3432 bp = PARTITION_SPLIT;
3433 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3434 yoff += hbs * 8 * y_stride;
3435 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3436 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3439 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// bottom-right corner: the only legal choice is to keep splitting
3442 bp = PARTITION_SPLIT;
3443 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// count the decision for backward probability adaptation
3445 s->counts.partition[bl][c][bp]++;
/*
 * Second-pass companion to decode_sb(): the partition decisions were
 * stored (in s->b's bl/bp) during pass 1, so this re-walks the block tree
 * and calls decode_b() without reading any partition bits from the
 * bitstream.
 */
3448 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3449 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3451 VP9Context *s = ctx->priv_data;
3453 ptrdiff_t hbs = 4 >> bl; // half block size, in 8px units
3454 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3455 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3456 int bytesperpixel = s->bytesperpixel;
3459 av_assert2(b->bl == BL_8X8);
3460 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
// stored block is a leaf at this level: replay it (plus its second half
// for H/V partitions, guarded against the frame edge)
3461 } else if (s->b->bl == bl) {
3462 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3463 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3464 yoff += hbs * 8 * y_stride;
3465 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3466 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3467 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3468 yoff += hbs * 8 * bytesperpixel;
3469 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3470 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// otherwise recurse into the quadrants that actually lie inside the frame
3473 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3474 if (col + hbs < s->cols) { // FIXME why not <=?
3475 if (row + hbs < s->rows) {
3476 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3477 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3478 yoff += hbs * 8 * y_stride;
3479 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3480 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3481 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3482 yoff + 8 * hbs * bytesperpixel,
3483 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3485 yoff += hbs * 8 * bytesperpixel;
3486 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3487 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3489 } else if (row + hbs < s->rows) {
3490 yoff += hbs * 8 * y_stride;
3491 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3492 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/*
 * Run the loop filter over the vertical (between-columns) edges of one
 * 64x64 superblock of a single plane. mask[y][i] carries one bit per 8px
 * column for each filter width (i: 0=16, 1=8, 2=4, 3=inner-4, per the
 * VP9Filter mask layout); lvl holds the per-8x8 filter levels; E/I/H
 * thresholds come from the mblim/lim LUTs filled in decode_b().
 */
3497 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3498 uint8_t *lvl, uint8_t (*mask)[4],
3499 uint8_t *dst, ptrdiff_t ls)
3501 int y, x, bytesperpixel = s->bytesperpixel;
3503 // filter edges between columns (e.g. block1 | block2)
// process two mask rows (one 16px band) per iteration
3504 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3505 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3506 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3507 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3508 unsigned hm = hm1 | hm2 | hm13 | hm23;
// walk the column bits left to right; stop once no mask bit remains
3510 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3513 int L = *l, H = L >> 4;
3514 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
// 16-wide filter covers both 8px rows of the band: use it only when
// both rows flag the edge, otherwise fall back to the flat 8-wide one
3516 if (hmask1[0] & x) {
3517 if (hmask2[0] & x) {
3518 av_assert2(l[8 << ss_v] == L);
3519 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3521 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
// pair this edge with the one 8 rows below into a single mix2 call,
// packing the second level's thresholds into the high bytes of E/I/H
3523 } else if (hm2 & x) {
3526 E |= s->filter_lut.mblim_lut[L] << 8;
3527 I |= s->filter_lut.lim_lut[L] << 8;
3528 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3530 [0](ptr, ls, E, I, H);
3532 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3533 [0](ptr, ls, E, I, H);
// only the lower 8px row has an edge here
3535 } else if (hm2 & x) {
3536 int L = l[8 << ss_v], H = L >> 4;
3537 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3539 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3540 [0](ptr + 8 * ls, ls, E, I, H);
// inner-4 edges (mask index 3): filters in the middle of an 8px block,
// offset 4 pixels to the right of the block edge
3548 int L = *l, H = L >> 4;
3549 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3554 E |= s->filter_lut.mblim_lut[L] << 8;
3555 I |= s->filter_lut.lim_lut[L] << 8;
3556 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3558 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3560 } else if (hm23 & x) {
3561 int L = l[8 << ss_v], H = L >> 4;
3562 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3564 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
/*
 * Run the loop filter over the horizontal (between-rows) edges of one
 * 64x64 superblock of a single plane. Same mask layout as
 * filter_plane_cols() (vmask[i]: 0=16, 1=8, 2=4, 3=inner-4); here each
 * inner loop iteration handles a pair of adjacent 8px columns, packing
 * both levels' thresholds into E/I/H for the mix2 DSP functions.
 */
3572 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3573 uint8_t *lvl, uint8_t (*mask)[4],
3574 uint8_t *dst, ptrdiff_t ls)
3576 int y, x, bytesperpixel = s->bytesperpixel;
3579 // filter edges between rows (e.g. ------)
3581 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3582 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3583 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// step two column-bits (2 << ss_h) at a time: a 16px-wide span per iteration
3585 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3588 int L = *l, H = L >> 4;
3589 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
// 16-wide filter spans both 8px columns: needs the neighbouring bit set too
3592 if (vmask[0] & (x << (1 + ss_h))) {
3593 av_assert2(l[1 + ss_h] == L);
3594 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3596 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
// both columns flagged: one mix2 call with the second level packed into
// the high bytes of E/I/H
3598 } else if (vm & (x << (1 + ss_h))) {
3601 E |= s->filter_lut.mblim_lut[L] << 8;
3602 I |= s->filter_lut.lim_lut[L] << 8;
3603 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3604 [!!(vmask[1] & (x << (1 + ss_h)))]
3605 [1](ptr, ls, E, I, H);
3607 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3608 [1](ptr, ls, E, I, H);
// only the right 8px column has an edge here
3610 } else if (vm & (x << (1 + ss_h))) {
3611 int L = l[1 + ss_h], H = L >> 4;
3612 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3614 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3615 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// inner-4 edges (vmask[3]): filter 4 rows below the 8px block edge
3620 int L = *l, H = L >> 4;
3621 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3623 if (vm3 & (x << (1 + ss_h))) {
3626 E |= s->filter_lut.mblim_lut[L] << 8;
3627 I |= s->filter_lut.lim_lut[L] << 8;
3628 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3630 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3632 } else if (vm3 & (x << (1 + ss_h))) {
3633 int L = l[1 + ss_h], H = L >> 4;
3634 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3636 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
/*
 * Loop-filter one 64x64 superblock: the luma plane with the luma masks,
 * then the two chroma planes sharing the uv masks. If either chroma
 * subsampling flag is set, mask[1] (built from the uv transform sizes) is
 * used for chroma; otherwise chroma reuses the luma masks (mask[0]).
 */
3649 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3650 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3652 VP9Context *s = ctx->priv_data;
3653 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3654 uint8_t *dst = f->data[0] + yoff;
3655 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
// mask[0] when 4:4:4 (no subsampling), mask[1] otherwise
3656 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3659 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3660 // if you think of them as acting on a 8x8 block max, we can interleave
3661 // each v/h within the single x loop, but that only works if we work on
3662 // 8 pixel blocks, and we won't always do that (we want at least 16px
3663 // to use SSE2 optimizations, perhaps 32 for AVX2)
// luma: column (vertical) edges first, then row (horizontal) edges
3665 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3666 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
// both chroma planes share the same levels, masks and stride
3668 for (p = 0; p < 2; p++) {
3669 dst = f->data[1 + p] + uvoff;
3670 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3671 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/**
 * Compute the pixel range [*start, *end) covered by tile index idx.
 *
 * The frame's n superblock rows/cols are divided into 1 << log2_n tiles;
 * the superblock boundaries are clamped to n and converted to 8px units
 * (<< 3, one superblock-context step per 8 pixels).
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_start = ( idx      * n) >> log2_n;
    int sb_end   = ((idx + 1) * n) >> log2_n;

    if (sb_start > n)
        sb_start = n;
    if (sb_end > n)
        sb_end = n;
    *start = sb_start << 3;
    *end   = sb_end   << 3;
}
3683 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3684 int max_count, int update_factor)
3686 unsigned ct = ct0 + ct1, p2, p1;
3692 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3693 p2 = av_clip(p2, 1, 255);
3694 ct = FFMIN(ct, max_count);
3695 update_factor = FASTDIV(update_factor * ct, max_count);
3697 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3698 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/*
 * End-of-frame backward probability adaptation: merge the symbol counts
 * gathered during decoding (s->counts) into the stored probability
 * context for this frame's context id, group by group.
 */
3701 static void adapt_probs(VP9Context *s)
3704 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
// adapt more cautiously (112/256 instead of 128/256) right after a
// keyframe/intra-only frame or when the previous frame was not a keyframe
3705 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
// coefficient probabilities: [tx size][plane type][is_inter][band][ctx]
3708 for (i = 0; i < 4; i++)
3709 for (j = 0; j < 2; j++)
3710 for (k = 0; k < 2; k++)
3711 for (l = 0; l < 6; l++)
3712 for (m = 0; m < 6; m++) {
3713 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3714 unsigned *e = s->counts.eob[i][j][k][l][m];
3715 unsigned *c = s->counts.coef[i][j][k][l][m];
3717 if (l == 0 && m >= 3) // dc only has 3 pt
3720 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3721 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3722 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// keyframes/intra-only frames: the remaining groups are not adapted,
// the current frame probabilities are stored as-is
3725 if (s->s.h.keyframe || s->s.h.intraonly) {
3726 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3727 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3728 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3729 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// skip flag
3734 for (i = 0; i < 3; i++)
3735 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// intra/inter flag
3738 for (i = 0; i < 4; i++)
3739 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// comppred flag
3742 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3743 for (i = 0; i < 5; i++)
3744 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// reference frames
3748 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3749 for (i = 0; i < 5; i++)
3750 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3751 s->counts.comp_ref[i][1], 20, 128);
3754 if (s->s.h.comppredmode != PRED_COMPREF) {
3755 for (i = 0; i < 5; i++) {
3756 uint8_t *pp = p->single_ref[i];
3757 unsigned (*c)[2] = s->counts.single_ref[i];
3759 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3760 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3764 // block partitioning
3765 for (i = 0; i < 4; i++)
3766 for (j = 0; j < 4; j++) {
3767 uint8_t *pp = p->partition[i][j];
3768 unsigned *c = s->counts.partition[i][j];
3770 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3771 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3772 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// tx size
3776 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3777 for (i = 0; i < 2; i++) {
3778 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3780 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3781 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3782 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3783 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3784 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3785 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3789 // interpolation filter
3790 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3791 for (i = 0; i < 4; i++) {
3792 uint8_t *pp = p->filter[i];
3793 unsigned *c = s->counts.filter[i];
3795 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3796 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// inter prediction modes
3801 for (i = 0; i < 7; i++) {
3802 uint8_t *pp = p->mv_mode[i];
3803 unsigned *c = s->counts.mv_mode[i];
3805 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3806 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3807 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// mv joints
3812 uint8_t *pp = p->mv_joint;
3813 unsigned *c = s->counts.mv_joint;
3815 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3816 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3817 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// per-component mv fields: sign, classes (tree), class0, bits,
// class0_fp/fp fractional parts and, if enabled, the hp bits
3821 for (i = 0; i < 2; i++) {
3823 unsigned *c, (*c2)[2], sum;
3825 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3826 s->counts.mv_comp[i].sign[1], 20, 128);
3828 pp = p->mv_comp[i].classes;
3829 c = s->counts.mv_comp[i].classes;
3830 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3831 adapt_prob(&pp[0], c[0], sum, 20, 128);
3833 adapt_prob(&pp[1], c[1], sum, 20, 128);
3835 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3836 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3838 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3839 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3841 adapt_prob(&pp[6], c[6], sum, 20, 128);
3842 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3843 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3844 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3846 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3847 s->counts.mv_comp[i].class0[1], 20, 128);
3848 pp = p->mv_comp[i].bits;
3849 c2 = s->counts.mv_comp[i].bits;
3850 for (j = 0; j < 10; j++)
3851 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3853 for (j = 0; j < 2; j++) {
3854 pp = p->mv_comp[i].class0_fp[j];
3855 c = s->counts.mv_comp[i].class0_fp[j];
3856 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3857 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3858 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3860 pp = p->mv_comp[i].fp;
3861 c = s->counts.mv_comp[i].fp;
3862 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3863 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3864 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3866 if (s->s.h.highprecisionmvs) {
3867 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3868 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3869 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3870 s->counts.mv_comp[i].hp[1], 20, 128);
// y intra modes: walk the mode tree, removing each decided mode's count
// from the remaining total as we descend
3875 for (i = 0; i < 4; i++) {
3876 uint8_t *pp = p->y_mode[i];
3877 unsigned *c = s->counts.y_mode[i], sum, s2;
3879 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3880 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3881 sum -= c[TM_VP8_PRED];
3882 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3883 sum -= c[VERT_PRED];
3884 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3885 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3887 adapt_prob(&pp[3], s2, sum, 20, 128);
3889 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3890 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3891 sum -= c[DIAG_DOWN_LEFT_PRED];
3892 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3893 sum -= c[VERT_LEFT_PRED];
3894 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3895 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// uv intra modes, conditioned on the luma mode (same tree walk)
3899 for (i = 0; i < 10; i++) {
3900 uint8_t *pp = p->uv_mode[i];
3901 unsigned *c = s->counts.uv_mode[i], sum, s2;
3903 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3904 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3905 sum -= c[TM_VP8_PRED];
3906 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3907 sum -= c[VERT_PRED];
3908 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3909 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3911 adapt_prob(&pp[3], s2, sum, 20, 128);
3913 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3914 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3915 sum -= c[DIAG_DOWN_LEFT_PRED];
3916 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3917 sum -= c[VERT_LEFT_PRED];
3918 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3919 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/*
 * Release the per-frame scratch allocations: the intra-prediction backup
 * row and the block/coefficient base arrays.
 */
3923 static void free_buffers(VP9Context *s)
3925 av_freep(&s->intra_pred_data[0]);
3926 av_freep(&s->b_base);
3927 av_freep(&s->block_base);
/*
 * Decoder close callback: drop every held frame reference (the three
 * internal frames, the 8 reference slots and the staged next_refs) and
 * free the AVFrame shells themselves.
 */
3930 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3932 VP9Context *s = ctx->priv_data;
// internal frames: CUR_FRAME plus the segmentation-map/mv-pair references
3935 for (i = 0; i < 3; i++) {
3936 if (s->s.frames[i].tf.f->buf[0])
3937 vp9_unref_frame(ctx, &s->s.frames[i]);
3938 av_frame_free(&s->s.frames[i].tf.f);
// the 8 bitstream reference slots and their staged replacements
3940 for (i = 0; i < 8; i++) {
3941 if (s->s.refs[i].f->buf[0])
3942 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3943 av_frame_free(&s->s.refs[i].f);
3944 if (s->next_refs[i].f->buf[0])
3945 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3946 av_frame_free(&s->next_refs[i].f);
/*
 * Decode one packet: parse the frame header (which may just request
 * re-display of an existing reference), rotate the internal frame
 * buffers, run the tile decode loop (one or two passes) with per-sbrow
 * loop filtering and progress reporting, then commit the updated
 * reference slots and output the frame if it is visible.
 * NOTE(review): lines are elided in this copy; comments cover only the
 * visible statements.
 */
3956 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3957 int *got_frame, AVPacket *pkt)
3959 const uint8_t *data = pkt->data;
3960 int size = pkt->size;
3961 VP9Context *s = ctx->priv_data;
3962 int res, tile_row, tile_col, i, ref, row, col;
// keep the previous segmentation map when this frame does not update it
3963 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3964 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3965 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3969 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0: no coded frame data, just re-show reference slot `ref`
3971 } else if (res == 0) {
3972 if (!s->s.refs[ref].f->buf[0]) {
3973 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3974 return AVERROR_INVALIDDATA;
3976 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3978 ((AVFrame *)frame)->pkt_pts = pkt->pts;
3979 ((AVFrame *)frame)->pkt_dts = pkt->dts;
// refs are carried over unchanged into next_refs for this no-op frame
3980 for (i = 0; i < 8; i++) {
3981 if (s->next_refs[i].f->buf[0])
3982 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3983 if (s->s.refs[i].f->buf[0] &&
3984 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
// rotate internal frames: the previous CUR_FRAME becomes the
// segmentation-map and mv-pair reference, then allocate a new CUR_FRAME
3993 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
3994 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
3995 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
3996 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
3997 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4000 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4001 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR])
4002 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4003 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4005 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4006 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4007 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4009 f = s->s.frames[CUR_FRAME].tf.f;
4010 f->key_frame = s->s.h.keyframe;
4011 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4012 ls_y = f->linesize[0];
4013 ls_uv =f->linesize[1];
// drop a stale segmentation-map reference if the frame size changed
4015 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4016 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4017 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4018 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
// stage the post-frame reference slots: refreshrefmask selects which
// slots will point at the new frame, the rest keep their old contents
4022 for (i = 0; i < 8; i++) {
4023 if (s->next_refs[i].f->buf[0])
4024 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4025 if (s->s.h.refreshrefmask & (1 << i)) {
4026 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4027 } else if (s->s.refs[i].f->buf[0]) {
4028 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
// hardware acceleration: hand the whole packet to the hwaccel hooks
4035 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4038 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4041 res = ctx->hwaccel->end_frame(ctx);
4047 // main tile decode loop
4048 bytesperpixel = s->bytesperpixel;
// reset the above-row contexts for the new frame
4049 memset(s->above_partition_ctx, 0, s->cols);
4050 memset(s->above_skip_ctx, 0, s->cols);
4051 if (s->s.h.keyframe || s->s.h.intraonly) {
4052 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4054 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4056 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4057 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4058 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4059 memset(s->above_segpred_ctx, 0, s->cols);
// two-pass decode when frame threading needs the adapted context
// (refreshctx set) and the stream is not in parallel mode
4060 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4061 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4062 if ((res = update_block_buffers(ctx)) < 0) {
4063 av_log(ctx, AV_LOG_ERROR,
4064 "Failed to allocate block buffers\n");
// parallel mode: the context update is known up-front, so store it and
// release any waiting frame threads immediately
4067 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4070 for (i = 0; i < 4; i++) {
4071 for (j = 0; j < 2; j++)
4072 for (k = 0; k < 2; k++)
4073 for (l = 0; l < 6; l++)
4074 for (m = 0; m < 6; m++)
4075 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4076 s->prob.coef[i][j][k][l][m], 3);
4077 if (s->s.h.txfmmode == i)
4080 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4081 ff_thread_finish_setup(ctx);
4082 } else if (!s->s.h.refreshctx) {
4083 ff_thread_finish_setup(ctx);
// per-pass: rewind the coefficient/EOB scratch buffers
4089 s->block = s->block_base;
4090 s->uvblock[0] = s->uvblock_base[0];
4091 s->uvblock[1] = s->uvblock_base[1];
4092 s->eob = s->eob_base;
4093 s->uveob[0] = s->uveob_base[0];
4094 s->uveob[1] = s->uveob_base[1];
// carve the packet into tiles; each tile column gets its own range
// decoder, sized by a 32-bit length field (last tile takes the rest)
4096 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4097 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4098 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4100 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4103 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4104 tile_row == s->s.h.tiling.tile_rows - 1) {
4107 tile_size = AV_RB32(data);
4111 if (tile_size > size) {
4112 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4113 return AVERROR_INVALIDDATA;
4115 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4116 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4117 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4118 return AVERROR_INVALIDDATA;
// decode one superblock row (8 8px rows) at a time across all tile cols
4125 for (row = s->tile_row_start; row < s->tile_row_end;
4126 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4127 struct VP9Filter *lflvl_ptr = s->lflvl;
4128 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4130 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4131 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4132 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
// reset the left-edge contexts at each tile column boundary
4135 memset(s->left_partition_ctx, 0, 8);
4136 memset(s->left_skip_ctx, 0, 8);
4137 if (s->s.h.keyframe || s->s.h.intraonly) {
4138 memset(s->left_mode_ctx, DC_PRED, 16);
4140 memset(s->left_mode_ctx, NEARESTMV, 8);
4142 memset(s->left_y_nnz_ctx, 0, 16);
4143 memset(s->left_uv_nnz_ctx, 0, 32);
4144 memset(s->left_segpred_ctx, 0, 8);
// swap in this tile column's range decoder state
4146 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4149 for (col = s->tile_col_start;
4150 col < s->tile_col_end;
4151 col += 8, yoff2 += 64 * bytesperpixel,
4152 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4153 // FIXME integrate with lf code (i.e. zero after each
4154 // use, similar to invtxfm coefficients, or similar)
4156 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// pass 2 replays stored decisions; pass 0/1 parse the bitstream
4160 decode_sb_mem(ctx, row, col, lflvl_ptr,
4161 yoff2, uvoff2, BL_64X64);
4163 decode_sb(ctx, row, col, lflvl_ptr,
4164 yoff2, uvoff2, BL_64X64);
// save the decoder state back for this tile column's next sb row
4168 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4176 // backup pre-loopfilter reconstruction data for intra
4177 // prediction of next row of sb64s
4178 if (row + 8 < s->rows) {
4179 memcpy(s->intra_pred_data[0],
4180 f->data[0] + yoff + 63 * ls_y,
4181 8 * s->cols * bytesperpixel);
4182 memcpy(s->intra_pred_data[1],
4183 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4184 8 * s->cols * bytesperpixel >> s->ss_h);
4185 memcpy(s->intra_pred_data[2],
4186 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4187 8 * s->cols * bytesperpixel >> s->ss_h);
4190 // loopfilter one row
4191 if (s->s.h.filter.level) {
4194 lflvl_ptr = s->lflvl;
4195 for (col = 0; col < s->cols;
4196 col += 8, yoff2 += 64 * bytesperpixel,
4197 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4198 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4202 // FIXME maybe we can make this more finegrained by running the
4203 // loopfilter per-block instead of after each sbrow
4204 // In fact that would also make intra pred left preparation easier?
4205 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
// end of a pass: backward context adaptation (body elided here) before
// releasing frame threads waiting on the updated probabilities
4209 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4211 ff_thread_finish_setup(ctx);
4213 } while (s->pass++ == 1);
4214 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
// commit the staged next_refs into the visible reference slots
4218 for (i = 0; i < 8; i++) {
4219 if (s->s.refs[i].f->buf[0])
4220 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4221 if (s->next_refs[i].f->buf[0] &&
4222 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
// only output the frame if the bitstream marks it visible
4226 if (!s->s.h.invisible) {
4227 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
/*
 * Flush callback: drop all internal frames and the 8 reference slots so
 * decoding can restart cleanly (e.g. after a seek).
 */
4235 static void vp9_decode_flush(AVCodecContext *ctx)
4237 VP9Context *s = ctx->priv_data;
4240 for (i = 0; i < 3; i++)
4241 vp9_unref_frame(ctx, &s->s.frames[i]);
4242 for (i = 0; i < 8; i++)
4243 ff_thread_release_buffer(ctx, &s->s.refs[i]);
/*
 * Allocate the AVFrame shells for the 3 internal frames and the 8
 * reference slots (current refs + staged next_refs). On any allocation
 * failure, everything allocated so far is torn down via vp9_decode_free()
 * and AVERROR(ENOMEM) is returned.
 */
4246 static int init_frames(AVCodecContext *ctx)
4248 VP9Context *s = ctx->priv_data;
4251 for (i = 0; i < 3; i++) {
4252 s->s.frames[i].tf.f = av_frame_alloc();
4253 if (!s->s.frames[i].tf.f) {
4254 vp9_decode_free(ctx);
4255 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4256 return AVERROR(ENOMEM);
4259 for (i = 0; i < 8; i++) {
4260 s->s.refs[i].f = av_frame_alloc();
4261 s->next_refs[i].f = av_frame_alloc();
4262 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4263 vp9_decode_free(ctx);
4264 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4265 return AVERROR(ENOMEM);
4272 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4274 VP9Context *s = ctx->priv_data;
4276 ctx->internal->allocate_progress = 1;
4278 s->s.h.filter.sharpness = -1;
4280 return init_frames(ctx);
4284 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4286 return init_frames(avctx);
/**
 * Frame-threading sync callback: copy the inter-frame decoding state from
 * the previously-decoding thread's context (src/ssrc) into this thread's
 * context (dst/s) before this thread starts on the next frame.
 *
 * Copied state: the 3 internal frame refs, the 8 reference-frame slots
 * (taken from ssrc->next_refs, i.e. the refs as they stand AFTER the
 * source thread's frame), header flags, subsampling/bit-depth layout,
 * segmentation parameters, probability context, and loopfilter deltas.
 */
4289 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4292     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4294     // detect size changes in other threads
     /* If our cached geometry (cols/rows/bpp/pix_fmt) no longer matches the
      * source thread's, our per-frame scratch buffers are stale.
      * NOTE(review): the body of this branch falls on lines not visible in
      * this chunk — presumably it frees/invalidates those buffers; confirm
      * against the full file. */
4295     if (s->intra_pred_data[0] &&
4296         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
4297          s->rows != ssrc->rows || s->bpp != ssrc->bpp || s->pix_fmt != ssrc->pix_fmt)) {
     /* Re-reference the 3 internal frame slots from the source context,
      * dropping whatever we held before. */
4301     for (i = 0; i < 3; i++) {
4302         if (s->s.frames[i].tf.f->buf[0])
4303             vp9_unref_frame(dst, &s->s.frames[i]);
4304         if (ssrc->s.frames[i].tf.f->buf[0]) {
4305             if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
     /* Adopt the source thread's post-frame reference set: our refs[]
      * become its next_refs[]. */
4309     for (i = 0; i < 8; i++) {
4310         if (s->s.refs[i].f->buf[0])
4311             ff_thread_release_buffer(dst, &s->s.refs[i]);
4312         if (ssrc->next_refs[i].f->buf[0]) {
4313             if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
     /* Scalar header/layout state needed to parse the next frame. */
4318     s->s.h.invisible = ssrc->s.h.invisible;
4319     s->s.h.keyframe = ssrc->s.h.keyframe;
4320     s->s.h.intraonly = ssrc->s.h.intraonly;
4321     s->ss_v = ssrc->ss_v;
4322     s->ss_h = ssrc->ss_h;
4323     s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4324     s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4325     s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4326     s->bytesperpixel = ssrc->bytesperpixel;
4328     s->bpp_index = ssrc->bpp_index;
4329     s->pix_fmt = ssrc->pix_fmt;
     /* Bulk-copy the entropy probability context, loopfilter deltas and
      * segmentation feature table. */
4330     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4331     memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4332     memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4333            sizeof(s->s.h.segmentation.feat));
4339 static const AVProfile profiles[] = {
4340 { FF_PROFILE_VP9_0, "Profile 0" },
4341 { FF_PROFILE_VP9_1, "Profile 1" },
4342 { FF_PROFILE_VP9_2, "Profile 2" },
4343 { FF_PROFILE_VP9_3, "Profile 3" },
4344 { FF_PROFILE_UNKNOWN },
4347 AVCodec ff_vp9_decoder = {
4349 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4350 .type = AVMEDIA_TYPE_VIDEO,
4351 .id = AV_CODEC_ID_VP9,
4352 .priv_data_size = sizeof(VP9Context),
4353 .init = vp9_decode_init,
4354 .close = vp9_decode_free,
4355 .decode = vp9_decode_frame,
4356 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4357 .flush = vp9_decode_flush,
4358 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4359 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4360 .profiles = NULL_IF_CONFIG_SMALL(profiles),