2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
37 #define VP9_SYNCCODE 0x498342
41 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
/* Per-block (partition leaf) decoding state.
 * NOTE(review): the closing "} VP9Block;" line is not visible in this
 * extraction chunk. */
45 typedef struct VP9Block {
// mode[4]: presumably one prediction mode per sub-block (parallels mv[4]) — confirm
46 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
47 enum FilterMode filter;
// up to 4 sub-block motion vectors, each with up to 2 references
48 VP56mv mv[4 /* b_idx */][2 /* ref */];
50 enum TxfmMode tx, uvtx;
52 enum BlockPartition bp;
/* Main decoder context (per decoding thread).
 * NOTE(review): this extraction dropped interior lines (nested struct
 * openings/closings are missing), so the member grouping below is partial;
 * comments only describe what is visible here. */
55 typedef struct VP9Context {
// current block position within the frame (row7/col7 presumably position & 7 — confirm)
66 int row, row7, col, col7;
68 ptrdiff_t y_stride, uv_stride;
// active bit depth; bytesperpixel is derived as (bpp + 7) >> 3 in read_colorspace_details()
71 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
72 uint8_t last_keyframe;
73 // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
74 // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
75 // and are therefore per-stream. pix_fmt represents the value in the header
76 // of the currently processed frame.
78 enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
// superblock (64x64) and 8x8-block grid dimensions, set in update_size()
79 unsigned sb_cols, sb_rows, rows, cols;
80 ThreadFrame next_refs[8];
84 uint8_t mblim_lut[64];
// current tile bounds
86 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
// two coefficient-probability layouts (3-node model vs. an 11-entry variant);
// the identical names imply they belong to different nested structs cut from this view
89 uint8_t coef[4][2][2][6][6][3];
93 uint8_t coef[4][2][2][6][6][11];
// symbol counters gathered while decoding (used for probability adaptation);
// cleared in decode_frame_header() depending on frame type
96 unsigned y_mode[4][10];
97 unsigned uv_mode[10][10];
98 unsigned filter[4][3];
99 unsigned mv_mode[7][4];
100 unsigned intra[4][2];
102 unsigned single_ref[5][2][2];
103 unsigned comp_ref[5][2];
104 unsigned tx32p[2][4];
105 unsigned tx16p[2][3];
108 unsigned mv_joint[4];
111 unsigned classes[11];
113 unsigned bits[10][2];
114 unsigned class0_fp[2][4];
116 unsigned class0_hp[2];
119 unsigned partition[4][4][4];
120 unsigned coef[4][2][2][6][6][3];
121 unsigned eob[4][2][2][6][6][2];
124 // contextual (left/above) cache
125 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
126 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
127 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
128 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
129 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
130 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
131 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
132 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
133 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
134 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
135 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
136 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// "above" context rows; all carved out of one allocation in update_size()
137 uint8_t *above_partition_ctx;
138 uint8_t *above_mode_ctx;
139 // FIXME maybe merge some of the below in a flags field?
140 uint8_t *above_y_nnz_ctx;
141 uint8_t *above_uv_nnz_ctx[2];
142 uint8_t *above_skip_ctx; // 1bit
143 uint8_t *above_txfm_ctx; // 2bit
144 uint8_t *above_segpred_ctx; // 1bit
145 uint8_t *above_intra_ctx; // 1bit
146 uint8_t *above_comp_ctx; // 1bit
147 uint8_t *above_ref_ctx; // 2bit
148 uint8_t *above_filter_ctx;
149 VP56mv (*above_mv_ctx)[2];
152 uint8_t *intra_pred_data[3];
153 struct VP9Filter *lflvl;
154 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
156 // block reconstruction intermediates
157 int block_alloc_using_2pass;
158 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
159 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
160 struct { int x, y; } min_mv, max_mv;
161 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
162 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
// per-reference MV scaling factors (14-bit fixed point), set in
// decode_frame_header(); 0 means the reference has the same size as the frame
163 uint16_t mvscale[3][2];
164 uint8_t mvstep[3][2];
// Block width/height lookup per block size (N_BS_SIZES entries).
// The first table appears to be in 4-pixel units ({16,16} for 64x64) and the
// second in 8x8-block units clamped to a minimum of 1 — TODO confirm against
// the full source (closing "};" lines are missing from this extraction).
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
169 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
172 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
179 ff_thread_release_buffer(ctx, &f->tf);
180 av_buffer_unref(&f->extradata);
181 av_buffer_unref(&f->hwaccel_priv_buf);
182 f->segmentation_map = NULL;
183 f->hwaccel_picture_private = NULL;
/* Allocate a frame buffer plus per-frame extradata (segmentation map and
 * MV/ref pair array, 64 entries per 64x64 superblock) and, when a hwaccel
 * with per-picture state is in use, its private buffer.
 * Returns 0 on success, a negative AVERROR on failure; on failure any
 * partial allocation is released via vp9_unref_frame().
 * NOTE(review): declarations ("int ret, sz;"), braces and goto labels are
 * missing from this extraction. */
186 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
188 VP9Context *s = ctx->priv_data;
191 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
// one segmentation-map byte per 8x8 block: 64 per superblock
193 sz = 64 * s->sb_cols * s->sb_rows;
194 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
// segmentation map first, then the MV/ref pair array, in one buffer
198 f->segmentation_map = f->extradata->data;
199 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
202 const AVHWAccel *hwaccel = ctx->hwaccel;
203 av_assert0(!f->hwaccel_picture_private);
204 if (hwaccel->frame_priv_data_size) {
205 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
206 if (!f->hwaccel_priv_buf)
208 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
// error path: undo whatever was allocated before failing
215 vp9_unref_frame(ctx, f);
216 return AVERROR(ENOMEM);
/* Make dst a new reference to src: refs the frame buffer, the shared
 * extradata buffer and, if present, the hwaccel private buffer.
 * On failure everything acquired so far is dropped (vp9_unref_frame) and
 * AVERROR(ENOMEM) is returned.
 * NOTE(review): braces/labels/"dst->mv = src->mv;" style lines are missing
 * from this extraction. */
219 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
223 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
225 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
// raw pointer can be shared: both frames now hold a ref on extradata
229 dst->segmentation_map = src->segmentation_map;
231 dst->uses_2pass = src->uses_2pass;
233 if (src->hwaccel_picture_private) {
234 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
235 if (!dst->hwaccel_priv_buf)
237 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
// error path: release whatever was referenced before failing
243 vp9_unref_frame(ctx, dst);
244 return AVERROR(ENOMEM);
/* (Re)negotiate the output pixel format for new dimensions w x h and
 * (re)allocate all per-frame-size internal arrays (intra prediction edge
 * cache, "above" context rows, loop-filter level struct).
 * Returns 0 or a negative AVERROR.
 * NOTE(review): extraction dropped lines here (#endif, early returns,
 * braces) — comments describe only what is visible. */
247 static int update_size(AVCodecContext *ctx, int w, int h)
249 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
250 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
251 VP9Context *s = ctx->priv_data;
253 int bytesperpixel = s->bytesperpixel, res, cols, rows;
255 av_assert0(w > 0 && h > 0);
// renegotiate the format only when dimensions or pixel format changed
257 if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
258 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
// hwaccel candidates are only offered for 8-bit 4:2:0
261 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
262 #if CONFIG_VP9_DXVA2_HWACCEL
263 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
265 #if CONFIG_VP9_D3D11VA_HWACCEL
266 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
268 #if CONFIG_VP9_VAAPI_HWACCEL
269 *fmtp++ = AV_PIX_FMT_VAAPI;
// the software format is always the terminal candidate
273 *fmtp++ = s->pix_fmt;
274 *fmtp = AV_PIX_FMT_NONE;
276 res = ff_thread_get_format(ctx, pix_fmts);
281 s->gf_fmt = s->pix_fmt;
// fast path: buffers already sized for this geometry and format
289 if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
292 s->last_fmt = s->pix_fmt;
293 s->sb_cols = (w + 63) >> 6;
294 s->sb_rows = (h + 63) >> 6;
295 s->cols = (w + 7) >> 3;
296 s->rows = (h + 7) >> 3;
298 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
299 av_freep(&s->intra_pred_data[0]);
300 // FIXME we slightly over-allocate here for subsampled chroma, but a little
301 // bit of padding shouldn't affect performance...
// one allocation carved into all the intra/"above"/lflvl arrays below
302 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
303 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
305 return AVERROR(ENOMEM);
306 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
307 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
308 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
309 assign(s->above_y_nnz_ctx, uint8_t *, 16);
310 assign(s->above_mode_ctx, uint8_t *, 16);
311 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
312 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
313 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
314 assign(s->above_partition_ctx, uint8_t *, 8);
315 assign(s->above_skip_ctx, uint8_t *, 8);
316 assign(s->above_txfm_ctx, uint8_t *, 8);
317 assign(s->above_segpred_ctx, uint8_t *, 8);
318 assign(s->above_intra_ctx, uint8_t *, 8);
319 assign(s->above_comp_ctx, uint8_t *, 8);
320 assign(s->above_ref_ctx, uint8_t *, 8);
321 assign(s->above_filter_ctx, uint8_t *, 8);
322 assign(s->lflvl, struct VP9Filter *, 1);
325 // these will be re-allocated a little later
326 av_freep(&s->b_base);
327 av_freep(&s->block_base);
// re-init DSP tables only on an actual bit-depth change
329 if (s->bpp != s->last_bpp) {
330 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
331 ff_videodsp_init(&s->vdsp, s->bpp);
332 s->last_bpp = s->bpp;
/* (Re)allocate the coefficient/EOB scratch buffers. In 2-pass mode the
 * whole frame's worth (one set per superblock) is kept; in single-pass mode
 * a single superblock's worth of scratch suffices.
 * Returns 0 or AVERROR(ENOMEM). */
338 static int update_block_buffers(AVCodecContext *ctx)
340 VP9Context *s = ctx->priv_data;
341 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
// nothing to do if buffers exist and were sized for the same pass mode
343 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
347 av_free(s->block_base);
// chroma sizes shrink with subsampling (ss_h/ss_v are 0 or 1)
348 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
349 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
350 if (s->s.frames[CUR_FRAME].uses_2pass) {
351 int sbs = s->sb_cols * s->sb_rows;
353 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
// one allocation: luma coefs, 2x chroma coefs, then luma + 2x chroma EOBs
354 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
355 16 * 16 + 2 * chroma_eobs) * sbs);
356 if (!s->b_base || !s->block_base)
357 return AVERROR(ENOMEM);
358 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
359 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
360 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
361 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
362 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
// single-pass: scratch for just one superblock at a time
364 s->b_base = av_malloc(sizeof(VP9Block));
365 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
366 16 * 16 + 2 * chroma_eobs);
367 if (!s->b_base || !s->block_base)
368 return AVERROR(ENOMEM);
369 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
370 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
371 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
372 s->uveob_base[0] = s->eob_base + 16 * 16;
373 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
// remember which layout the buffers were built for (checked above)
375 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
380 // for some reason the sign bit is at the end, not the start, of a bit sequence
381 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
383 int v = get_bits(gb, n);
384 return get_bits1(gb) ? -v : v;
387 static av_always_inline int inv_recenter_nonneg(int v, int m)
389 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
392 // differential forward probability updates
/* Decode a differentially-coded probability update relative to the current
 * probability p (range [1, 255]) and return the new probability.
 * NOTE(review): this extraction dropped a few lines (the tail of
 * inv_map_table, the "int d;" declaration and parts of the final VLC
 * branch) — the code below is incomplete as shown. */
393 static int update_prob(VP56RangeCoder *c, int p)
395 static const int inv_map_table[255] = {
396 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
397 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
398 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
399 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
400 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
401 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
402 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
403 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
404 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
405 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
406 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
407 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
408 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
409 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
410 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
411 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
412 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
413 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
418 /* This code is trying to do a differential probability update. For a
419 * current probability A in the range [1, 255], the difference to a new
420 * probability of any value can be expressed differentially as 1-A,255-A
421 * where some part of this (absolute range) exists both in positive as
422 * well as the negative part, whereas another part only exists in one
423 * half. We're trying to code this shared part differentially, i.e.
424 * times two where the value of the lowest bit specifies the sign, and
425 * the single part is then coded on top of this. This absolute difference
426 * then again has a value of [0,254], but a bigger value in this range
427 * indicates that we're further away from the original value A, so we
428 * can code this as a VLC code, since higher values are increasingly
429 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
430 * updates vs. the 'fine, exact' updates further down the range, which
431 * adds one extra dimension to this differential update model. */
// variable-length coded difference d: 3 short ranges, then a long-form branch
433 if (!vp8_rac_get(c)) {
434 d = vp8_rac_get_uint(c, 4) + 0;
435 } else if (!vp8_rac_get(c)) {
436 d = vp8_rac_get_uint(c, 4) + 16;
437 } else if (!vp8_rac_get(c)) {
438 d = vp8_rac_get_uint(c, 5) + 32;
440 d = vp8_rac_get_uint(c, 7);
442 d = (d << 1) - 65 + vp8_rac_get(c);
444 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
// re-center the mapped difference around p, mirroring for p > 128
447 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
448 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
/* Parse bit depth, colorspace, color range and chroma subsampling from the
 * frame header and derive s->pix_fmt / s->bpp / s->bytesperpixel.
 * RGB is accepted only in odd profiles (1/3), and 4:2:0 is rejected there;
 * even profiles (0/2) are fixed 4:2:0.
 * Returns 0 or AVERROR_INVALIDDATA. */
451 static int read_colorspace_details(AVCodecContext *ctx)
453 static const enum AVColorSpace colorspaces[8] = {
454 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
455 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
457 VP9Context *s = ctx->priv_data;
// profiles 0/1 are 8-bit; 2/3 signal 10 or 12 bit with one extra bit
458 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
461 s->bpp = 8 + bits * 2;
462 s->bytesperpixel = (7 + s->bpp) >> 3;
463 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
464 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
465 static const enum AVPixelFormat pix_fmt_rgb[3] = {
466 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
// RGB is always full-range 4:4:4
468 s->ss_h = s->ss_v = 0;
469 ctx->color_range = AVCOL_RANGE_JPEG;
470 s->pix_fmt = pix_fmt_rgb[bits];
471 if (ctx->profile & 1) {
472 if (get_bits1(&s->gb)) {
473 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
474 return AVERROR_INVALIDDATA;
477 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
479 return AVERROR_INVALIDDATA;
// YUV: pixel format selected by bit depth and subsampling
482 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
483 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
484 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
485 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
486 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
487 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
488 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
490 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
// profiles 1/3 carry explicit subsampling bits + a reserved bit
491 if (ctx->profile & 1) {
492 s->ss_h = get_bits1(&s->gb);
493 s->ss_v = get_bits1(&s->gb);
494 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
495 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
496 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
498 return AVERROR_INVALIDDATA;
499 } else if (get_bits1(&s->gb)) {
500 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
502 return AVERROR_INVALIDDATA;
// even profiles: fixed 4:2:0
505 s->ss_h = s->ss_v = 1;
506 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
/* Parse a complete VP9 frame header: the uncompressed (bit-exact) part via
 * the GetBitContext, then the compressed (range-coded) probability-update
 * part via the VP56 range coder.
 * On success the return value (last line) is the total header size in
 * bytes: uncompressed part plus compressed part. A show-existing-frame
 * header sets *ref (the early return is not visible in this extraction).
 * Negative AVERROR on invalid data.
 * NOTE(review): this extraction dropped interior lines (else branches,
 * braces, some statements) — comments only describe what is visible. */
513 static int decode_frame_header(AVCodecContext *ctx,
514 const uint8_t *data, int size, int *ref)
516 VP9Context *s = ctx->priv_data;
517 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
519 const uint8_t *data2;
522 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
523 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
// frame marker (must be 0b10) and profile (2 bits, +1 reserved bit for 3)
526 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
527 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
528 return AVERROR_INVALIDDATA;
530 ctx->profile = get_bits1(&s->gb);
531 ctx->profile |= get_bits1(&s->gb) << 1;
532 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
533 if (ctx->profile > 3) {
534 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
535 return AVERROR_INVALIDDATA;
537 s->s.h.profile = ctx->profile;
// show_existing_frame: output a previously decoded reference directly
538 if (get_bits1(&s->gb)) {
539 *ref = get_bits(&s->gb, 3);
// frame type / visibility flags
542 s->last_keyframe = s->s.h.keyframe;
543 s->s.h.keyframe = !get_bits1(&s->gb);
544 last_invisible = s->s.h.invisible;
545 s->s.h.invisible = !get_bits1(&s->gb);
546 s->s.h.errorres = get_bits1(&s->gb);
547 s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
// keyframes carry a sync code, full colorspace info and explicit size
548 if (s->s.h.keyframe) {
549 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
550 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
551 return AVERROR_INVALIDDATA;
553 if ((res = read_colorspace_details(ctx)) < 0)
555 // for profile 1, here follows the subsampling bits
556 s->s.h.refreshrefmask = 0xff;
557 w = get_bits(&s->gb, 16) + 1;
558 h = get_bits(&s->gb, 16) + 1;
559 if (get_bits1(&s->gb)) // display size
560 skip_bits(&s->gb, 32);
562 s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
563 s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
// intra-only frames: sync code, then colorspace (profile >= 1) or fixed 8-bit 4:2:0
564 if (s->s.h.intraonly) {
565 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
566 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
567 return AVERROR_INVALIDDATA;
569 if (ctx->profile >= 1) {
570 if ((res = read_colorspace_details(ctx)) < 0)
573 s->ss_h = s->ss_v = 1;
576 s->bytesperpixel = 1;
577 s->pix_fmt = AV_PIX_FMT_YUV420P;
578 ctx->colorspace = AVCOL_SPC_BT470BG;
579 ctx->color_range = AVCOL_RANGE_JPEG;
581 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
582 w = get_bits(&s->gb, 16) + 1;
583 h = get_bits(&s->gb, 16) + 1;
584 if (get_bits1(&s->gb)) // display size
585 skip_bits(&s->gb, 32);
// inter frames: 3 active reference slots, each with a sign-bias bit
587 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
588 s->s.h.refidx[0] = get_bits(&s->gb, 3);
589 s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
590 s->s.h.refidx[1] = get_bits(&s->gb, 3);
591 s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
592 s->s.h.refidx[2] = get_bits(&s->gb, 3);
593 s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
594 if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
595 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
596 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
597 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
598 return AVERROR_INVALIDDATA;
// frame size: either copied from one of the references, or coded explicitly
600 if (get_bits1(&s->gb)) {
601 w = s->s.refs[s->s.h.refidx[0]].f->width;
602 h = s->s.refs[s->s.h.refidx[0]].f->height;
603 } else if (get_bits1(&s->gb)) {
604 w = s->s.refs[s->s.h.refidx[1]].f->width;
605 h = s->s.refs[s->s.h.refidx[1]].f->height;
606 } else if (get_bits1(&s->gb)) {
607 w = s->s.refs[s->s.h.refidx[2]].f->width;
608 h = s->s.refs[s->s.h.refidx[2]].f->height;
610 w = get_bits(&s->gb, 16) + 1;
611 h = get_bits(&s->gb, 16) + 1;
613 // Note that in this code, "CUR_FRAME" is actually before we
614 // have formally allocated a frame, and thus actually represents
616 s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
617 s->s.frames[CUR_FRAME].tf.f->height == h;
618 if (get_bits1(&s->gb)) // display size
619 skip_bits(&s->gb, 32);
620 s->s.h.highprecisionmvs = get_bits1(&s->gb);
621 s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction is only possible when the refs disagree in sign bias;
// the ref shared by both same-bias refs becomes the fixed compound ref
623 s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
624 s->s.h.signbias[0] != s->s.h.signbias[2];
625 if (s->s.h.allowcompinter) {
626 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
627 s->s.h.fixcompref = 2;
628 s->s.h.varcompref[0] = 0;
629 s->s.h.varcompref[1] = 1;
630 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
631 s->s.h.fixcompref = 1;
632 s->s.h.varcompref[0] = 0;
633 s->s.h.varcompref[1] = 2;
635 s->s.h.fixcompref = 0;
636 s->s.h.varcompref[0] = 1;
637 s->s.h.varcompref[1] = 2;
642 s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
643 s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
644 s->s.h.framectxid = c = get_bits(&s->gb, 2);
646 /* loopfilter header data */
647 if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
648 // reset loopfilter defaults
649 s->s.h.lf_delta.ref[0] = 1;
650 s->s.h.lf_delta.ref[1] = 0;
651 s->s.h.lf_delta.ref[2] = -1;
652 s->s.h.lf_delta.ref[3] = -1;
653 s->s.h.lf_delta.mode[0] = 0;
654 s->s.h.lf_delta.mode[1] = 0;
655 memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
657 s->s.h.filter.level = get_bits(&s->gb, 6);
658 sharp = get_bits(&s->gb, 3);
659 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
660 // the old cache values since they are still valid
661 if (s->s.h.filter.sharpness != sharp)
662 memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
663 s->s.h.filter.sharpness = sharp;
664 if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
665 if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
666 for (i = 0; i < 4; i++)
667 if (get_bits1(&s->gb))
668 s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
669 for (i = 0; i < 2; i++)
670 if (get_bits1(&s->gb))
671 s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
675 /* quantization header data */
676 s->s.h.yac_qi = get_bits(&s->gb, 8);
677 s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
678 s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
679 s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
680 s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
681 s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
683 ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
685 /* segmentation header info */
686 if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
687 if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
688 for (i = 0; i < 7; i++)
689 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
690 get_bits(&s->gb, 8) : 255;
691 if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
692 for (i = 0; i < 3; i++)
693 s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
694 get_bits(&s->gb, 8) : 255;
// per-segment feature data (quantizer, loopfilter, ref, skip)
698 if (get_bits1(&s->gb)) {
699 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
700 for (i = 0; i < 8; i++) {
701 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
702 s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
703 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
704 s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
705 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
706 s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
707 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
712 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
713 for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
714 int qyac, qydc, quvac, quvdc, lflvl, sh;
716 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
717 if (s->s.h.segmentation.absolute_vals)
718 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
720 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
722 qyac = s->s.h.yac_qi;
724 qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
725 quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
726 quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
727 qyac = av_clip_uintp2(qyac, 8);
729 s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
730 s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
731 s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
732 s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
// loopfilter level per segment, with optional ref/mode deltas (scaled by sh)
734 sh = s->s.h.filter.level >= 32;
735 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
736 if (s->s.h.segmentation.absolute_vals)
737 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
739 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
741 lflvl = s->s.h.filter.level;
743 if (s->s.h.lf_delta.enabled) {
744 s->s.h.segmentation.feat[i].lflvl[0][0] =
745 s->s.h.segmentation.feat[i].lflvl[0][1] =
746 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
747 for (j = 1; j < 4; j++) {
748 s->s.h.segmentation.feat[i].lflvl[j][0] =
749 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
750 s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
751 s->s.h.segmentation.feat[i].lflvl[j][1] =
752 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
753 s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
756 memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
757 sizeof(s->s.h.segmentation.feat[i].lflvl));
// allocate/resize per-frame buffers now that w/h are known
762 if ((res = update_size(ctx, w, h)) < 0) {
763 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
// tiling: clamp log2_tile_cols into the valid range derived from sb_cols,
// then read the extra increment bits
767 for (s->s.h.tiling.log2_tile_cols = 0;
768 s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
769 s->s.h.tiling.log2_tile_cols++) ;
770 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
771 max = FFMAX(0, max - 1);
772 while (max > s->s.h.tiling.log2_tile_cols) {
773 if (get_bits1(&s->gb))
774 s->s.h.tiling.log2_tile_cols++;
778 s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
779 s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
780 if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
781 s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
// one range coder per tile column
782 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
783 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
785 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
786 return AVERROR(ENOMEM);
790 /* check reference frames */
791 if (!s->s.h.keyframe && !s->s.h.intraonly) {
792 for (i = 0; i < 3; i++) {
793 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
794 int refw = ref->width, refh = ref->height;
796 if (ref->format != ctx->pix_fmt) {
797 av_log(ctx, AV_LOG_ERROR,
798 "Ref pixfmt (%s) did not match current frame (%s)",
799 av_get_pix_fmt_name(ref->format),
800 av_get_pix_fmt_name(ctx->pix_fmt));
801 return AVERROR_INVALIDDATA;
802 } else if (refw == w && refh == h) {
803 s->mvscale[i][0] = s->mvscale[i][1] = 0;
// scaled reference: ratio must stay within [1/2, 16] per dimension
805 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
806 av_log(ctx, AV_LOG_ERROR,
807 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
809 return AVERROR_INVALIDDATA;
811 s->mvscale[i][0] = (refw << 14) / w;
812 s->mvscale[i][1] = (refh << 14) / h;
813 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
814 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
// reset probability contexts to defaults on keyframe/error-resilient/resetctx
819 if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
820 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
821 s->prob_ctx[3].p = vp9_default_probs;
822 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
823 sizeof(vp9_default_coef_probs));
824 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
825 sizeof(vp9_default_coef_probs));
826 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
827 sizeof(vp9_default_coef_probs));
828 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
829 sizeof(vp9_default_coef_probs));
830 } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
831 s->prob_ctx[c].p = vp9_default_probs;
832 memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
833 sizeof(vp9_default_coef_probs));
836 // next 16 bits is size of the rest of the header (arith-coded)
837 s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
838 s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
840 data2 = align_get_bits(&s->gb);
841 if (size2 > size - (data2 - data)) {
842 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
843 return AVERROR_INVALIDDATA;
845 ff_vp56_init_range_decoder(&s->c, data2, size2);
846 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
847 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
848 return AVERROR_INVALIDDATA;
// reset adaptation counters (coef/eob only for intra frames, all otherwise)
851 if (s->s.h.keyframe || s->s.h.intraonly) {
852 memset(s->counts.coef, 0, sizeof(s->counts.coef));
853 memset(s->counts.eob, 0, sizeof(s->counts.eob));
855 memset(&s->counts, 0, sizeof(s->counts));
857 // FIXME is it faster to not copy here, but do it down in the fw updates
858 // as explicit copies if the fw update is missing (and skip the copy upon
860 s->prob.p = s->prob_ctx[c].p;
// transform mode + forward updates of the tx size probabilities
863 if (s->s.h.lossless) {
864 s->s.h.txfmmode = TX_4X4;
866 s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
867 if (s->s.h.txfmmode == 3)
868 s->s.h.txfmmode += vp8_rac_get(&s->c);
870 if (s->s.h.txfmmode == TX_SWITCHABLE) {
871 for (i = 0; i < 2; i++)
872 if (vp56_rac_get_prob_branchy(&s->c, 252))
873 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
874 for (i = 0; i < 2; i++)
875 for (j = 0; j < 2; j++)
876 if (vp56_rac_get_prob_branchy(&s->c, 252))
877 s->prob.p.tx16p[i][j] =
878 update_prob(&s->c, s->prob.p.tx16p[i][j]);
879 for (i = 0; i < 2; i++)
880 for (j = 0; j < 3; j++)
881 if (vp56_rac_get_prob_branchy(&s->c, 252))
882 s->prob.p.tx32p[i][j] =
883 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// coefficient probability updates, one set per tx size
888 for (i = 0; i < 4; i++) {
889 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
890 if (vp8_rac_get(&s->c)) {
891 for (j = 0; j < 2; j++)
892 for (k = 0; k < 2; k++)
893 for (l = 0; l < 6; l++)
894 for (m = 0; m < 6; m++) {
895 uint8_t *p = s->prob.coef[i][j][k][l][m];
896 uint8_t *r = ref[j][k][l][m];
897 if (m >= 3 && l == 0) // dc only has 3 pt
899 for (n = 0; n < 3; n++) {
900 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
901 p[n] = update_prob(&s->c, r[n]);
// no update flag: copy the reference probabilities unchanged
909 for (j = 0; j < 2; j++)
910 for (k = 0; k < 2; k++)
911 for (l = 0; l < 6; l++)
912 for (m = 0; m < 6; m++) {
913 uint8_t *p = s->prob.coef[i][j][k][l][m];
914 uint8_t *r = ref[j][k][l][m];
915 if (m > 3 && l == 0) // dc only has 3 pt
921 if (s->s.h.txfmmode == i)
926 for (i = 0; i < 3; i++)
927 if (vp56_rac_get_prob_branchy(&s->c, 252))
928 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
// inter-specific probability updates
929 if (!s->s.h.keyframe && !s->s.h.intraonly) {
930 for (i = 0; i < 7; i++)
931 for (j = 0; j < 3; j++)
932 if (vp56_rac_get_prob_branchy(&s->c, 252))
933 s->prob.p.mv_mode[i][j] =
934 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
936 if (s->s.h.filtermode == FILTER_SWITCHABLE)
937 for (i = 0; i < 4; i++)
938 for (j = 0; j < 2; j++)
939 if (vp56_rac_get_prob_branchy(&s->c, 252))
940 s->prob.p.filter[i][j] =
941 update_prob(&s->c, s->prob.p.filter[i][j]);
943 for (i = 0; i < 4; i++)
944 if (vp56_rac_get_prob_branchy(&s->c, 252))
945 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
947 if (s->s.h.allowcompinter) {
948 s->s.h.comppredmode = vp8_rac_get(&s->c);
949 if (s->s.h.comppredmode)
950 s->s.h.comppredmode += vp8_rac_get(&s->c);
951 if (s->s.h.comppredmode == PRED_SWITCHABLE)
952 for (i = 0; i < 5; i++)
953 if (vp56_rac_get_prob_branchy(&s->c, 252))
955 update_prob(&s->c, s->prob.p.comp[i]);
957 s->s.h.comppredmode = PRED_SINGLEREF;
960 if (s->s.h.comppredmode != PRED_COMPREF) {
961 for (i = 0; i < 5; i++) {
962 if (vp56_rac_get_prob_branchy(&s->c, 252))
963 s->prob.p.single_ref[i][0] =
964 update_prob(&s->c, s->prob.p.single_ref[i][0]);
965 if (vp56_rac_get_prob_branchy(&s->c, 252))
966 s->prob.p.single_ref[i][1] =
967 update_prob(&s->c, s->prob.p.single_ref[i][1]);
971 if (s->s.h.comppredmode != PRED_SINGLEREF) {
972 for (i = 0; i < 5; i++)
973 if (vp56_rac_get_prob_branchy(&s->c, 252))
974 s->prob.p.comp_ref[i] =
975 update_prob(&s->c, s->prob.p.comp_ref[i]);
978 for (i = 0; i < 4; i++)
979 for (j = 0; j < 9; j++)
980 if (vp56_rac_get_prob_branchy(&s->c, 252))
981 s->prob.p.y_mode[i][j] =
982 update_prob(&s->c, s->prob.p.y_mode[i][j]);
984 for (i = 0; i < 4; i++)
985 for (j = 0; j < 4; j++)
986 for (k = 0; k < 3; k++)
987 if (vp56_rac_get_prob_branchy(&s->c, 252))
988 s->prob.p.partition[3 - i][j][k] =
989 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
991 // mv fields don't use the update_prob subexp model for some reason
992 for (i = 0; i < 3; i++)
993 if (vp56_rac_get_prob_branchy(&s->c, 252))
994 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
996 for (i = 0; i < 2; i++) {
997 if (vp56_rac_get_prob_branchy(&s->c, 252))
998 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1000 for (j = 0; j < 10; j++)
1001 if (vp56_rac_get_prob_branchy(&s->c, 252))
1002 s->prob.p.mv_comp[i].classes[j] =
1003 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1005 if (vp56_rac_get_prob_branchy(&s->c, 252))
1006 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1008 for (j = 0; j < 10; j++)
1009 if (vp56_rac_get_prob_branchy(&s->c, 252))
1010 s->prob.p.mv_comp[i].bits[j] =
1011 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1014 for (i = 0; i < 2; i++) {
1015 for (j = 0; j < 2; j++)
1016 for (k = 0; k < 3; k++)
1017 if (vp56_rac_get_prob_branchy(&s->c, 252))
1018 s->prob.p.mv_comp[i].class0_fp[j][k] =
1019 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1021 for (j = 0; j < 3; j++)
1022 if (vp56_rac_get_prob_branchy(&s->c, 252))
1023 s->prob.p.mv_comp[i].fp[j] =
1024 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// high-precision MV probabilities only when enabled in the header
1027 if (s->s.h.highprecisionmvs) {
1028 for (i = 0; i < 2; i++) {
1029 if (vp56_rac_get_prob_branchy(&s->c, 252))
1030 s->prob.p.mv_comp[i].class0_hp =
1031 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1033 if (vp56_rac_get_prob_branchy(&s->c, 252))
1034 s->prob.p.mv_comp[i].hp =
1035 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total header size = uncompressed part + compressed part
1040 return (data2 - data) + size2;
/* Clamp the motion vector *src component-wise into the legal range for the
 * current block (s->min_mv / s->max_mv, set up by the caller per block
 * position) and store the result in *dst. dst may alias src. */
1043 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1046     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1047     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/* Derive a predicted motion vector (*pmv) for reference frame 'ref'.
 *
 * The search order mirrors libvpx's MV reference selection:
 *   1) for sub-8x8 sub-blocks (sb >= 0, idx set), previously decoded MVs of
 *      earlier sub-blocks of the same block (RETURN_DIRECT_MV);
 *   2) the above / left neighbour context MVs that used the same 'ref';
 *   3) up to 8 spatial neighbours (mv_ref_blk_off, per block size) in the
 *      current frame using the same 'ref';
 *   4) the co-located MV in the previous frame (use_last_frame_mvs);
 *   5) the same neighbours / co-located MV with a *different* reference,
 *      sign-flipped when the two references have opposite sign bias
 *      (RETURN_SCALE_MV).
 * 'z' selects the first vs. second MV of a compound block; 'sb' is the
 * sub-block index (or -1). The first one or two distinct candidates found
 * decide the result; the fallthrough at the end clamps whatever is in *pmv.
 * Deliberate libvpx bug-compatibilities are marked with BUG comments. */
1050 static void find_ref_mvs(VP9Context *s,
1051                          VP56mv *pmv, int ref, int z, int idx, int sb)
1053     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1054         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
1055                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
1056         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
1057                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
1058         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
1059                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
1060         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
1061                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1062         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
1063                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1064         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
1065                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
1066         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
1067                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1068         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
1069                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
1070         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
1071                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
1072         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1073                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1074         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1075                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1076         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1077                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1078         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1079                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1082     int row = s->row, col = s->col, row7 = s->row7;
1083     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
     /* 0x8000,0x8000 can never be a real clamped MV, so it doubles as a
      * "no candidate stored yet" sentinel for the 32-bit packed MV. */
1084 #define INVALID_MV 0x80008000U
1085     uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
     /* Return candidate 'mv' unclamped if it is the first candidate, or if
      * it differs from the first stored one (mem). */
1088 #define RETURN_DIRECT_MV(mv) \
1090         uint32_t m = AV_RN32A(&mv); \
1094         } else if (mem == INVALID_MV) { \
1096         } else if (m != mem) { \
     /* Sub-8x8: reuse MVs of earlier sub-blocks of this same block first. */
1103         if (sb == 2 || sb == 1) {
1104             RETURN_DIRECT_MV(b->mv[0][z]);
1105         } else if (sb == 3) {
1106             RETURN_DIRECT_MV(b->mv[2][z]);
1107             RETURN_DIRECT_MV(b->mv[1][z]);
1108             RETURN_DIRECT_MV(b->mv[0][z]);
     /* Like RETURN_DIRECT_MV but clamps the candidate; for the second
      * candidate of a sub-8x8 block, the distinctness comparison is done on
      * the *unclamped* value (mem_sub8x8) to stay bug-compatible with
      * libvpx (see the BUG comment in the macro body). */
1111 #define RETURN_MV(mv) \
1116             av_assert2(idx == 1); \
1117             av_assert2(mem != INVALID_MV); \
1118             if (mem_sub8x8 == INVALID_MV) { \
1119                 clamp_mv(&tmp, &mv, s); \
1120                 m = AV_RN32A(&tmp); \
1125                 mem_sub8x8 = AV_RN32A(&mv); \
1126             } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1127                 clamp_mv(&tmp, &mv, s); \
1128                 m = AV_RN32A(&tmp); \
1132                     /* BUG I'm pretty sure this isn't the intention */ \
1138             uint32_t m = AV_RN32A(&mv); \
1140                 clamp_mv(pmv, &mv, s); \
1142             } else if (mem == INVALID_MV) { \
1144             } else if (m != mem) { \
1145                 clamp_mv(pmv, &mv, s); \
     /* Directly-above neighbour's stored context MV, same reference. */
1152         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1153         if (mv->ref[0] == ref) {
1154             RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1155         } else if (mv->ref[1] == ref) {
1156             RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
     /* Left neighbour, same reference (only within the current tile). */
1159     if (col > s->tile_col_start) {
1160         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1161         if (mv->ref[0] == ref) {
1162             RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1163         } else if (mv->ref[1] == ref) {
1164             RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1172     // previously coded MVs in this neighbourhood, using same reference frame
1173     for (; i < 8; i++) {
1174         int c = p[i][0] + col, r = p[i][1] + row;
1176         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1177             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1179             if (mv->ref[0] == ref) {
1180                 RETURN_MV(mv->mv[0]);
1181             } else if (mv->ref[1] == ref) {
1182                 RETURN_MV(mv->mv[1]);
1187     // MV at this position in previous frame, using same reference frame
1188     if (s->s.h.use_last_frame_mvs) {
1189         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
         /* In single-pass frame-threaded decoding, wait until the reference
          * thread has decoded this superblock row before reading its MVs. */
1191         if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1192             ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1193         if (mv->ref[0] == ref) {
1194             RETURN_MV(mv->mv[0]);
1195         } else if (mv->ref[1] == ref) {
1196             RETURN_MV(mv->mv[1]);
     /* When the candidate's reference has the opposite sign bias to 'ref',
      * negate (mirror) the MV before returning it. */
1200 #define RETURN_SCALE_MV(mv, scale) \
1203         VP56mv mv_temp = { -mv.x, -mv.y }; \
1204         RETURN_MV(mv_temp); \
1210     // previously coded MVs in this neighbourhood, using different reference frame
1211     for (i = 0; i < 8; i++) {
1212         int c = p[i][0] + col, r = p[i][1] + row;
1214         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1215             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1217             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1218                 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1220             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1221                 // BUG - libvpx has this condition regardless of whether
1222                 // we used the first ref MV and pre-scaling
1223                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1224                 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1229     // MV at this position in previous frame, using different reference frame
1230     if (s->s.h.use_last_frame_mvs) {
1231         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1233         // no need to await_progress, because we already did that above
1234         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1235             RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1237         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1238             // BUG - libvpx has this condition regardless of whether
1239             // we used the first ref MV and pre-scaling
1240             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1241             RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
     /* Fewer than the required number of distinct candidates found: clamp
      * whatever ended up in *pmv and return that. */
1246     clamp_mv(pmv, pmv, s);
1249 #undef RETURN_SCALE_MV
/* Decode one MV component (idx 0 = row/y, idx 1 = col/x) from the range
 * coder: sign, magnitude class, then either class0 (small magnitudes with
 * their own fractional/high-precision probs) or per-bit magnitude bits
 * followed by shared fractional (fp) and high-precision (hp) bits. When
 * hp is 0 the low-precision path forces the hp bit to 1, matching libvpx
 * (including its entropy-counting bug, noted below). Returns the signed
 * component value; also updates s->counts for backward adaptation. */
1252 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1254     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1255     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1256                                 s->prob.p.mv_comp[idx].classes);
1258     s->counts.mv_comp[idx].sign[sign]++;
1259     s->counts.mv_comp[idx].classes[c]++;
     /* Class >= 1: read c magnitude bits MSB-first, then fp and hp bits. */
1263         for (n = 0, m = 0; m < c; m++) {
1264             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1266             s->counts.mv_comp[idx].bits[m][bit]++;
1269         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1271         s->counts.mv_comp[idx].fp[bit]++;
1273             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1274             s->counts.mv_comp[idx].hp[bit]++;
1278             // bug in libvpx - we count for bw entropy purposes even if the
1280             s->counts.mv_comp[idx].hp[1]++;
     /* Class 0: small magnitude; class0 bit plus class0-specific fp/hp. */
1284         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1285         s->counts.mv_comp[idx].class0[n]++;
1286         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1287                                s->prob.p.mv_comp[idx].class0_fp[n]);
1288         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1289         n = (n << 3) | (bit << 1);
1291             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1292             s->counts.mv_comp[idx].class0_hp[bit]++;
1296             // bug in libvpx - we count for bw entropy purposes even if the
1298             s->counts.mv_comp[idx].class0_hp[1]++;
     /* Magnitude is stored biased by 1; apply the sign. */
1302     return sign ? -(n + 1) : (n + 1);
/* Fill mv[0] (and mv[1] for compound blocks) for the current (sub-)block
 * according to 'mode': ZEROMV clears them, NEARESTMV/NEARMV take a
 * predicted MV from find_ref_mvs(), NEWMV additionally adds a decoded MV
 * residual on top of the NEARESTMV predictor. 'sb' is the sub-block index
 * (-1 for whole-block). High-precision MV bits are suppressed when the
 * predictor magnitude is >= 64 in either component, per the spec. */
1305 static void fill_mv(VP9Context *s,
1306                     VP56mv *mv, int mode, int sb)
1310     if (mode == ZEROMV) {
         /* First (or only) reference. */
1315         // FIXME cache this value and reuse for other subblocks
1316         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1317                      mode == NEWMV ? -1 : sb);
1318         // FIXME maybe move this code into find_ref_mvs()
1319         if ((mode == NEWMV || sb == -1) &&
1320             !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1334         if (mode == NEWMV) {
1335             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1336                                               s->prob.p.mv_joint);
1338             s->counts.mv_joint[j]++;
             /* The joint code says which components carry a residual. */
1339             if (j >= MV_JOINT_V)
1340                 mv[0].y += read_mv_component(s, 0, hp);
1342                 mv[0].x += read_mv_component(s, 1, hp);
         /* Second reference of a compound block: same procedure. */
1346             // FIXME cache this value and reuse for other subblocks
1347             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1348                          mode == NEWMV ? -1 : sb);
1349             if ((mode == NEWMV || sb == -1) &&
1350                 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1364             if (mode == NEWMV) {
1365                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1366                                                   s->prob.p.mv_joint);
1368                 s->counts.mv_joint[j]++;
1369                 if (j >= MV_JOINT_V)
1370                     mv[1].y += read_mv_component(s, 0, hp);
1372                     mv[1].x += read_mv_component(s, 1, hp);
/* Fill a w x h byte region at 'ptr' (row pitch 'stride') with value 'v',
 * using progressively wider aligned stores (16/32/64-bit splat constants)
 * chosen by width. Used to splat per-block context values (e.g. seg_id)
 * into 2D context maps. NOTE(review): only fragments of the width switch
 * are visible here — the per-width store loops are not shown. */
1378 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1379                                        ptrdiff_t stride, int v)
1389         int v16 = v * 0x0101;
1397         uint32_t v32 = v * 0x01010101;
1406         uint64_t v64 = v * 0x0101010101010101ULL;
1412         uint32_t v32 = v * 0x01010101;
1415             AV_WN32A(ptr + 4, v32);
/* Parse all per-block mode information for the current block (position in
 * s->row/s->col, size in b->bs): segment id, skip flag, intra/inter flag,
 * transform size, intra prediction modes or inter modes + reference
 * frame(s) + interpolation filter + motion vectors. Afterwards, splat the
 * decoded values into the above/left context arrays used by later blocks,
 * and store the reference/MV pair per 8x8 unit into the frame-wide MV
 * buffer used by find_ref_mvs() and by the next frame. Context derivation
 * (the long 'c = ...' cascades) mirrors libvpx exactly and is
 * order-sensitive; all s->counts updates feed backward probability
 * adaptation. */
1424 static void decode_mode(AVCodecContext *ctx)
     /* Partition context bitmasks per block size, splatted below via
      * SET_CTXS into the above/left partition context arrays. */
1426     static const uint8_t left_ctx[N_BS_SIZES] = {
1427         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1429     static const uint8_t above_ctx[N_BS_SIZES] = {
1430         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
     /* Largest transform size allowed for each block size. */
1432     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1433         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1434         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1436     VP9Context *s = ctx->priv_data;
1438     int row = s->row, row7 = s->row7, col = s->col;
1439     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
     /* w4/h4 are the block dimensions in 8x8 units clipped to the frame. */
1440     int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1441     int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1442     int have_a = row > 0, have_l = col > s->tile_col_start;
1443     int vref, filter_id;
     /* --- segment id: explicit, temporally predicted, or 0 --- */
1445     if (!s->s.h.segmentation.enabled) {
1447     } else if (s->s.h.keyframe || s->s.h.intraonly) {
1448         b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1449                     vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1450     } else if (!s->s.h.segmentation.update_map ||
1451                (s->s.h.segmentation.temporal &&
1452                 vp56_rac_get_prob_branchy(&s->c,
1453                     s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1454                                                   s->left_segpred_ctx[row7]]))) {
         /* Predicted from the co-located segmentation map of the reference
          * frame: take the minimum seg_id over the covered 8x8 units. */
1455         if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1457             uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
1459             if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1460                 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1461             for (y = 0; y < h4; y++) {
1462                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1463                 for (x = 0; x < w4; x++)
1464                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1466             av_assert1(pred < 8);
1472         memset(&s->above_segpred_ctx[col], 1, w4);
1473         memset(&s->left_segpred_ctx[row7], 1, h4);
1475         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1476                                      s->s.h.segmentation.prob);
1478         memset(&s->above_segpred_ctx[col], 0, w4);
1479         memset(&s->left_segpred_ctx[row7], 0, h4);
1481     if (s->s.h.segmentation.enabled &&
1482         (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1483         setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1484                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
     /* --- skip flag (segment feature or coded) --- */
1487     b->skip = s->s.h.segmentation.enabled &&
1488         s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1490         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1491         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1492         s->counts.skip[c][b->skip]++;
     /* --- intra/inter flag --- */
1495     if (s->s.h.keyframe || s->s.h.intraonly) {
1497     } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1498         b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1502         if (have_a && have_l) {
1503             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1506             c = have_a ? 2 * s->above_intra_ctx[col] :
1507                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1509         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1510         s->counts.intra[c][bit]++;
     /* --- transform size (coded when switchable, else capped maximum) --- */
1514     if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1518                 c = (s->above_skip_ctx[col] ? max_tx :
1519                      s->above_txfm_ctx[col]) +
1520                     (s->left_skip_ctx[row7] ? max_tx :
1521                      s->left_txfm_ctx[row7]) > max_tx;
1523                 c = s->above_skip_ctx[col] ? 1 :
1524                     (s->above_txfm_ctx[col] * 2 > max_tx);
1526         } else if (have_l) {
1527             c = s->left_skip_ctx[row7] ? 1 :
1528                 (s->left_txfm_ctx[row7] * 2 > max_tx);
         /* Unary-coded tx size, at most up to max_tx for this block size. */
1534             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1536                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1538                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1540             s->counts.tx32p[c][b->tx]++;
1543             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1545                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1546             s->counts.tx16p[c][b->tx]++;
1549             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1550             s->counts.tx8p[c][b->tx]++;
1557         b->tx = FFMIN(max_tx, s->s.h.txfmmode);
     /* --- keyframe/intra-only: intra modes with fixed kf probabilities,
      *     contexted on the above/left decoded modes --- */
1560     if (s->s.h.keyframe || s->s.h.intraonly) {
1561         uint8_t *a = &s->above_mode_ctx[col * 2];
1562         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1565         if (b->bs > BS_8x8) {
1566             // FIXME the memory storage intermediates here aren't really
1567             // necessary, they're just there to make the code slightly
1569             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1570                                                  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1571             if (b->bs != BS_8x4) {
1572                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1573                                               vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1574                 l[0] = a[1] = b->mode[1];
1576                 l[0] = a[1] = b->mode[1] = b->mode[0];
1578             if (b->bs != BS_4x8) {
1579                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1580                                                      vp9_default_kf_ymode_probs[a[0]][l[1]]);
1581                 if (b->bs != BS_8x4) {
1582                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1583                                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1584                     l[1] = a[1] = b->mode[3];
1586                     l[1] = a[1] = b->mode[3] = b->mode[2];
1589                 b->mode[2] = b->mode[0];
1590                 l[1] = a[1] = b->mode[3] = b->mode[1];
1593             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1594                                           vp9_default_kf_ymode_probs[*a][*l]);
1595             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1596             // FIXME this can probably be optimized
1597             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1598             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1600         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1601                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
     /* --- inter frame but intra block: adaptive y_mode probabilities --- */
1602     } else if (b->intra) {
1604         if (b->bs > BS_8x8) {
1605             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1606                                           s->prob.p.y_mode[0]);
1607             s->counts.y_mode[0][b->mode[0]]++;
1608             if (b->bs != BS_8x4) {
1609                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1610                                               s->prob.p.y_mode[0]);
1611                 s->counts.y_mode[0][b->mode[1]]++;
1613                 b->mode[1] = b->mode[0];
1615             if (b->bs != BS_4x8) {
1616                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1617                                               s->prob.p.y_mode[0]);
1618                 s->counts.y_mode[0][b->mode[2]]++;
1619                 if (b->bs != BS_8x4) {
1620                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1621                                                   s->prob.p.y_mode[0]);
1622                     s->counts.y_mode[0][b->mode[3]]++;
1624                     b->mode[3] = b->mode[2];
1627                 b->mode[2] = b->mode[0];
1628                 b->mode[3] = b->mode[1];
1631             static const uint8_t size_group[10] = {
1632                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1634             int sz = size_group[b->bs];
1636             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1637                                           s->prob.p.y_mode[sz]);
1638             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1639             s->counts.y_mode[sz][b->mode[3]]++;
1641         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1642                                      s->prob.p.uv_mode[b->mode[3]]);
1643         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
     /* --- inter block: references, comp-pred, filter, modes, MVs --- */
1645         static const uint8_t inter_mode_ctx_lut[14][14] = {
1646             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1647             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1648             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1649             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1650             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1651             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1652             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1653             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1654             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1655             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1656             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1657             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1658             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1659             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
         /* Segment feature may pin the reference frame directly. */
1662         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1663             av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1665             b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1667             // read comp_pred flag
1668             if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1669                 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1673                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1676                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1678                         } else if (s->above_comp_ctx[col]) {
1679                             c = 2 + (s->left_intra_ctx[row7] ||
1680                                      s->left_ref_ctx[row7] == s->s.h.fixcompref);
1681                         } else if (s->left_comp_ctx[row7]) {
1682                             c = 2 + (s->above_intra_ctx[col] ||
1683                                      s->above_ref_ctx[col] == s->s.h.fixcompref);
1685                             c = (!s->above_intra_ctx[col] &&
1686                                  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1687                                 (!s->left_intra_ctx[row7] &&
1688                                  s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1691                         c = s->above_comp_ctx[col] ? 3 :
1692                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1694                 } else if (have_l) {
1695                     c = s->left_comp_ctx[row7] ? 3 :
1696                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1700                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1701                 s->counts.comp[c][b->comp]++;
1704             // read actual references
1705             // FIXME probably cache a few variables here to prevent repetitive
1706             // memory accesses below
1707             if (b->comp) /* two references */ {
1708                 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1710                 b->ref[fix_idx] = s->s.h.fixcompref;
1711                 // FIXME can this codeblob be replaced by some sort of LUT?
1714                     if (s->above_intra_ctx[col]) {
1715                         if (s->left_intra_ctx[row7]) {
1718                             c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1720                     } else if (s->left_intra_ctx[row7]) {
1721                         c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1723                         int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1725                         if (refl == refa && refa == s->s.h.varcompref[1]) {
1727                         } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1728                             if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1729                                 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1732                                 c = (refa == refl) ? 3 : 1;
1734                         } else if (!s->left_comp_ctx[row7]) {
1735                             if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1738                                 c = (refl == s->s.h.varcompref[1] &&
1739                                      refa != s->s.h.varcompref[1]) ? 2 : 4;
1741                         } else if (!s->above_comp_ctx[col]) {
1742                             if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1745                                 c = (refa == s->s.h.varcompref[1] &&
1746                                      refl != s->s.h.varcompref[1]) ? 2 : 4;
1749                             c = (refl == refa) ? 4 : 2;
1753                     if (s->above_intra_ctx[col]) {
1755                     } else if (s->above_comp_ctx[col]) {
1756                         c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1758                         c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1761                 } else if (have_l) {
1762                     if (s->left_intra_ctx[row7]) {
1764                     } else if (s->left_comp_ctx[row7]) {
1765                         c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1767                         c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1772                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1773                 b->ref[var_idx] = s->s.h.varcompref[bit];
1774                 s->counts.comp_ref[c][bit]++;
1775             } else /* single reference */ {
                 /* First single_ref bit: LAST vs. GOLDEN/ALTREF. */
1778                 if (have_a && !s->above_intra_ctx[col]) {
1779                     if (have_l && !s->left_intra_ctx[row7]) {
1780                         if (s->left_comp_ctx[row7]) {
1781                             if (s->above_comp_ctx[col]) {
1782                                 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1783                                          !s->above_ref_ctx[col]);
1785                                 c = (3 * !s->above_ref_ctx[col]) +
1786                                     (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1788                         } else if (s->above_comp_ctx[col]) {
1789                             c = (3 * !s->left_ref_ctx[row7]) +
1790                                 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1792                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1794                     } else if (s->above_intra_ctx[col]) {
1796                     } else if (s->above_comp_ctx[col]) {
1797                         c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1799                         c = 4 * (!s->above_ref_ctx[col]);
1801                 } else if (have_l && !s->left_intra_ctx[row7]) {
1802                     if (s->left_intra_ctx[row7]) {
1804                     } else if (s->left_comp_ctx[row7]) {
1805                         c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1807                         c = 4 * (!s->left_ref_ctx[row7]);
1812                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1813                 s->counts.single_ref[c][0][bit]++;
                 /* Second single_ref bit: GOLDEN vs. ALTREF. */
1817                     // FIXME can this codeblob be replaced by some sort of LUT?
1820                             if (s->left_intra_ctx[row7]) {
1821                                 if (s->above_intra_ctx[col]) {
1823                                 } else if (s->above_comp_ctx[col]) {
1824                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1825                                                  s->above_ref_ctx[col] == 1);
1826                                 } else if (!s->above_ref_ctx[col]) {
1829                                     c = 4 * (s->above_ref_ctx[col] == 1);
1831                             } else if (s->above_intra_ctx[col]) {
1832                                 if (s->left_intra_ctx[row7]) {
1834                                 } else if (s->left_comp_ctx[row7]) {
1835                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1836                                                  s->left_ref_ctx[row7] == 1);
1837                                 } else if (!s->left_ref_ctx[row7]) {
1840                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1842                             } else if (s->above_comp_ctx[col]) {
1843                                 if (s->left_comp_ctx[row7]) {
1844                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1845                                         c = 3 * (s->s.h.fixcompref == 1 ||
1846                                                  s->left_ref_ctx[row7] == 1);
1850                                 } else if (!s->left_ref_ctx[row7]) {
1851                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1852                                                  s->above_ref_ctx[col] == 1);
1854                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1855                                     (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1857                             } else if (s->left_comp_ctx[row7]) {
1858                                 if (!s->above_ref_ctx[col]) {
1859                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1860                                                  s->left_ref_ctx[row7] == 1);
1862                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1863                                     (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1865                             } else if (!s->above_ref_ctx[col]) {
1866                                 if (!s->left_ref_ctx[row7]) {
1869                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1871                             } else if (!s->left_ref_ctx[row7]) {
1872                                 c = 4 * (s->above_ref_ctx[col] == 1);
1874                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1875                                 2 * (s->above_ref_ctx[col] == 1);
1878                             if (s->above_intra_ctx[col] ||
1879                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1881                             } else if (s->above_comp_ctx[col]) {
1882                                 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1884                                 c = 4 * (s->above_ref_ctx[col] == 1);
1887                     } else if (have_l) {
1888                         if (s->left_intra_ctx[row7] ||
1889                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1891                         } else if (s->left_comp_ctx[row7]) {
1892                             c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1894                             c = 4 * (s->left_ref_ctx[row7] == 1);
1899                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1900                     s->counts.single_ref[c][1][bit]++;
1901                     b->ref[0] = 1 + bit;
         /* --- inter mode (whole-block path for <= 8x8) --- */
1906         if (b->bs <= BS_8x8) {
1907             if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1908                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1910                 static const uint8_t off[10] = {
1911                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1914                 // FIXME this needs to use the LUT tables from find_ref_mvs
1915                 // because not all are -1,0/0,-1
1916                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1917                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1919                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1920                                               s->prob.p.mv_mode[c]);
1921                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1922                 s->counts.mv_mode[c][b->mode[0] - 10]++;
         /* --- interpolation filter --- */
1926         if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1929             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1930                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1931                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1932                         s->left_filter_ctx[row7] : 3;
1934                     c = s->above_filter_ctx[col];
1936             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1937                 c = s->left_filter_ctx[row7];
1942             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1943                                          s->prob.p.filter[c]);
1944             s->counts.filter[c][filter_id]++;
1945             b->filter = vp9_filter_lut[filter_id];
1947             b->filter = s->s.h.filtermode;
         /* --- sub-8x8: one mode + MV pair per sub-block --- */
1950         if (b->bs > BS_8x8) {
1951             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1953             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1954                                           s->prob.p.mv_mode[c]);
1955             s->counts.mv_mode[c][b->mode[0] - 10]++;
1956             fill_mv(s, b->mv[0], b->mode[0], 0);
1958             if (b->bs != BS_8x4) {
1959                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1960                                               s->prob.p.mv_mode[c]);
1961                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1962                 fill_mv(s, b->mv[1], b->mode[1], 1);
1964                 b->mode[1] = b->mode[0];
1965                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1966                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1969             if (b->bs != BS_4x8) {
1970                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1971                                               s->prob.p.mv_mode[c]);
1972                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1973                 fill_mv(s, b->mv[2], b->mode[2], 2);
1975                 if (b->bs != BS_8x4) {
1976                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1977                                                   s->prob.p.mv_mode[c]);
1978                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1979                     fill_mv(s, b->mv[3], b->mode[3], 3);
1981                     b->mode[3] = b->mode[2];
1982                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1983                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1986                 b->mode[2] = b->mode[0];
1987                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1988                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1989                 b->mode[3] = b->mode[1];
1990                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1991                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1994             fill_mv(s, b->mv[0], b->mode[0], -1);
1995             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1996             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1997             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1998             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1999             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2000             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
     /* Reference stored into the ref context: the variable ref for compound
      * blocks, otherwise ref[0]. */
2003     vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
     /* SPLAT_CTX writes n copies of 'val' into a context array with a
      * single aligned store where possible (wide-store variant first,
      * 32-bit fallback variant below it). */
2007 #define SPLAT_CTX(var, val, n) \
2009     case 1:  var = val;                                    break; \
2010     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
2011     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
2012     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
2014         uint64_t v64 = val * 0x0101010101010101ULL; \
2015         AV_WN64A(              &var,     v64); \
2016         AV_WN64A(&((uint8_t *) &var)[8], v64); \
2021 #define SPLAT_CTX(var, val, n) \
2023     case 1:  var = val;                         break; \
2024     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
2025     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
2027         uint32_t v32 = val * 0x01010101; \
2028         AV_WN32A(              &var,     v32); \
2029         AV_WN32A(&((uint8_t *) &var)[4], v32); \
2033         uint32_t v32 = val * 0x01010101; \
2034         AV_WN32A(              &var,      v32); \
2035         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
2036         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
2037         AV_WN32A(&((uint8_t *) &var)[12], v32); \
     /* Splat all decoded per-block values into the above (width) and left
      * (height) context arrays for use by subsequent blocks. */
2043     switch (bwh_tab[1][b->bs][0]) {
2044 #define SET_CTXS(dir, off, n) \
2046         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
2047         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
2048         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2049         if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2050             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
2051             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
2052             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
2054                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2055                 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2056                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2061     case 1: SET_CTXS(above, col, 1); break;
2062     case 2: SET_CTXS(above, col, 2); break;
2063     case 4: SET_CTXS(above, col, 4); break;
2064     case 8: SET_CTXS(above, col, 8); break;
2066     switch (bwh_tab[1][b->bs][1]) {
2067     case 1: SET_CTXS(left, row7, 1); break;
2068     case 2: SET_CTXS(left, row7, 2); break;
2069     case 4: SET_CTXS(left, row7, 4); break;
2070     case 8: SET_CTXS(left, row7, 8); break;
     /* Store the MVs into the above/left MV context arrays, which
      * find_ref_mvs() of later blocks reads. Sub-8x8 blocks store the
      * right/bottom sub-block MVs as appropriate. */
2075     if (!s->s.h.keyframe && !s->s.h.intraonly) {
2076         if (b->bs > BS_8x8) {
2077             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2079             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2080             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2081             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2082             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2083             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2084             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2085             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2086             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2088             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2090             for (n = 0; n < w4 * 2; n++) {
2091                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2092                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2094             for (n = 0; n < h4 * 2; n++) {
2095                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2096                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
     /* Store ref/mv per covered 8x8 unit into the frame-wide MV buffer
      * (consumed by find_ref_mvs() of this and the next frame). */
2102     for (y = 0; y < h4; y++) {
2103         int x, o = (row + y) * s->sb_cols * 8 + col;
2104         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2107             for (x = 0; x < w4; x++) {
2111         } else if (b->comp) {
2112             for (x = 0; x < w4; x++) {
2113                 mv[x].ref[0] = b->ref[0];
2114                 mv[x].ref[1] = b->ref[1];
2115                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2116                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2119             for (x = 0; x < w4; x++) {
2120                 mv[x].ref[0] = b->ref[0];
2122                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2128 // FIXME merge cnt/eob arguments?
/*
 * Decode the transform coefficients of a single block from the range coder.
 * Shared worker behind the four decode_coeffs_b*() wrappers below:
 * - is_tx32x32:       use the 32x32 dequant path (dequantized value / 2)
 * - is8bitsperpixel:  selects 16-bit vs. 32-bit coefficient storage and
 *                     whether the two extra high-bitdepth cat6 bits are read
 * - cnt/eob:          per-band/per-context token statistics, accumulated for
 *                     backward probability adaptation
 * - p:                coefficient token probability model [band][ctx][token]
 * - nnz:              initial non-zero context (sum of above/left contexts)
 * - scan/nb:          coefficient scan order and its neighbour table
 * - band_counts:      number of coefficients in each probability band
 * - qmul:             DC ([0]) and AC ([1]) dequantization factors
 * NOTE(review): this view is elided (the function epilogue, including the
 * return statement, is not visible); presumably returns the number of
 * coefficients decoded -- confirm against the full source.
 */
2129 static av_always_inline int
2130 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2131 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2132 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2133 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2134 const int16_t *band_counts, const int16_t *qmul)
2136 int i = 0, band = 0, band_left = band_counts[band];
2137 uint8_t *tp = p[0][nnz];
// per-scan-position token magnitude cache; feeds the neighbour-context
// (nb[]) averaging below
2138 uint8_t cache[1024];
2143 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2144 eob[band][nnz][val]++;
2149 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2150 cnt[band][nnz][0]++;
2152 band_left = band_counts[++band];
// next context = rounded average of the two already-decoded neighbours
2154 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2156 if (++i == n_coeffs)
2157 break; //invalid input; blocks should end with EOB
2162 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2163 cnt[band][nnz][1]++;
2167 // fill in p[3-10] (model fill) - only once per frame for each pos
2169 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2171 cnt[band][nnz][2]++;
2172 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2173 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2174 cache[rc] = val = 2;
2176 val = 3 + vp56_rac_get_prob(c, tp[5]);
2179 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2181 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2182 val = 5 + vp56_rac_get_prob(c, 159);
2184 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2185 val += vp56_rac_get_prob(c, 145);
// cat3 (base 11) / cat4 (base 19): fixed extra-bit probabilities
2189 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2190 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2191 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2192 val += (vp56_rac_get_prob(c, 148) << 1);
2193 val += vp56_rac_get_prob(c, 140);
2195 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2196 val += (vp56_rac_get_prob(c, 155) << 2);
2197 val += (vp56_rac_get_prob(c, 140) << 1);
2198 val += vp56_rac_get_prob(c, 135);
// cat5: base 35, 4 extra bits
2200 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2201 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2202 val += (vp56_rac_get_prob(c, 157) << 3);
2203 val += (vp56_rac_get_prob(c, 141) << 2);
2204 val += (vp56_rac_get_prob(c, 134) << 1);
2205 val += vp56_rac_get_prob(c, 130);
// cat6: high-bitdepth streams read two additional top bits (<<17, <<16)
2208 if (!is8bitsperpixel) {
2210 val += vp56_rac_get_prob(c, 255) << 17;
2211 val += vp56_rac_get_prob(c, 255) << 16;
2213 val += (vp56_rac_get_prob(c, 255) << 15);
2214 val += (vp56_rac_get_prob(c, 255) << 14);
2216 val += (vp56_rac_get_prob(c, 254) << 13);
2217 val += (vp56_rac_get_prob(c, 254) << 12);
2218 val += (vp56_rac_get_prob(c, 254) << 11);
2219 val += (vp56_rac_get_prob(c, 252) << 10);
2220 val += (vp56_rac_get_prob(c, 249) << 9);
2221 val += (vp56_rac_get_prob(c, 243) << 8);
2222 val += (vp56_rac_get_prob(c, 230) << 7);
2223 val += (vp56_rac_get_prob(c, 196) << 6);
2224 val += (vp56_rac_get_prob(c, 177) << 5);
2225 val += (vp56_rac_get_prob(c, 153) << 4);
2226 val += (vp56_rac_get_prob(c, 140) << 3);
2227 val += (vp56_rac_get_prob(c, 133) << 2);
2228 val += (vp56_rac_get_prob(c, 130) << 1);
2229 val += vp56_rac_get_prob(c, 129);
// stores a 16-bit (8bpp) or 32-bit (high bitdepth) dequantized coefficient
2233 #define STORE_COEF(c, i, v) do { \
2234 if (is8bitsperpixel) { \
2237 AV_WN32A(&c[i * 2], v); \
2241 band_left = band_counts[++band];
2243 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2245 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2246 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2248 } while (++i < n_coeffs);
/* 8 bpp, tx != 32x32: thin wrapper around decode_coeffs_b_generic(). */
2253 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2254 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2255 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2256 const int16_t (*nb)[2], const int16_t *band_counts,
2257 const int16_t *qmul)
2259 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2260 nnz, scan, nb, band_counts, qmul);
/* 8 bpp, tx == 32x32: thin wrapper around decode_coeffs_b_generic(). */
2263 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2264 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2265 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2266 const int16_t (*nb)[2], const int16_t *band_counts,
2267 const int16_t *qmul)
2269 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2270 nnz, scan, nb, band_counts, qmul);
/* High bitdepth (s->bpp), tx != 32x32: wrapper around decode_coeffs_b_generic(). */
2273 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2274 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2275 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2276 const int16_t (*nb)[2], const int16_t *band_counts,
2277 const int16_t *qmul)
2279 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2280 nnz, scan, nb, band_counts, qmul);
/* High bitdepth (s->bpp), tx == 32x32: wrapper around decode_coeffs_b_generic(). */
2283 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2284 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2285 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2286 const int16_t (*nb)[2], const int16_t *band_counts,
2287 const int16_t *qmul)
2289 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2290 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all luma and chroma coefficients of the current block (s->row/s->col,
 * block descriptor in the context), dispatching per transform size to the
 * decode_coeffs_b*() workers and maintaining the above/left non-zero context
 * arrays. Presumably returns whether any non-zero coefficient was decoded
 * (total_coeff; the return statement is outside this elided view -- confirm).
 */
2293 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2295 VP9Context *s = ctx->priv_data;
2297 int row = s->row, col = s->col;
2298 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2299 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2300 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2301 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2302 int end_x = FFMIN(2 * (s->cols - col), w4);
2303 int end_y = FFMIN(2 * (s->rows - row), h4);
2304 int n, pl, x, y, res;
2305 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2306 int tx = 4 * s->s.h.lossless + b->tx;
2307 const int16_t * const *yscans = vp9_scans[tx];
2308 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2309 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2310 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2311 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2312 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
// coefficients per probability band, indexed by tx size (4x4..32x32);
// the last entry is the remainder of the block
2313 static const int16_t band_counts[4][8] = {
2314 { 1, 2, 3, 4, 3, 16 - 13 },
2315 { 1, 2, 3, 4, 11, 64 - 21 },
2316 { 1, 2, 3, 4, 11, 256 - 21 },
2317 { 1, 2, 3, 4, 11, 1024 - 21 },
2319 const int16_t *y_band_counts = band_counts[b->tx];
2320 const int16_t *uv_band_counts = band_counts[b->uvtx];
2321 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2322 int total_coeff = 0;
// MERGE/MERGE_CTX collapse several 4x4-granularity nnz context entries
// into one boolean per tx-sized unit before decoding; SPLAT/SPLAT_CTX
// expand the resulting per-unit flag back out afterwards
2324 #define MERGE(la, end, step, rd) \
2325 for (n = 0; n < end; n += step) \
2326 la[n] = !!rd(&la[n])
2327 #define MERGE_CTX(step, rd) \
2329 MERGE(l, end_y, step, rd); \
2330 MERGE(a, end_x, step, rd); \
2333 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2334 for (n = 0, y = 0; y < end_y; y += step) { \
2335 for (x = 0; x < end_x; x += step, n += step * step) { \
2336 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2337 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2338 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2339 c, e, p, a[x] + l[y], yscans[txtp], \
2340 ynbs[txtp], y_band_counts, qmul[0]); \
2341 a[x] = l[y] = !!res; \
2342 total_coeff |= !!res; \
2344 AV_WN16A(&s->eob[n], res); \
2351 #define SPLAT(la, end, step, cond) \
2353 for (n = 1; n < end; n += step) \
2354 la[n] = la[n - 1]; \
2355 } else if (step == 4) { \
2357 for (n = 0; n < end; n += step) \
2358 AV_WN32A(&la[n], la[n] * 0x01010101); \
2360 for (n = 0; n < end; n += step) \
2361 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2363 } else /* step == 8 */ { \
2365 if (HAVE_FAST_64BIT) { \
2366 for (n = 0; n < end; n += step) \
2367 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2369 for (n = 0; n < end; n += step) { \
2370 uint32_t v32 = la[n] * 0x01010101; \
2371 AV_WN32A(&la[n], v32); \
2372 AV_WN32A(&la[n + 4], v32); \
2376 for (n = 0; n < end; n += step) \
2377 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2380 #define SPLAT_CTX(step) \
2382 SPLAT(a, end_x, step, end_x == w4); \
2383 SPLAT(l, end_y, step, end_y == h4); \
2389 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2392 MERGE_CTX(2, AV_RN16A);
2393 DECODE_Y_COEF_LOOP(2, 0,);
2397 MERGE_CTX(4, AV_RN32A);
2398 DECODE_Y_COEF_LOOP(4, 0,);
2402 MERGE_CTX(8, AV_RN64A);
2403 DECODE_Y_COEF_LOOP(8, 0, 32);
// chroma: same scheme as luma, but always DCT_DCT scan and AC qmul[1]
2408 #define DECODE_UV_COEF_LOOP(step, v) \
2409 for (n = 0, y = 0; y < end_y; y += step) { \
2410 for (x = 0; x < end_x; x += step, n += step * step) { \
2411 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2412 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2413 16 * step * step, c, e, p, a[x] + l[y], \
2414 uvscan, uvnb, uv_band_counts, qmul[1]); \
2415 a[x] = l[y] = !!res; \
2416 total_coeff |= !!res; \
2418 AV_WN16A(&s->uveob[pl][n], res); \
2420 s->uveob[pl][n] = res; \
2425 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2426 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2427 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
// both chroma planes share the same probability model and contexts
2432 for (pl = 0; pl < 2; pl++) {
2433 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2434 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2437 DECODE_UV_COEF_LOOP(1,);
2440 MERGE_CTX(2, AV_RN16A);
2441 DECODE_UV_COEF_LOOP(2,);
2445 MERGE_CTX(4, AV_RN32A);
2446 DECODE_UV_COEF_LOOP(4,);
2450 MERGE_CTX(8, AV_RN64A);
2451 DECODE_UV_COEF_LOOP(8, 32);
/* 8 bpp entry point for decode_coeffs(). */
2460 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2462 return decode_coeffs(ctx, 1);
/* High-bitdepth entry point for decode_coeffs(). */
2465 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2467 return decode_coeffs(ctx, 0);
/*
 * Prepare the top (*a) and left (l) edge pixel arrays for intra prediction
 * of one tx-sized sub-block, and return the (possibly substituted) intra
 * mode. Modes whose required neighbours are unavailable at a frame/tile
 * boundary are remapped via mode_conv[]; missing edge pixels are padded by
 * replication or filled with the bitdepth-scaled 127/128/129 constants.
 * dst_edge/stride_edge address the frame buffer (used when the sub-block
 * touches the block edge), dst_inner/stride_inner the per-thread buffer;
 * p is the plane index and ss_h/ss_v the chroma subsampling shifts.
 */
2470 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2471 uint8_t *dst_edge, ptrdiff_t stride_edge,
2472 uint8_t *dst_inner, ptrdiff_t stride_inner,
2473 uint8_t *l, int col, int x, int w,
2474 int row, int y, enum TxfmMode tx,
2475 int p, int ss_h, int ss_v, int bytesperpixel)
2477 int have_top = row > 0 || y > 0;
2478 int have_left = col > s->tile_col_start || x > 0;
2479 int have_right = x < w - 1;
// fallback mode when the top and/or left neighbours are unavailable
2481 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2482 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2483 { DC_127_PRED, VERT_PRED } },
2484 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2485 { HOR_PRED, HOR_PRED } },
2486 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2487 { LEFT_DC_PRED, DC_PRED } },
2488 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2489 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2490 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2491 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2492 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2493 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2494 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2495 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2496 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2497 { DC_127_PRED, VERT_LEFT_PRED } },
2498 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2499 { HOR_UP_PRED, HOR_UP_PRED } },
2500 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2501 { HOR_PRED, TM_VP8_PRED } },
// which edge pixels each (post-conversion) mode actually reads
2503 static const struct {
2504 uint8_t needs_left:1;
2505 uint8_t needs_top:1;
2506 uint8_t needs_topleft:1;
2507 uint8_t needs_topright:1;
2508 uint8_t invert_left:1;
2509 } edges[N_INTRA_PRED_MODES] = {
2510 [VERT_PRED] = { .needs_top = 1 },
2511 [HOR_PRED] = { .needs_left = 1 },
2512 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2513 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2514 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2515 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2516 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2517 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2518 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2519 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2520 [LEFT_DC_PRED] = { .needs_left = 1 },
2521 [TOP_DC_PRED] = { .needs_top = 1 },
2522 [DC_128_PRED] = { 0 },
2523 [DC_127_PRED] = { 0 },
2524 [DC_129_PRED] = { 0 }
2527 av_assert2(mode >= 0 && mode < 10);
2528 mode = mode_conv[mode][have_left][have_top];
2529 if (edges[mode].needs_top) {
2530 uint8_t *top, *topleft;
2531 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2532 int n_px_need_tr = 0;
2534 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2537 // if top of sb64-row, use s->intra_pred_data[] instead of
2538 // dst[-stride] for intra prediction (it contains pre- instead of
2539 // post-loopfilter data)
2541 top = !(row & 7) && !y ?
2542 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2543 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2545 topleft = !(row & 7) && !y ?
2546 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2547 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2548 &dst_inner[-stride_inner];
2552 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2553 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2554 n_px_need + n_px_need_tr <= n_px_have) {
2558 if (n_px_need <= n_px_have) {
2559 memcpy(*a, top, n_px_need * bytesperpixel);
// replicate pixel (i2) of v into (num) positions of c, honouring the
// 1- or 2-byte pixel size
2561 #define memset_bpp(c, i1, v, i2, num) do { \
2562 if (bytesperpixel == 1) { \
2563 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2565 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2566 for (n = 0; n < (num); n++) { \
2567 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2571 memcpy(*a, top, n_px_have * bytesperpixel);
2572 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
// fill (num) pixels of c with constant val, honouring pixel size
2575 #define memset_val(c, val, num) do { \
2576 if (bytesperpixel == 1) { \
2577 memset((c), (val), (num)); \
2580 for (n = 0; n < (num); n++) { \
2581 AV_WN16A(&(c)[n * 2], (val)); \
// no top edge available: 127 scaled to the stream bitdepth
2585 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2587 if (edges[mode].needs_topleft) {
2588 if (have_left && have_top) {
// copy one pixel from v[i2] to c[i1], honouring pixel size
2589 #define assign_bpp(c, i1, v, i2) do { \
2590 if (bytesperpixel == 1) { \
2591 (c)[(i1)] = (v)[(i2)]; \
2593 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2596 assign_bpp(*a, -1, topleft, -1);
// write constant v to pixel c[i], honouring pixel size
2598 #define assign_val(c, i, v) do { \
2599 if (bytesperpixel == 1) { \
2602 AV_WN16A(&(c)[(i) * 2], (v)); \
2605 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2608 if (tx == TX_4X4 && edges[mode].needs_topright) {
2609 if (have_top && have_right &&
2610 n_px_need + n_px_need_tr <= n_px_have) {
2611 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2613 memset_bpp(*a, 4, *a, 3, 4);
2618 if (edges[mode].needs_left) {
2620 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2621 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2622 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
// HOR_UP_PRED reads the left edge top-to-bottom; others bottom-to-top
2624 if (edges[mode].invert_left) {
2625 if (n_px_need <= n_px_have) {
2626 for (i = 0; i < n_px_need; i++)
2627 assign_bpp(l, i, &dst[i * stride], -1);
2629 for (i = 0; i < n_px_have; i++)
2630 assign_bpp(l, i, &dst[i * stride], -1);
2631 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2634 if (n_px_need <= n_px_have) {
2635 for (i = 0; i < n_px_need; i++)
2636 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2638 for (i = 0; i < n_px_have; i++)
2639 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2640 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
// no left edge available: 129 scaled to the stream bitdepth
2644 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Reconstruct an intra-coded block: for each tx-sized sub-block, build the
 * prediction edges (check_intra_mode), run the intra predictor and, unless
 * the block is skipped, add the inverse-transformed residual. Luma first,
 * then both chroma planes. y_off/uv_off are byte offsets into the current
 * frame's planes; bytesperpixel is 1 (8bpp) or 2 (high bitdepth).
 */
2651 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2652 ptrdiff_t uv_off, int bytesperpixel)
2654 VP9Context *s = ctx->priv_data;
2656 int row = s->row, col = s->col;
2657 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2658 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2659 int end_x = FFMIN(2 * (s->cols - col), w4);
2660 int end_y = FFMIN(2 * (s->rows - row), h4);
2661 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2662 int uvstep1d = 1 << b->uvtx, p;
2663 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2664 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2665 LOCAL_ALIGNED_32(uint8_t, l, [64]);
// luma
2667 for (n = 0, y = 0; y < end_y; y += step1d) {
2668 uint8_t *ptr = dst, *ptr_r = dst_r;
2669 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2670 ptr_r += 4 * step1d * bytesperpixel, n += step) {
2671 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2673 uint8_t *a = &a_buf[32];
2674 enum TxfmType txtp = vp9_intra_txfm_type[mode];
// eob entries are 16-bit pairs for tx > 8x8 (see decode_coeffs)
2675 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2677 mode = check_intra_mode(s, mode, &a, ptr_r,
2678 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2679 ptr, s->y_stride, l,
2680 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2681 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2683 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2684 s->block + 16 * n * bytesperpixel, eob);
2686 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2687 dst += 4 * step1d * s->y_stride;
// chroma: b->uvmode for every sub-block, always DCT_DCT
2694 step = 1 << (b->uvtx * 2);
2695 for (p = 0; p < 2; p++) {
2696 dst = s->dst[1 + p];
2697 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2698 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2699 uint8_t *ptr = dst, *ptr_r = dst_r;
2700 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2701 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2702 int mode = b->uvmode;
2703 uint8_t *a = &a_buf[32];
2704 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2706 mode = check_intra_mode(s, mode, &a, ptr_r,
2707 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2708 ptr, s->uv_stride, l, col, x, w4, row, y,
2709 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2710 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2712 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2713 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2715 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2716 dst += 4 * uvstep1d * s->uv_stride;
/* 8 bpp entry point for intra_recon(). */
2721 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2723 intra_recon(ctx, y_off, uv_off, 1);
/* High-bitdepth (2 bytes/pixel) entry point for intra_recon(). */
2726 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2728 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation, reference frame at the same resolution as the
 * current frame. Waits (frame-threading) for the needed reference rows,
 * falls back to emulated_edge_mc when the filter footprint leaves the
 * picture, then runs the bw x bh subpel MC function selected by whether
 * mv has a fractional x/y component.
 */
2731 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2732 uint8_t *dst, ptrdiff_t dst_stride,
2733 const uint8_t *ref, ptrdiff_t ref_stride,
2734 ThreadFrame *ref_frame,
2735 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2736 int bw, int bh, int w, int h, int bytesperpixel)
2738 int mx = mv->x, my = mv->y, th;
2742 ref += y * ref_stride + x * bytesperpixel;
2745 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2746 // we use +7 because the last 7 pixels of each sbrow can be changed in
2747 // the longest loopfilter of the next sbrow
2748 th = (y + bh + 4 * !!my + 7) >> 6;
2749 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
// 8-tap filter needs 3 pixels before and 4 after the block in each
// fractional direction; emulate the edge when that area is off-picture
2750 if (x < !!mx * 3 || y < !!my * 3 ||
2751 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2752 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2753 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2755 bw + !!mx * 7, bh + !!my * 7,
2756 x - !!mx * 3, y - !!my * 3, w, h);
2757 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2760 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma motion compensation for both planes (U and V), reference frame at
 * the same resolution as the current frame. The MV is scaled up by the
 * inverse subsampling factor per axis; otherwise mirrors mc_luma_unscaled(),
 * including the edge-emulation fallback (done per plane).
 */
2763 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2764 uint8_t *dst_u, uint8_t *dst_v,
2765 ptrdiff_t dst_stride,
2766 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2767 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2768 ThreadFrame *ref_frame,
2769 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2770 int bw, int bh, int w, int h, int bytesperpixel)
// double the MV component on an axis that is not subsampled
2772 int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2776 ref_u += y * src_stride_u + x * bytesperpixel;
2777 ref_v += y * src_stride_v + x * bytesperpixel;
2780 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2781 // we use +7 because the last 7 pixels of each sbrow can be changed in
2782 // the longest loopfilter of the next sbrow
2783 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2784 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2785 if (x < !!mx * 3 || y < !!my * 3 ||
2786 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2787 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2788 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2790 bw + !!mx * 7, bh + !!my * 7,
2791 x - !!mx * 3, y - !!my * 3, w, h);
2792 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2793 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
// V plane reuses the same edge-emu buffer, so U must be done first
2795 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2796 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2798 bw + !!mx * 7, bh + !!my * 7,
2799 x - !!mx * 3, y - !!my * 3, w, h);
2800 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2801 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2803 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2804 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
// Bind mc_luma_dir/mc_chroma_dir to the *unscaled* MC workers, then
// instantiate inter_pred() twice (8bpp and 16bpp) from vp9_mc_template.c.
2808 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2809 px, py, pw, ph, bw, bh, w, h, i) \
2810 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2811 mv, bw, bh, w, h, bytesperpixel)
2812 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2813 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2814 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2815 row, col, mv, bw, bh, w, h, bytesperpixel)
2817 #define FN(x) x##_8bpp
2818 #define BYTES_PER_PIXEL 1
2819 #include "vp9_mc_template.c"
2821 #undef BYTES_PER_PIXEL
2822 #define FN(x) x##_16bpp
2823 #define BYTES_PER_PIXEL 2
2824 #include "vp9_mc_template.c"
// clean up so the scaled variants below can redefine the same macros
2826 #undef mc_chroma_dir
2828 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation with reference-frame scaling. If the reference
 * happens to match the current frame size, delegates to mc_luma_unscaled();
 * otherwise clips the MV, scales MV and position by the 14-bit scale[]
 * factors, and runs the scaled-MC (smc) function with per-axis 4-bit
 * subpel step[] increments. px/py/pw/ph describe the prediction block
 * within the larger coding block, used for the MV clipping bounds.
 */
2831 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2832 vp9_mc_func (*mc)[2],
2833 uint8_t *dst, ptrdiff_t dst_stride,
2834 const uint8_t *ref, ptrdiff_t ref_stride,
2835 ThreadFrame *ref_frame,
2836 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2837 int px, int py, int pw, int ph,
2838 int bw, int bh, int w, int h, int bytesperpixel,
2839 const uint16_t *scale, const uint8_t *step)
2841 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2842 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2843 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2844 y, x, in_mv, bw, bh, w, h, bytesperpixel);
// 14-bit fixed-point scaling of an MV component or position
2846 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2848 int refbw_m1, refbh_m1;
2852 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2853 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2854 // BUG libvpx seems to scale the two components separately. This introduces
2855 // rounding errors but we have to reproduce them to be exactly compatible
2856 // with the output from libvpx...
2857 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2858 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2862 ref += y * ref_stride + x * bytesperpixel;
// last reference pixel used, relative to (x, y), per axis
2865 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2866 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2867 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2868 // we use +7 because the last 7 pixels of each sbrow can be changed in
2869 // the longest loopfilter of the next sbrow
2870 th = (y + refbh_m1 + 4 + 7) >> 6;
2871 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2872 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2873 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2874 ref - 3 * ref_stride - 3 * bytesperpixel,
2876 refbw_m1 + 8, refbh_m1 + 8,
2877 x - 3, y - 3, w, h);
2878 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2881 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma motion compensation (both planes) with reference-frame scaling.
 * Same structure as mc_luma_scaled(), with per-axis handling that depends
 * on chroma subsampling, and a deliberate reproduction of a libvpx
 * rounding bug on subsampled axes (see webm issue 820 below).
 */
2885 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2886 vp9_mc_func (*mc)[2],
2887 uint8_t *dst_u, uint8_t *dst_v,
2888 ptrdiff_t dst_stride,
2889 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2890 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2891 ThreadFrame *ref_frame,
2892 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2893 int px, int py, int pw, int ph,
2894 int bw, int bh, int w, int h, int bytesperpixel,
2895 const uint16_t *scale, const uint8_t *step)
2897 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2898 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2899 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2900 ref_v, src_stride_v, ref_frame,
2901 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2904 int refbw_m1, refbh_m1;
// horizontal: subsampled path reproduces the libvpx rounding bug
2909 // BUG https://code.google.com/p/webm/issues/detail?id=820
2910 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2911 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2913 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2914 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
// vertical: same scheme as horizontal
2917 // BUG https://code.google.com/p/webm/issues/detail?id=820
2918 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2919 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2921 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2922 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2927 ref_u += y * src_stride_u + x * bytesperpixel;
2928 ref_v += y * src_stride_v + x * bytesperpixel;
2931 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2932 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2933 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2934 // we use +7 because the last 7 pixels of each sbrow can be changed in
2935 // the longest loopfilter of the next sbrow
2936 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2937 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2938 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2939 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2940 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2942 refbw_m1 + 8, refbh_m1 + 8,
2943 x - 3, y - 3, w, h);
2944 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2945 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
// V plane reuses the same edge-emu buffer, so U must be done first
2947 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2948 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2950 refbw_m1 + 8, refbh_m1 + 8,
2951 x - 3, y - 3, w, h);
2952 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2953 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2955 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2956 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
// Bind mc_luma_dir/mc_chroma_dir to the *scaled* MC workers (passing the
// per-reference mvscale/mvstep tables), then instantiate
// inter_pred_scaled() twice (8bpp and 16bpp) from vp9_mc_template.c.
2961 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2962 px, py, pw, ph, bw, bh, w, h, i) \
2963 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2964 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2965 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2966 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2967 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2968 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2969 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2970 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2972 #define FN(x) x##_scaled_8bpp
2973 #define BYTES_PER_PIXEL 1
2974 #include "vp9_mc_template.c"
2976 #undef BYTES_PER_PIXEL
2977 #define FN(x) x##_scaled_16bpp
2978 #define BYTES_PER_PIXEL 2
2979 #include "vp9_mc_template.c"
2981 #undef mc_chroma_dir
2983 #undef BYTES_PER_PIXEL
/*
 * Reconstruct an inter-coded block: run motion-compensated prediction
 * (scaled variant if any used reference has a non-unit mvscale), then,
 * unless the block is skipped, add the inverse-transformed residuals for
 * luma and both chroma planes (always DCT_DCT for inter blocks).
 */
2986 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2988 VP9Context *s = ctx->priv_data;
2990 int row = s->row, col = s->col;
// prediction: pick the scaled or unscaled template instantiation
2992 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2993 if (bytesperpixel == 1) {
2994 inter_pred_scaled_8bpp(ctx);
2996 inter_pred_scaled_16bpp(ctx);
2999 if (bytesperpixel == 1) {
3000 inter_pred_8bpp(ctx);
3002 inter_pred_16bpp(ctx);
3006 /* mostly copied intra_recon() */
3008 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3009 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3010 int end_x = FFMIN(2 * (s->cols - col), w4);
3011 int end_y = FFMIN(2 * (s->rows - row), h4);
3012 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3013 int uvstep1d = 1 << b->uvtx, p;
3014 uint8_t *dst = s->dst[0];
// luma residual
3017 for (n = 0, y = 0; y < end_y; y += step1d) {
3019 for (x = 0; x < end_x; x += step1d,
3020 ptr += 4 * step1d * bytesperpixel, n += step) {
3021 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3024 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3025 s->block + 16 * n * bytesperpixel, eob);
3027 dst += 4 * s->y_stride * step1d;
// chroma residual, both planes
3033 step = 1 << (b->uvtx * 2);
3034 for (p = 0; p < 2; p++) {
3035 dst = s->dst[p + 1];
3036 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3038 for (x = 0; x < end_x; x += uvstep1d,
3039 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3040 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3043 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3044 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3046 dst += 4 * uvstep1d * s->uv_stride;
/* 8 bpp entry point for inter_recon(). */
3052 static void inter_recon_8bpp(AVCodecContext *ctx)
3054 inter_recon(ctx, 1);
/* High-bitdepth (2 bytes/pixel) entry point for inter_recon(). */
3057 static void inter_recon_16bpp(AVCodecContext *ctx)
3059 inter_recon(ctx, 2);
/*
 * Build the per-superblock loopfilter edge masks for one block.
 * mask[0] holds column (vertical-edge) masks, mask[1] row (horizontal-edge)
 * masks, each per 8-pixel row and per filter width (16/8/4/inner-4).
 * row_and_7/col_and_7 are the block position within the 64x64 superblock,
 * w/h the block size in 8th units, col_end/row_end the clipped block end,
 * tx the transform size and skip_inter whether residual filtering edges
 * inside the block can be skipped.
 */
3062 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3063 int row_and_7, int col_and_7,
3064 int w, int h, int col_end, int row_end,
3065 enum TxfmMode tx, int skip_inter)
// bit patterns selecting the positions where the wide (8px) filter applies
3067 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3068 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3070 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3071 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3072 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3073 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3075 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3076 // edges. This means that for UV, we work on two subsampled blocks at
3077 // a time, and we only use the topleft block's mode information to set
3078 // things like block strength. Thus, for any block size smaller than
3079 // 16x16, ignore the odd portion of the block.
3080 if (tx == TX_4X4 && (ss_v | ss_h)) {
3095 if (tx == TX_4X4 && !skip_inter) {
3096 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3097 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3098 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3100 for (y = row_and_7; y < h + row_and_7; y++) {
3101 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3103 mask[0][y][1] |= m_row_8;
3104 mask[0][y][2] |= m_row_4;
3105 // for odd lines, if the odd col is not being filtered,
3106 // skip odd row also:
3113 // if a/c are even row/col and b/d are odd, and d is skipped,
3114 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3115 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3116 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3118 mask[1][y][col_mask_id] |= m_col;
3121 mask[0][y][3] |= m_col;
3123 if (ss_h && (col_end & 1))
3124 mask[1][y][3] |= (t << (w - 1)) - t;
3126 mask[1][y][3] |= m_col;
// larger transforms (or skipped inter blocks): only block-boundary edges
3130 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3133 int mask_id = (tx == TX_8X8);
3134 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3135 int l2 = tx + ss_h - 1, step1d;
3136 int m_row = m_col & masks[l2];
3138 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3139 // 8wd loopfilter to prevent going off the visible edge.
3140 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3141 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3142 int m_row_8 = m_row - m_row_16;
3144 for (y = row_and_7; y < h + row_and_7; y++) {
3145 mask[0][y][0] |= m_row_16;
3146 mask[0][y][1] |= m_row_8;
3149 for (y = row_and_7; y < h + row_and_7; y++)
3150 mask[0][y][mask_id] |= m_row;
// same forced-8wd handling for the vertical (row) direction
3155 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3156 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3157 mask[1][y][0] |= m_col;
3158 if (y - row_and_7 == h - 1)
3159 mask[1][y][1] |= m_col;
3161 for (y = row_and_7; y < h + row_and_7; y += step1d)
3162 mask[1][y][mask_id] |= m_col;
3164 } else if (tx != TX_4X4) {
3167 mask_id = (tx == TX_8X8) || (h == ss_v);
3168 mask[1][row_and_7][mask_id] |= m_col;
3169 mask_id = (tx == TX_8X8) || (w == ss_h);
3170 for (y = row_and_7; y < h + row_and_7; y++)
3171 mask[0][y][mask_id] |= t;
// tx4x4 with skip_inter: only the outer block edges are filtered
3173 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3175 for (y = row_and_7; y < h + row_and_7; y++) {
3176 mask[0][y][2] |= t4;
3177 mask[0][y][1] |= t8;
3179 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
// Decode and reconstruct one leaf block of the partition tree at
// (row, col), both in 8x8-block units. Visible responsibilities:
// MV-range clamping, chroma tx derivation, coefficient decoding,
// skip-context propagation, intra/inter reconstruction (with edge
// emulation for blocks overhanging the frame), and loop-filter
// level/mask setup.
// NOTE(review): interleaved source lines are elided in this copy of the
// file (embedded numbering jumps); comments below describe only what is
// visible — confirm control flow against the full source.
3184 static void decode_b(AVCodecContext *ctx, int row, int col,
3185 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3186 enum BlockLevel bl, enum BlockPartition bp)
3188 VP9Context *s = ctx->priv_data;
3190 enum BlockSize bs = bl * 3 + bp;
3191 int bytesperpixel = s->bytesperpixel;
// w4/h4: block dimensions in 4x4 units from the size lookup table
3192 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3194 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
// Clamp range for motion vectors of this block, in 1/8-pel units
// (8 px per block column/row => *64); allows up to 128 subpel units
// of overhang beyond the frame edge.
3200 s->min_mv.x = -(128 + col * 64);
3201 s->min_mv.y = -(128 + row * 64);
3202 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3203 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// Chroma tx size: one step smaller than luma tx when the block is only
// one luma-tx wide/high in a subsampled dimension.
3209 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3210 (s->ss_v && h4 * 2 == (1 << b->tx)));
3215 if (bytesperpixel == 1) {
3216 has_coeffs = decode_coeffs_8bpp(ctx);
3218 has_coeffs = decode_coeffs_16bpp(ctx);
// Small inter block with no coded coefficients: mark it skipped in the
// above/left contexts so neighbours predict correctly.
3220 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3222 memset(&s->above_skip_ctx[col], 1, w4);
3223 memset(&s->left_skip_ctx[s->row7], 1, h4);
// Zero the non-zero-coefficient contexts for a skipped block; the SPLAT
// macros pick the widest aligned store that covers n bytes.
// (Do not insert anything inside the backslash-continued macro bodies.)
3228 #define SPLAT_ZERO_CTX(v, n) \
3230 case 1: v = 0; break; \
3231 case 2: AV_ZERO16(&v); break; \
3232 case 4: AV_ZERO32(&v); break; \
3233 case 8: AV_ZERO64(&v); break; \
3234 case 16: AV_ZERO128(&v); break; \
3236 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3238 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3239 if (s->ss_##dir2) { \
3240 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3241 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3243 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3244 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3249 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3250 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3251 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3252 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3255 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3256 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3257 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3258 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// Advance the coefficient/EOB pointers past this block (two-pass path).
3264 s->block += w4 * h4 * 64 * bytesperpixel;
3265 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3266 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3267 s->eob += 4 * w4 * h4;
3268 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3269 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3275 // emulated overhangs if the stride of the target buffer can't hold. This
3276 // makes it possible to support emu-edge and so on even if we have large block
// emu[plane] != 0: reconstruct into the temporary buffers (tmp_y /
// tmp_uv) because the block overhangs the frame stride or bottom edge.
3278 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3279 (row + h4) > s->rows;
3280 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3281 (row + h4) > s->rows;
3283 s->dst[0] = s->tmp_y;
3286 s->dst[0] = f->data[0] + yoff;
3287 s->y_stride = f->linesize[0];
3290 s->dst[1] = s->tmp_uv[0];
3291 s->dst[2] = s->tmp_uv[1];
3294 s->dst[1] = f->data[1] + uvoff;
3295 s->dst[2] = f->data[2] + uvoff;
3296 s->uv_stride = f->linesize[1];
// Actual reconstruction, dispatched on intra/inter and bit depth.
3300 intra_recon_16bpp(ctx, yoff, uvoff);
3302 intra_recon_8bpp(ctx, yoff, uvoff);
3306 inter_recon_16bpp(ctx);
3308 inter_recon_8bpp(ctx);
// Copy emulated luma back into the frame in power-of-two-wide chunks
// via the unscaled copy function (mc[n][0][0][0][0]); tmp buffers use a
// fixed stride of 128.
3312 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3314 for (n = 0; o < w; n++) {
3319 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3320 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// Same copy-back for the two chroma planes (dimensions subsampled).
3326 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3327 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3329 for (n = s->ss_h; o < w; n++) {
3334 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3335 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3336 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3337 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3343 // pick filter level and find edges to apply filter to
// Filter level comes from the segment's per-ref/per-mode table;
// level 0 means "do not filter this block".
3344 if (s->s.h.filter.level &&
3345 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3346 [b->mode[3] != ZEROMV]) > 0) {
3347 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3348 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
// Record the per-4x4 filter level and build edge masks: mask[0] for
// luma, mask[1] for chroma when either dimension is subsampled.
3350 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3351 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3352 if (s->ss_h || s->ss_v)
3353 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3354 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3355 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3356 b->uvtx, skip_inter);
// Lazily fill the loop-filter limit LUTs for this level from the
// header sharpness (per the VP9 spec derivation).
3358 if (!s->filter_lut.lim_lut[lvl]) {
3359 int sharp = s->s.h.filter.sharpness;
3363 limit >>= (sharp + 3) >> 2;
3364 limit = FFMIN(limit, 9 - sharp);
3366 limit = FFMAX(limit, 1);
3368 s->filter_lut.lim_lut[lvl] = limit;
3369 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// Advance coefficient/EOB pointers for the next block (pass-2 path).
3375 s->block += w4 * h4 * 64 * bytesperpixel;
3376 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3377 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3378 s->eob += 4 * w4 * h4;
3379 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3380 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
// Recursively decode one superblock subtree at level bl (BL_64X64 down
// to BL_8X8), reading partition symbols from the range coder. Handles
// the right/bottom frame-edge cases where only a subset of partitions
// is codable (hence the branchy probability reads on p[1]/p[2]).
// NOTE(review): interior lines are elided in this copy; switch/case
// structure is partly hidden.
3384 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3385 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3387 VP9Context *s = ctx->priv_data;
// Partition context c: bit 0 from the column-above, bit 1 from the
// row-left partition context, each tested at this tree depth.
3388 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3389 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// Keyframes/intra-only frames use the fixed default partition probs.
3390 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3391 s->prob.p.partition[bl][c];
3392 enum BlockPartition bp;
// hbs: half this block's size, in 8x8-block units (4, 2, 1 per level).
3393 ptrdiff_t hbs = 4 >> bl;
3394 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3395 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3396 int bytesperpixel = s->bytesperpixel;
3399 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3400 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3401 } else if (col + hbs < s->cols) { // FIXME why not <=?
3402 if (row + hbs < s->rows) { // FIXME why not <=?
// Fully inside the frame: all four partitions are possible.
3403 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3405 case PARTITION_NONE:
3406 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// PARTITION_H: top half, then bottom half (offset by hbs rows).
3409 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3410 yoff += hbs * 8 * y_stride;
3411 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3412 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// PARTITION_V: left half, then right half (offset by hbs columns).
3415 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3416 yoff += hbs * 8 * bytesperpixel;
3417 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3418 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3420 case PARTITION_SPLIT:
// Recurse into the four quadrants at the next level.
3421 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3422 decode_sb(ctx, row, col + hbs, lflvl,
3423 yoff + 8 * hbs * bytesperpixel,
3424 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3425 yoff += hbs * 8 * y_stride;
3426 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3427 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3428 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3429 yoff + 8 * hbs * bytesperpixel,
3430 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only SPLIT or (implicitly) H are codable; a single
// branchy bit decides between them.
3435 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3436 bp = PARTITION_SPLIT;
3437 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3438 decode_sb(ctx, row, col + hbs, lflvl,
3439 yoff + 8 * hbs * bytesperpixel,
3440 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3443 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Right edge: only SPLIT or (implicitly) V are codable.
3445 } else if (row + hbs < s->rows) { // FIXME why not <=?
3446 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3447 bp = PARTITION_SPLIT;
3448 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3449 yoff += hbs * 8 * y_stride;
3450 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3451 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3454 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Bottom-right corner: SPLIT is the only possibility, no bit read.
3457 bp = PARTITION_SPLIT;
3458 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// Count the chosen partition for backward probability adaptation.
3460 s->counts.partition[bl][c][bp]++;
// Second-pass replay of decode_sb: walks the partition tree using the
// block levels/partitions (b->bl, b->bp) stored during pass 1 instead
// of reading bits, so reconstruction can run without the range coder.
3463 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3464 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3466 VP9Context *s = ctx->priv_data;
3468 ptrdiff_t hbs = 4 >> bl;
3469 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3470 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3471 int bytesperpixel = s->bytesperpixel;
// Leaf at the smallest level: must be an 8x8 block.
3474 av_assert2(b->bl == BL_8X8);
3475 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3476 } else if (s->b->bl == bl) {
// Stored partition terminates at this level; replay H/V halves only
// when the second half is inside the frame.
3477 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3478 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3479 yoff += hbs * 8 * y_stride;
3480 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3481 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3482 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3483 yoff += hbs * 8 * bytesperpixel;
3484 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3485 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// Otherwise this node was split: recurse into whichever quadrants
// exist inside the frame (same edge logic as decode_sb).
3488 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3489 if (col + hbs < s->cols) { // FIXME why not <=?
3490 if (row + hbs < s->rows) {
3491 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3492 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3493 yoff += hbs * 8 * y_stride;
3494 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3495 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3496 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3497 yoff + 8 * hbs * bytesperpixel,
3498 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3500 yoff += hbs * 8 * bytesperpixel;
3501 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3502 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3504 } else if (row + hbs < s->rows) {
3505 yoff += hbs * 8 * y_stride;
3506 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3507 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Apply the loop filter to all vertical edges (between columns) of one
// plane within a 64x64 superblock. mask[y][i] is a column bitmask per
// 8-px row: i=0 -> 16-wide filter, i=1 -> 8-wide, i=2 -> 4-wide,
// i=3 -> inner 4-px edges. L holds the filter level; H/E/I are the
// spec's hev-threshold / edge-limit / interior-limit derived via LUTs.
// NOTE(review): else/closing-brace lines are elided in this copy.
3512 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3513 uint8_t *lvl, uint8_t (*mask)[4],
3514 uint8_t *dst, ptrdiff_t ls)
3516 int y, x, bytesperpixel = s->bytesperpixel;
3518 // filter edges between columns (e.g. block1 | block2)
// Process two 8-px rows per iteration (one for subsampled chroma).
3519 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3520 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3521 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3522 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3523 unsigned hm = hm1 | hm2 | hm13 | hm23;
// Walk set bits left to right; stop once no higher bit remains.
3525 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3528 int L = *l, H = L >> 4;
3529 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3531 if (hmask1[0] & x) {
// Both 8-px rows want a 16-wide filter at this column: run the
// combined 16-px variant once (levels must match).
3532 if (hmask2[0] & x) {
3533 av_assert2(l[8 << ss_v] == L);
3534 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3536 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3538 } else if (hm2 & x) {
// Different filter widths on the two rows: pack both levels'
// E/I into 16 bits and use the mix2 kernels.
3541 E |= s->filter_lut.mblim_lut[L] << 8;
3542 I |= s->filter_lut.lim_lut[L] << 8;
3543 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3545 [0](ptr, ls, E, I, H);
3547 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3548 [0](ptr, ls, E, I, H);
3550 } else if (hm2 & x) {
// Only the second 8-px row is filtered at this column.
3551 int L = l[8 << ss_v], H = L >> 4;
3552 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3554 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3555 [0](ptr + 8 * ls, ls, E, I, H);
// Inner 4-px edges (mask index 3), offset 4 px into the block.
3563 int L = *l, H = L >> 4;
3564 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3569 E |= s->filter_lut.mblim_lut[L] << 8;
3570 I |= s->filter_lut.lim_lut[L] << 8;
3571 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3573 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3575 } else if (hm23 & x) {
3576 int L = l[8 << ss_v], H = L >> 4;
3577 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3579 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
// Apply the loop filter to all horizontal edges (between rows) of one
// plane within a 64x64 superblock; counterpart of filter_plane_cols.
// mask[y][i] uses the same width encoding (0=16, 1=8, 2=4, 3=inner4);
// here adjacent columns are paired via the (x << (1 + ss_h)) bit.
// NOTE(review): else/closing-brace lines are elided in this copy.
3587 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3588 uint8_t *lvl, uint8_t (*mask)[4],
3589 uint8_t *dst, ptrdiff_t ls)
3591 int y, x, bytesperpixel = s->bytesperpixel;
3594 // filter edges between rows (e.g. ------)
3596 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3597 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3598 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// Step two (or four, when subsampled) columns at a time so a pair of
// adjacent 8-px columns can share one wider filter call.
3600 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3603 int L = *l, H = L >> 4;
3604 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
// Both columns of the pair want the 16-wide filter: combined call.
3607 if (vmask[0] & (x << (1 + ss_h))) {
3608 av_assert2(l[1 + ss_h] == L);
3609 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3611 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3613 } else if (vm & (x << (1 + ss_h))) {
// Mixed widths across the pair: pack both levels' E/I and use mix2.
3616 E |= s->filter_lut.mblim_lut[L] << 8;
3617 I |= s->filter_lut.lim_lut[L] << 8;
3618 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3619 [!!(vmask[1] & (x << (1 + ss_h)))]
3620 [1](ptr, ls, E, I, H);
3622 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3623 [1](ptr, ls, E, I, H);
3625 } else if (vm & (x << (1 + ss_h))) {
// Only the second column of the pair is filtered.
3626 int L = l[1 + ss_h], H = L >> 4;
3627 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3629 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3630 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// Inner 4-px horizontal edges (mask index 3), 4 rows into the block.
3635 int L = *l, H = L >> 4;
3636 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3638 if (vm3 & (x << (1 + ss_h))) {
3641 E |= s->filter_lut.mblim_lut[L] << 8;
3642 I |= s->filter_lut.lim_lut[L] << 8;
3643 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3645 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3647 } else if (vm3 & (x << (1 + ss_h))) {
3648 int L = l[1 + ss_h], H = L >> 4;
3649 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3651 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
// Run the loop filter over one 64x64 superblock: vertical then
// horizontal edges for luma, then for both chroma planes. Chroma uses
// mask[1] whenever either dimension is subsampled (ss_h | ss_v),
// mask[0] otherwise (4:4:4).
3664 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3665 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3667 VP9Context *s = ctx->priv_data;
3668 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3669 uint8_t *dst = f->data[0] + yoff;
3670 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3671 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3674 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3675 // if you think of them as acting on a 8x8 block max, we can interleave
3676 // each v/h within the single x loop, but that only works if we work on
3677 // 8 pixel blocks, and we won't always do that (we want at least 16px
3678 // to use SSE2 optimizations, perhaps 32 for AVX2)
// Luma: mask[0][0] holds column (vertical-edge) masks, mask[0][1] rows.
3680 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3681 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
// Chroma planes share one mask set and the same line size.
3683 for (p = 0; p < 2; p++) {
3684 dst = f->data[1 + p] + uvoff;
3685 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3686 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/**
 * Compute the range [*start, *end) covered by one tile along one axis.
 *
 * Reconstructed as a well-formed function (the stored copy of this span
 * had its braces stripped by line mangling); logic is unchanged.
 *
 * @param start  receives the tile's first offset, in 8x8-block units
 * @param end    receives the tile's end offset (exclusive), same units
 * @param idx    tile index along this axis
 * @param log2_n log2 of the tile count along this axis
 * @param n      frame size along this axis in 64x64-superblock units
 *
 * Boundaries are (idx * n) >> log2_n superblocks, clamped to n, then
 * scaled by 8 (<< 3) to convert superblocks into 8x8-block units.
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_start = ( idx      * n) >> log2_n;
    int sb_end   = ((idx + 1) * n) >> log2_n;

    /* explicit clamp (equivalent to FFMIN) keeps the helper
     * self-contained; the clamp only matters at the last tile */
    if (sb_start > n)
        sb_start = n;
    if (sb_end > n)
        sb_end = n;
    *start = sb_start << 3;
    *end   = sb_end   << 3;
}
// Backward-adapt one binary probability *p from the frame's symbol
// counts (ct0 zeros, ct1 ones): blend the old probability p1 toward the
// empirical probability p2, weighted by update_factor scaled with the
// (capped) total count.
// NOTE(review): lines are elided in this copy — the early return for
// ct == 0 and the "p1 = *p;" load appear to be among them; p1 is used
// below but not visibly assigned here. Confirm against the full source.
3698 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3699 int max_count, int update_factor)
3701 unsigned ct = ct0 + ct1, p2, p1;
// Empirical probability of a zero, rounded, clipped to [1,255].
3707 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3708 p2 = av_clip(p2, 1, 255);
// Scale the update factor by how much evidence we saw (capped).
3709 ct = FFMIN(ct, max_count);
3710 update_factor = FASTDIV(update_factor * ct, max_count);
3712 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3713 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
// End-of-frame backward adaptation: fold this frame's symbol counts
// (s->counts.*) into the persistent probability context for framectxid.
// Coefficient probs use a faster update factor (uf) after a keyframe /
// intra-only / context reset; all mode/MV probs use max_count 20,
// update_factor 128. Keyframes adapt only coef/skip/tx then return.
// NOTE(review): interior lines (loop braces, some section comments) are
// elided in this copy.
3716 static void adapt_probs(VP9Context *s)
3719 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3720 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
// Coefficient probabilities, indexed [txsize][plane][inter][band][ctx].
3723 for (i = 0; i < 4; i++)
3724 for (j = 0; j < 2; j++)
3725 for (k = 0; k < 2; k++)
3726 for (l = 0; l < 6; l++)
3727 for (m = 0; m < 6; m++) {
3728 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3729 unsigned *e = s->counts.eob[i][j][k][l][m];
3730 unsigned *c = s->counts.coef[i][j][k][l][m];
3732 if (l == 0 && m >= 3) // dc only has 3 pt
3735 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3736 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3737 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// Keyframes/intra-only: copy the parsed skip/tx probs verbatim and stop
// (the inter-related tables below are not coded on such frames).
3740 if (s->s.h.keyframe || s->s.h.intraonly) {
3741 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3742 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3743 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3744 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// skip flag
3749 for (i = 0; i < 3; i++)
3750 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// intra/inter flag
3753 for (i = 0; i < 4; i++)
3754 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// compound prediction mode (only if signalled as switchable)
3757 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3758 for (i = 0; i < 5; i++)
3759 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// compound reference frame selection
3763 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3764 for (i = 0; i < 5; i++)
3765 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3766 s->counts.comp_ref[i][1], 20, 128);
// single reference frame selection (two-bit tree)
3769 if (s->s.h.comppredmode != PRED_COMPREF) {
3770 for (i = 0; i < 5; i++) {
3771 uint8_t *pp = p->single_ref[i];
3772 unsigned (*c)[2] = s->counts.single_ref[i];
3774 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3775 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3779 // block partitioning
3780 for (i = 0; i < 4; i++)
3781 for (j = 0; j < 4; j++) {
3782 uint8_t *pp = p->partition[i][j];
3783 unsigned *c = s->counts.partition[i][j];
3785 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3786 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3787 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// tx size selection trees (8/16/32), only when switchable
3791 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3792 for (i = 0; i < 2; i++) {
3793 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3795 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3796 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3797 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3798 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3799 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3800 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3804 // interpolation filter
3805 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3806 for (i = 0; i < 4; i++) {
3807 uint8_t *pp = p->filter[i];
3808 unsigned *c = s->counts.filter[i];
3810 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3811 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// inter mode tree (ZEROMV/NEARESTMV/NEARMV/NEWMV ordering per spec)
3816 for (i = 0; i < 7; i++) {
3817 uint8_t *pp = p->mv_mode[i];
3818 unsigned *c = s->counts.mv_mode[i];
3820 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3821 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3822 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// MV joint (which components are non-zero)
3827 uint8_t *pp = p->mv_joint;
3828 unsigned *c = s->counts.mv_joint;
3830 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3831 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3832 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// per-component MV probabilities (i = 0:vertical, 1:horizontal —
// assumed from layout, confirm against full source)
3836 for (i = 0; i < 2; i++) {
3838 unsigned *c, (*c2)[2], sum;
3840 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3841 s->counts.mv_comp[i].sign[1], 20, 128);
// magnitude class tree: progressively strip counts from 'sum' as we
// walk down the tree
3843 pp = p->mv_comp[i].classes;
3844 c = s->counts.mv_comp[i].classes;
3845 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3846 adapt_prob(&pp[0], c[0], sum, 20, 128);
3848 adapt_prob(&pp[1], c[1], sum, 20, 128);
3850 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3851 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3853 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3854 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3856 adapt_prob(&pp[6], c[6], sum, 20, 128);
3857 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3858 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3859 adapt_prob(&pp[9], c[9], c[10], 20, 128);
// class-0 bit and the 10 integer magnitude bits
3861 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3862 s->counts.mv_comp[i].class0[1], 20, 128);
3863 pp = p->mv_comp[i].bits;
3864 c2 = s->counts.mv_comp[i].bits;
3865 for (j = 0; j < 10; j++)
3866 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// fractional-pel trees (class0-specific and generic)
3868 for (j = 0; j < 2; j++) {
3869 pp = p->mv_comp[i].class0_fp[j];
3870 c = s->counts.mv_comp[i].class0_fp[j];
3871 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3872 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3873 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3875 pp = p->mv_comp[i].fp;
3876 c = s->counts.mv_comp[i].fp;
3877 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3878 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3879 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// 1/8-pel (high-precision) bits, only when enabled in the header
3881 if (s->s.h.highprecisionmvs) {
3882 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3883 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3884 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3885 s->counts.mv_comp[i].hp[1], 20, 128);
// luma intra mode tree: 'sum' walks down the tree exactly as the
// symbols are coded (DC, TM, V, then the H/D153/D207 subtree, ...)
3890 for (i = 0; i < 4; i++) {
3891 uint8_t *pp = p->y_mode[i];
3892 unsigned *c = s->counts.y_mode[i], sum, s2;
3894 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3895 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3896 sum -= c[TM_VP8_PRED];
3897 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3898 sum -= c[VERT_PRED];
3899 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3900 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3902 adapt_prob(&pp[3], s2, sum, 20, 128);
3904 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3905 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3906 sum -= c[DIAG_DOWN_LEFT_PRED];
3907 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3908 sum -= c[VERT_LEFT_PRED];
3909 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3910 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// chroma intra mode tree: same structure, conditioned on the luma mode
3914 for (i = 0; i < 10; i++) {
3915 uint8_t *pp = p->uv_mode[i];
3916 unsigned *c = s->counts.uv_mode[i], sum, s2;
3918 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3919 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3920 sum -= c[TM_VP8_PRED];
3921 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3922 sum -= c[VERT_PRED];
3923 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3924 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3926 adapt_prob(&pp[3], s2, sum, 20, 128);
3928 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3929 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3930 sum -= c[DIAG_DOWN_LEFT_PRED];
3931 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3932 sum -= c[VERT_LEFT_PRED];
3933 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3934 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3938 static void free_buffers(VP9Context *s)
3940 av_freep(&s->intra_pred_data[0]);
3941 av_freep(&s->b_base);
3942 av_freep(&s->block_base);
// Decoder close: release and free the three internal frames and all
// per-stream / per-thread reference slots (s.refs and next_refs).
// NOTE(review): the tail of this function (freeing scratch buffers /
// return statement) is elided in this copy of the file.
3945 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3947 VP9Context *s = ctx->priv_data;
// Internal frames: CUR_FRAME plus the MV-pair/segmap reference copies.
3950 for (i = 0; i < 3; i++) {
3951 if (s->s.frames[i].tf.f->buf[0])
3952 vp9_unref_frame(ctx, &s->s.frames[i]);
3953 av_frame_free(&s->s.frames[i].tf.f);
// The 8 reference slots, both the current set and the pending set.
3955 for (i = 0; i < 8; i++) {
3956 if (s->s.refs[i].f->buf[0])
3957 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3958 av_frame_free(&s->s.refs[i].f);
3959 if (s->next_refs[i].f->buf[0])
3960 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3961 av_frame_free(&s->next_refs[i].f);
// Main per-packet entry point. Phases visible here: parse the frame
// header; handle "show existing frame" packets (res == 0) by returning
// a reference directly; rotate the segmap/mvpair/current frames;
// set up the next_refs set; hwaccel passthrough; then the (optionally
// two-pass) tile/superblock decode loop with per-row loop filtering and
// frame-threading progress reports; finally commit next_refs and output.
// NOTE(review): many interior lines (error paths, hwaccel condition,
// decode_mode pass handling, c_b allocation) are elided in this copy.
3971 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3972 int *got_frame, AVPacket *pkt)
3974 const uint8_t *data = pkt->data;
3975 int size = pkt->size;
3976 VP9Context *s = ctx->priv_data;
3977 int res, tile_row, tile_col, i, ref, row, col;
// Keep the previous segmentation map when this frame doesn't update it.
3978 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3979 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3980 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3984 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0: "show existing frame" — output ref frame 'ref' untouched.
3986 } else if (res == 0) {
3987 if (!s->s.refs[ref].f->buf[0]) {
3988 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3989 return AVERROR_INVALIDDATA;
3991 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3993 ((AVFrame *)frame)->pkt_pts = pkt->pts;
3994 ((AVFrame *)frame)->pkt_dts = pkt->dts;
// Even when showing an existing frame, refresh next_refs so the ref
// state stays consistent for the following frame.
3995 for (i = 0; i < 8; i++) {
3996 if (s->next_refs[i].f->buf[0])
3997 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3998 if (s->s.refs[i].f->buf[0] &&
3999 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
// Frame rotation: previous CUR_FRAME becomes the segmap and/or mvpair
// reference (unless error-resilient or intra), then allocate a new
// current frame.
4008 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4009 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4010 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4011 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4012 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4015 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4016 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4017 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4018 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4020 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4021 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4022 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4024 f = s->s.frames[CUR_FRAME].tf.f;
4025 f->key_frame = s->s.h.keyframe;
4026 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4027 ls_y = f->linesize[0];
4028 ls_uv =f->linesize[1];
// Discard the segmap reference on resolution change (mismatched size).
4030 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4031 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4032 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4033 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
// Build next_refs: slots marked in refreshrefmask point at the new
// frame, others carry over the old reference.
4037 for (i = 0; i < 8; i++) {
4038 if (s->next_refs[i].f->buf[0])
4039 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4040 if (s->s.h.refreshrefmask & (1 << i)) {
4041 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4042 } else if (s->s.refs[i].f->buf[0]) {
4043 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
// Hardware acceleration: hand the whole packet to the hwaccel.
4050 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4053 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4056 res = ctx->hwaccel->end_frame(ctx);
4062 // main tile decode loop
4063 bytesperpixel = s->bytesperpixel;
// Reset the above-row contexts for the new frame.
4064 memset(s->above_partition_ctx, 0, s->cols);
4065 memset(s->above_skip_ctx, 0, s->cols);
4066 if (s->s.h.keyframe || s->s.h.intraonly) {
4067 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4069 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4071 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4072 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4073 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4074 memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass decode is used with frame threads when the adapted context
// must be published before reconstruction finishes.
4075 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4076 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4077 if ((res = update_block_buffers(ctx)) < 0) {
4078 av_log(ctx, AV_LOG_ERROR,
4079 "Failed to allocate block buffers\n");
// Parallel mode: commit the parsed probabilities immediately (only the
// coef entries up to the coded txfmmode) and unblock consumer threads.
4082 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4085 for (i = 0; i < 4; i++) {
4086 for (j = 0; j < 2; j++)
4087 for (k = 0; k < 2; k++)
4088 for (l = 0; l < 6; l++)
4089 for (m = 0; m < 6; m++)
4090 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4091 s->prob.coef[i][j][k][l][m], 3);
4092 if (s->s.h.txfmmode == i)
4095 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4096 ff_thread_finish_setup(ctx);
4097 } else if (!s->s.h.refreshctx) {
4098 ff_thread_finish_setup(ctx);
// Per-pass reset of the coefficient buffer cursors.
4104 s->block = s->block_base;
4105 s->uvblock[0] = s->uvblock_base[0];
4106 s->uvblock[1] = s->uvblock_base[1];
4107 s->eob = s->eob_base;
4108 s->uveob[0] = s->uveob_base[0];
4109 s->uveob[1] = s->uveob_base[1];
// First tile walk: locate each tile's bitstream and init one range
// decoder per tile column (tile_size read from a 4-byte prefix except
// for the last tile, which takes the remaining data).
4111 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4112 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4113 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4115 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4118 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4119 tile_row == s->s.h.tiling.tile_rows - 1) {
4122 tile_size = AV_RB32(data);
4126 if (tile_size > size) {
// On bitstream errors mark the frame fully decoded so waiting
// frame threads are not left blocked.
4127 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4128 return AVERROR_INVALIDDATA;
4130 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4131 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4132 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4133 return AVERROR_INVALIDDATA;
// Superblock-row loop: rows advance by 8 (one 64x64 SB = 8 blocks).
4140 for (row = s->tile_row_start; row < s->tile_row_end;
4141 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4142 struct VP9Filter *lflvl_ptr = s->lflvl;
4143 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4145 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4146 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4147 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
// Reset the left-edge contexts at each tile-column start.
4150 memset(s->left_partition_ctx, 0, 8);
4151 memset(s->left_skip_ctx, 0, 8);
4152 if (s->s.h.keyframe || s->s.h.intraonly) {
4153 memset(s->left_mode_ctx, DC_PRED, 16);
4155 memset(s->left_mode_ctx, NEARESTMV, 8);
4157 memset(s->left_y_nnz_ctx, 0, 16);
4158 memset(s->left_uv_nnz_ctx, 0, 32);
4159 memset(s->left_segpred_ctx, 0, 8);
// Swap in this tile column's range coder state.
4161 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4164 for (col = s->tile_col_start;
4165 col < s->tile_col_end;
4166 col += 8, yoff2 += 64 * bytesperpixel,
4167 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4168 // FIXME integrate with lf code (i.e. zero after each
4169 // use, similar to invtxfm coefficients, or similar)
4171 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// Pass 2 replays stored partitions; pass 0/1 reads the bitstream.
4175 decode_sb_mem(ctx, row, col, lflvl_ptr,
4176 yoff2, uvoff2, BL_64X64);
4178 decode_sb(ctx, row, col, lflvl_ptr,
4179 yoff2, uvoff2, BL_64X64);
// Save the range coder state back for this tile column.
4183 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4191 // backup pre-loopfilter reconstruction data for intra
4192 // prediction of next row of sb64s
4193 if (row + 8 < s->rows) {
4194 memcpy(s->intra_pred_data[0],
4195 f->data[0] + yoff + 63 * ls_y,
4196 8 * s->cols * bytesperpixel);
4197 memcpy(s->intra_pred_data[1],
4198 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4199 8 * s->cols * bytesperpixel >> s->ss_h);
4200 memcpy(s->intra_pred_data[2],
4201 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4202 8 * s->cols * bytesperpixel >> s->ss_h);
4205 // loopfilter one row
4206 if (s->s.h.filter.level) {
4209 lflvl_ptr = s->lflvl;
4210 for (col = 0; col < s->cols;
4211 col += 8, yoff2 += 64 * bytesperpixel,
4212 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4213 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4217 // FIXME maybe we can make this more finegrained by running the
4218 // loopfilter per-block instead of after each sbrow
4219 // In fact that would also make intra pred left preparation easier?
4220 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
// After pass 1 of a two-pass decode: adapt probabilities and unblock
// dependent frame threads before the reconstruction pass.
4224 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4226 ff_thread_finish_setup(ctx);
4228 } while (s->pass++ == 1);
4229 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
// Commit next_refs into the active reference set.
4233 for (i = 0; i < 8; i++) {
4234 if (s->s.refs[i].f->buf[0])
4235 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4236 if (s->next_refs[i].f->buf[0] &&
4237 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
// Output the frame unless the header marked it invisible.
4241 if (!s->s.h.invisible) {
4242 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
4250 static void vp9_decode_flush(AVCodecContext *ctx)
4252 VP9Context *s = ctx->priv_data;
4255 for (i = 0; i < 3; i++)
4256 vp9_unref_frame(ctx, &s->s.frames[i]);
4257 for (i = 0; i < 8; i++)
4258 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4261 static int init_frames(AVCodecContext *ctx)
4263 VP9Context *s = ctx->priv_data;
4266 for (i = 0; i < 3; i++) {
4267 s->s.frames[i].tf.f = av_frame_alloc();
4268 if (!s->s.frames[i].tf.f) {
4269 vp9_decode_free(ctx);
4270 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4271 return AVERROR(ENOMEM);
4274 for (i = 0; i < 8; i++) {
4275 s->s.refs[i].f = av_frame_alloc();
4276 s->next_refs[i].f = av_frame_alloc();
4277 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4278 vp9_decode_free(ctx);
4279 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4280 return AVERROR(ENOMEM);
4287 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4289 VP9Context *s = ctx->priv_data;
4291 ctx->internal->allocate_progress = 1;
4293 s->s.h.filter.sharpness = -1;
4295 return init_frames(ctx);
4299 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4301 return init_frames(avctx);
/* Frame-threading sync: copy the inter-frame decoder state from the thread
 * that just finished a frame (src) into the next worker thread (dst).
 * NOTE(review): several interior lines of this function (error returns,
 * some scalar copies) are missing from this extract. */
4304 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4307 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
/* Re-reference the three internal frame slots from the source thread. */
4309 for (i = 0; i < 3; i++) {
4310 if (s->s.frames[i].tf.f->buf[0])
4311 vp9_unref_frame(dst, &s->s.frames[i]);
4312 if (ssrc->s.frames[i].tf.f->buf[0]) {
4313 if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
/* The destination's reference slots are refreshed from the source's
 * "next_refs" — i.e. the reference state as it will be after the source's
 * current frame, not its previous one. */
4317 for (i = 0; i < 8; i++) {
4318 if (s->s.refs[i].f->buf[0])
4319 ff_thread_release_buffer(dst, &s->s.refs[i]);
4320 if (ssrc->next_refs[i].f->buf[0]) {
4321 if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
/* Scalar header/context state carried across frames. */
4326 s->s.h.invisible = ssrc->s.h.invisible;
4327 s->s.h.keyframe = ssrc->s.h.keyframe;
4328 s->s.h.intraonly = ssrc->s.h.intraonly;
4329 s->ss_v = ssrc->ss_v;
4330 s->ss_h = ssrc->ss_h;
4331 s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4332 s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4333 s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4334 s->bytesperpixel = ssrc->bytesperpixel;
4335 s->gf_fmt = ssrc->gf_fmt;
/* NOTE(review): lines between gf_fmt and bpp_index (presumably w/h/bpp
 * copies) are missing from this extract. */
4339 s->bpp_index = ssrc->bpp_index;
4340 s->pix_fmt = ssrc->pix_fmt;
/* Probability context and per-feature segmentation data are copied whole
 * as they are adapted frame-to-frame. */
4341 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4342 memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4343 memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4344 sizeof(s->s.h.segmentation.feat));
/* Codec registration: VP9 decoder supporting direct rendering (DR1) and
 * frame-level multithreading; the thread-copy/update callbacks above are
 * only wired in when threading is compiled in.
 * NOTE(review): the ".name" field and closing "};" fall outside this
 * extract. */
4350 AVCodec ff_vp9_decoder = {
4352 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4353 .type = AVMEDIA_TYPE_VIDEO,
4354 .id = AV_CODEC_ID_VP9,
4355 .priv_data_size = sizeof(VP9Context),
4356 .init = vp9_decode_init,
4357 .close = vp9_decode_free,
4358 .decode = vp9_decode_frame,
4359 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4360 .flush = vp9_decode_flush,
4361 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4362 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4363 .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),