2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
37 #define VP9_SYNCCODE 0x498342
41 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
45 typedef struct VP9Block {
46 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
47 enum FilterMode filter;
48 VP56mv mv[4 /* b_idx */][2 /* ref */];
50 enum TxfmMode tx, uvtx;
52 enum BlockPartition bp;
/*
 * Main decoder context.
 * NOTE(review): this chunk is missing a number of original lines (nested
 * struct/union headers and several fields); comments below describe only
 * what is visible — do not treat the field list as complete.
 */
typedef struct VP9Context {
    // position of the block currently being decoded, in 8x8-block units;
    // row7/col7 are presumably the mod-8 (within-superblock) coordinates
    // — TODO confirm against usage
    int row, row7, col, col7;
    ptrdiff_t y_stride, uv_stride;
    // bit-depth bookkeeping; last_bpp allows dsp tables to be re-initialized
    // only when the depth actually changes (see update_size())
    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
    uint8_t last_keyframe;
    // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
    // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
    // and are therefore per-stream. pix_fmt represents the value in the header
    // of the currently processed frame.
    enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
    unsigned sb_cols, sb_rows, rows, cols;
    ThreadFrame next_refs[8];
    // loopfilter limit lookup table, rebuilt when sharpness changes
    uint8_t mblim_lut[64];
    // current tile bounds in 8x8-block units
    unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    // NOTE(review): the two coef[] declarations below have different inner
    // dimensions ([3] vs [11]) and belong to two distinct nested structs
    // (the saved probability contexts vs the per-frame probability set);
    // the enclosing struct header lines are not visible in this chunk.
    uint8_t coef[4][2][2][6][6][3];
    uint8_t coef[4][2][2][6][6][11];
    // symbol counts gathered while decoding, used for backward probability
    // adaptation (presumably fields of a nested "counts" struct — the
    // header line is not visible here; verify against full source)
    unsigned y_mode[4][10];
    unsigned uv_mode[10][10];
    unsigned filter[4][3];
    unsigned mv_mode[7][4];
    unsigned intra[4][2];
    unsigned single_ref[5][2][2];
    unsigned comp_ref[5][2];
    unsigned tx32p[2][4];
    unsigned tx16p[2][3];
    unsigned mv_joint[4];
    unsigned classes[11];
    unsigned bits[10][2];
    unsigned class0_fp[2][4];
    unsigned class0_hp[2];
    unsigned partition[4][4][4];
    unsigned coef[4][2][2][6][6][3];
    unsigned eob[4][2][2][6][6][2];

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // "above" context rows; allocated per-superblock-column in update_size()
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-superblock filtering and prediction buffers
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];

    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    // per-reference scaling factors (14-bit fixed point) and derived step
    // sizes — presumably for scaled reference prediction; see
    // decode_frame_header() where they are computed. TODO confirm semantics
    uint16_t mvscale[3][2];
    uint8_t mvstep[3][2];
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
169 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
172 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
179 ff_thread_release_buffer(ctx, &f->tf);
180 av_buffer_unref(&f->extradata);
181 av_buffer_unref(&f->hwaccel_priv_buf);
182 f->segmentation_map = NULL;
183 f->hwaccel_picture_private = NULL;
186 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
188 VP9Context *s = ctx->priv_data;
191 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
193 sz = 64 * s->sb_cols * s->sb_rows;
194 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
198 f->segmentation_map = f->extradata->data;
199 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
202 const AVHWAccel *hwaccel = ctx->hwaccel;
203 av_assert0(!f->hwaccel_picture_private);
204 if (hwaccel->frame_priv_data_size) {
205 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
206 if (!f->hwaccel_priv_buf)
208 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
215 vp9_unref_frame(ctx, f);
216 return AVERROR(ENOMEM);
219 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
223 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
225 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
229 dst->segmentation_map = src->segmentation_map;
231 dst->uses_2pass = src->uses_2pass;
233 if (src->hwaccel_picture_private) {
234 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
235 if (!dst->hwaccel_priv_buf)
237 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
243 vp9_unref_frame(ctx, dst);
244 return AVERROR(ENOMEM);
247 static int update_size(AVCodecContext *ctx, int w, int h)
249 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
250 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
251 VP9Context *s = ctx->priv_data;
253 int bytesperpixel = s->bytesperpixel, res, cols, rows;
255 av_assert0(w > 0 && h > 0);
257 if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
258 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
261 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
262 #if CONFIG_VP9_DXVA2_HWACCEL
263 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
265 #if CONFIG_VP9_D3D11VA_HWACCEL
266 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
268 #if CONFIG_VP9_VAAPI_HWACCEL
269 *fmtp++ = AV_PIX_FMT_VAAPI;
273 *fmtp++ = s->pix_fmt;
274 *fmtp = AV_PIX_FMT_NONE;
276 res = ff_thread_get_format(ctx, pix_fmts);
281 s->gf_fmt = s->pix_fmt;
289 if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
292 s->last_fmt = s->pix_fmt;
293 s->sb_cols = (w + 63) >> 6;
294 s->sb_rows = (h + 63) >> 6;
295 s->cols = (w + 7) >> 3;
296 s->rows = (h + 7) >> 3;
298 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
299 av_freep(&s->intra_pred_data[0]);
300 // FIXME we slightly over-allocate here for subsampled chroma, but a little
301 // bit of padding shouldn't affect performance...
302 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
303 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
305 return AVERROR(ENOMEM);
306 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
307 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
308 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
309 assign(s->above_y_nnz_ctx, uint8_t *, 16);
310 assign(s->above_mode_ctx, uint8_t *, 16);
311 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
312 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
313 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
314 assign(s->above_partition_ctx, uint8_t *, 8);
315 assign(s->above_skip_ctx, uint8_t *, 8);
316 assign(s->above_txfm_ctx, uint8_t *, 8);
317 assign(s->above_segpred_ctx, uint8_t *, 8);
318 assign(s->above_intra_ctx, uint8_t *, 8);
319 assign(s->above_comp_ctx, uint8_t *, 8);
320 assign(s->above_ref_ctx, uint8_t *, 8);
321 assign(s->above_filter_ctx, uint8_t *, 8);
322 assign(s->lflvl, struct VP9Filter *, 1);
325 // these will be re-allocated a little later
326 av_freep(&s->b_base);
327 av_freep(&s->block_base);
329 if (s->bpp != s->last_bpp) {
330 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
331 ff_videodsp_init(&s->vdsp, s->bpp);
332 s->last_bpp = s->bpp;
338 static int update_block_buffers(AVCodecContext *ctx)
340 VP9Context *s = ctx->priv_data;
341 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
343 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
347 av_free(s->block_base);
348 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
349 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
350 if (s->s.frames[CUR_FRAME].uses_2pass) {
351 int sbs = s->sb_cols * s->sb_rows;
353 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
354 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
355 16 * 16 + 2 * chroma_eobs) * sbs);
356 if (!s->b_base || !s->block_base)
357 return AVERROR(ENOMEM);
358 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
359 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
360 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
361 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
362 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
364 s->b_base = av_malloc(sizeof(VP9Block));
365 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
366 16 * 16 + 2 * chroma_eobs);
367 if (!s->b_base || !s->block_base)
368 return AVERROR(ENOMEM);
369 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
370 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
371 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
372 s->uveob_base[0] = s->eob_base + 16 * 16;
373 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
375 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
380 // for some reason the sign bit is at the end, not the start, of a bit sequence
381 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
383 int v = get_bits(gb, n);
384 return get_bits1(gb) ? -v : v;
387 static av_always_inline int inv_recenter_nonneg(int v, int m)
389 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
392 // differential forward probability updates
393 static int update_prob(VP56RangeCoder *c, int p)
395 static const int inv_map_table[255] = {
396 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
397 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
398 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
399 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
400 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
401 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
402 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
403 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
404 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
405 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
406 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
407 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
408 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
409 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
410 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
411 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
412 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
413 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
418 /* This code is trying to do a differential probability update. For a
419 * current probability A in the range [1, 255], the difference to a new
420 * probability of any value can be expressed differentially as 1-A,255-A
421 * where some part of this (absolute range) exists both in positive as
422 * well as the negative part, whereas another part only exists in one
423 * half. We're trying to code this shared part differentially, i.e.
424 * times two where the value of the lowest bit specifies the sign, and
425 * the single part is then coded on top of this. This absolute difference
426 * then again has a value of [0,254], but a bigger value in this range
427 * indicates that we're further away from the original value A, so we
428 * can code this as a VLC code, since higher values are increasingly
429 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
430 * updates vs. the 'fine, exact' updates further down the range, which
431 * adds one extra dimension to this differential update model. */
433 if (!vp8_rac_get(c)) {
434 d = vp8_rac_get_uint(c, 4) + 0;
435 } else if (!vp8_rac_get(c)) {
436 d = vp8_rac_get_uint(c, 4) + 16;
437 } else if (!vp8_rac_get(c)) {
438 d = vp8_rac_get_uint(c, 5) + 32;
440 d = vp8_rac_get_uint(c, 7);
442 d = (d << 1) - 65 + vp8_rac_get(c);
444 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
447 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
448 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
451 static int read_colorspace_details(AVCodecContext *ctx)
453 static const enum AVColorSpace colorspaces[8] = {
454 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
455 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
457 VP9Context *s = ctx->priv_data;
458 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
461 s->bpp = 8 + bits * 2;
462 s->bytesperpixel = (7 + s->bpp) >> 3;
463 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
464 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
465 static const enum AVPixelFormat pix_fmt_rgb[3] = {
466 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
468 s->ss_h = s->ss_v = 0;
469 ctx->color_range = AVCOL_RANGE_JPEG;
470 s->pix_fmt = pix_fmt_rgb[bits];
471 if (ctx->profile & 1) {
472 if (get_bits1(&s->gb)) {
473 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
474 return AVERROR_INVALIDDATA;
477 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
479 return AVERROR_INVALIDDATA;
482 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
483 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
484 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
485 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
486 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
487 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
488 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
490 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
491 if (ctx->profile & 1) {
492 s->ss_h = get_bits1(&s->gb);
493 s->ss_v = get_bits1(&s->gb);
494 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
495 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
496 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
498 return AVERROR_INVALIDDATA;
499 } else if (get_bits1(&s->gb)) {
500 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
502 return AVERROR_INVALIDDATA;
505 s->ss_h = s->ss_v = 1;
506 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
/*
 * Parse the VP9 uncompressed frame header (frame type, references, size,
 * loopfilter, quantization, segmentation, tiling) and the probability
 * updates of the compressed, range-coded header.
 * Returns the total header size in bytes on success, or a negative AVERROR.
 * For "show existing frame" packets only *ref is set.
 * NOTE(review): this chunk is missing a considerable number of original
 * source lines (braces, else branches, statement continuations). The code
 * below is transcribed as-is; do not treat it as compilable in isolation.
 */
static int decode_frame_header(AVCodecContext *ctx,
                               const uint8_t *data, int size, int *ref)
    VP9Context *s = ctx->priv_data;
    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
    const uint8_t *data2;

    /* general header */
    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
        return AVERROR_INVALIDDATA;
    // profile is coded as two bits plus, for value 3, one reserved bit
    ctx->profile = get_bits1(&s->gb);
    ctx->profile |= get_bits1(&s->gb) << 1;
    if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
    if (ctx->profile > 3) {
        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
        return AVERROR_INVALIDDATA;
    s->s.h.profile = ctx->profile;
    // "show existing frame": only a 3-bit reference index follows
    if (get_bits1(&s->gb)) {
        *ref = get_bits(&s->gb, 3);
    s->last_keyframe = s->s.h.keyframe;
    s->s.h.keyframe = !get_bits1(&s->gb);
    last_invisible = s->s.h.invisible;
    s->s.h.invisible = !get_bits1(&s->gb);
    s->s.h.errorres = get_bits1(&s->gb);
    s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
    if (s->s.h.keyframe) {
        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
            return AVERROR_INVALIDDATA;
        if ((res = read_colorspace_details(ctx)) < 0)
        // for profile 1, here follows the subsampling bits
        s->s.h.refreshrefmask = 0xff;
        w = get_bits(&s->gb, 16) + 1;
        h = get_bits(&s->gb, 16) + 1;
        if (get_bits1(&s->gb)) // display size
            skip_bits(&s->gb, 32);
        // non-keyframe: may still be intra-only when invisible
        s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
        s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
        if (s->s.h.intraonly) {
            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                return AVERROR_INVALIDDATA;
            if (ctx->profile >= 1) {
                if ((res = read_colorspace_details(ctx)) < 0)
                // profile 0 intra-only frames are implicitly 8-bit 4:2:0
                s->ss_h = s->ss_v = 1;
                s->bytesperpixel = 1;
                s->pix_fmt = AV_PIX_FMT_YUV420P;
                ctx->colorspace = AVCOL_SPC_BT470BG;
                ctx->color_range = AVCOL_RANGE_JPEG;
            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
            w = get_bits(&s->gb, 16) + 1;
            h = get_bits(&s->gb, 16) + 1;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            // regular inter frame: three references with per-ref sign bias
            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
            s->s.h.refidx[0] = get_bits(&s->gb, 3);
            s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
            s->s.h.refidx[1] = get_bits(&s->gb, 3);
            s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
            s->s.h.refidx[2] = get_bits(&s->gb, 3);
            s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
            if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
                !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
                !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                return AVERROR_INVALIDDATA;
            // frame size either copied from one of the refs or coded explicitly
            if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[0]].f->width;
                h = s->s.refs[s->s.h.refidx[0]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[1]].f->width;
                h = s->s.refs[s->s.h.refidx[1]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[2]].f->width;
                h = s->s.refs[s->s.h.refidx[2]].f->height;
                w = get_bits(&s->gb, 16) + 1;
                h = get_bits(&s->gb, 16) + 1;
            // Note that in this code, "CUR_FRAME" is actually before we
            // have formally allocated a frame, and thus actually represents
            s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
                                         s->s.frames[CUR_FRAME].tf.f->height == h;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            s->s.h.highprecisionmvs = get_bits1(&s->gb);
            s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
            // compound prediction allowed when the refs disagree in sign bias;
            // the fixed ref is the odd one out, the other two are variable
            s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
                                    s->s.h.signbias[0] != s->s.h.signbias[2];
            if (s->s.h.allowcompinter) {
                if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
                    s->s.h.fixcompref = 2;
                    s->s.h.varcompref[0] = 0;
                    s->s.h.varcompref[1] = 1;
                } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
                    s->s.h.fixcompref = 1;
                    s->s.h.varcompref[0] = 0;
                    s->s.h.varcompref[1] = 2;
                    s->s.h.fixcompref = 0;
                    s->s.h.varcompref[0] = 1;
                    s->s.h.varcompref[1] = 2;
    s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
    s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
    s->s.h.framectxid = c = get_bits(&s->gb, 2);
    if (s->s.h.keyframe || s->s.h.intraonly)
        s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes

    /* loopfilter header data */
    if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
        // reset loopfilter defaults
        s->s.h.lf_delta.ref[0] = 1;
        s->s.h.lf_delta.ref[1] = 0;
        s->s.h.lf_delta.ref[2] = -1;
        s->s.h.lf_delta.ref[3] = -1;
        s->s.h.lf_delta.mode[0] = 0;
        s->s.h.lf_delta.mode[1] = 0;
        memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
    s->s.h.filter.level = get_bits(&s->gb, 6);
    sharp = get_bits(&s->gb, 3);
    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
    // the old cache values since they are still valid
    if (s->s.h.filter.sharpness != sharp)
        memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
    s->s.h.filter.sharpness = sharp;
    if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
        if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
            for (i = 0; i < 4; i++)
                if (get_bits1(&s->gb))
                    s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
            for (i = 0; i < 2; i++)
                if (get_bits1(&s->gb))
                    s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);

    /* quantization header data */
    s->s.h.yac_qi = get_bits(&s->gb, 8);
    s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
                      s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
        ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;

    /* segmentation header info */
    if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
        if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
            for (i = 0; i < 7; i++)
                s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
                    get_bits(&s->gb, 8) : 255;
            if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
                for (i = 0; i < 3; i++)
                    s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
                        get_bits(&s->gb, 8) : 255;
        // per-segment feature data (quantizer, loopfilter, ref, skip)
        if (get_bits1(&s->gb)) {
            s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
            for (i = 0; i < 8; i++) {
                if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);

    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
    for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
        int qyac, qydc, quvac, quvdc, lflvl, sh;

        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
            if (s->s.h.segmentation.absolute_vals)
                qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
                qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
            qyac = s->s.h.yac_qi;
        qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
        quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
        quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
        qyac = av_clip_uintp2(qyac, 8);

        s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
        s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
        s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
        s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];

        sh = s->s.h.filter.level >= 32;
        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
            if (s->s.h.segmentation.absolute_vals)
                lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
                lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
            lflvl = s->s.h.filter.level;

        if (s->s.h.lf_delta.enabled) {
            s->s.h.segmentation.feat[i].lflvl[0][0] =
            s->s.h.segmentation.feat[i].lflvl[0][1] =
                av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
            for (j = 1; j < 4; j++) {
                s->s.h.segmentation.feat[i].lflvl[j][0] =
                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
                                             s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
                s->s.h.segmentation.feat[i].lflvl[j][1] =
                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
                                             s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
            memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
                   sizeof(s->s.h.segmentation.feat[i].lflvl));

    /* tiling info */
    if ((res = update_size(ctx, w, h)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
    for (s->s.h.tiling.log2_tile_cols = 0;
         s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
         s->s.h.tiling.log2_tile_cols++) ;
    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
    max = FFMAX(0, max - 1);
    while (max > s->s.h.tiling.log2_tile_cols) {
        if (get_bits1(&s->gb))
            s->s.h.tiling.log2_tile_cols++;
    s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
    s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
    if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
        s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
        // one range coder per tile column
        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
                                 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
            return AVERROR(ENOMEM);

    /* check reference frames */
    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        for (i = 0; i < 3; i++) {
            AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
            int refw = ref->width, refh = ref->height;

            if (ref->format != ctx->pix_fmt) {
                av_log(ctx, AV_LOG_ERROR,
                       "Ref pixfmt (%s) did not match current frame (%s)",
                       av_get_pix_fmt_name(ref->format),
                       av_get_pix_fmt_name(ctx->pix_fmt));
                return AVERROR_INVALIDDATA;
            } else if (refw == w && refh == h) {
                s->mvscale[i][0] = s->mvscale[i][1] = 0;
                // scaled reference: ratio limited to [1/16, 2] per dimension
                if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
                    av_log(ctx, AV_LOG_ERROR,
                           "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
                    return AVERROR_INVALIDDATA;
                s->mvscale[i][0] = (refw << 14) / w;
                s->mvscale[i][1] = (refh << 14) / h;
                s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
                s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;

    // reset probability contexts where the header demands it
    if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
            s->prob_ctx[3].p = vp9_default_probs;
        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
    } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
        s->prob_ctx[c].p = vp9_default_probs;
        memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));

    // next 16 bits is size of the rest of the header (arith-coded)
    s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
    s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;

    data2 = align_get_bits(&s->gb);
    if (size2 > size - (data2 - data)) {
        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
        return AVERROR_INVALIDDATA;
    ff_vp56_init_range_decoder(&s->c, data2, size2);
    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
        return AVERROR_INVALIDDATA;

    // reset adaptation counters for the new frame
    if (s->s.h.keyframe || s->s.h.intraonly) {
        memset(s->counts.coef, 0, sizeof(s->counts.coef));
        memset(s->counts.eob, 0, sizeof(s->counts.eob));
        memset(&s->counts, 0, sizeof(s->counts));

    // FIXME is it faster to not copy here, but do it down in the fw updates
    // as explicit copies if the fw update is missing (and skip the copy upon
    s->prob.p = s->prob_ctx[c].p;

    /* txfm updates */
    if (s->s.h.lossless) {
        s->s.h.txfmmode = TX_4X4;
        s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
        if (s->s.h.txfmmode == 3)
            s->s.h.txfmmode += vp8_rac_get(&s->c);

        if (s->s.h.txfmmode == TX_SWITCHABLE) {
            for (i = 0; i < 2; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx16p[i][j] =
                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 3; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx32p[i][j] =
                            update_prob(&s->c, s->prob.p.tx32p[i][j]);

    /* coef probability updates, per transform size */
    for (i = 0; i < 4; i++) {
        uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
        if (vp8_rac_get(&s->c)) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m >= 3 && l == 0) // dc only has 3 pt
                            for (n = 0; n < 3; n++) {
                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                    p[n] = update_prob(&s->c, r[n]);
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m > 3 && l == 0) // dc only has 3 pt
        if (s->s.h.txfmmode == i)

    /* mode updates */
    for (i = 0; i < 3; i++)
        if (vp56_rac_get_prob_branchy(&s->c, 252))
            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        for (i = 0; i < 7; i++)
            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_mode[i][j] =
                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);

        if (s->s.h.filtermode == FILTER_SWITCHABLE)
            for (i = 0; i < 4; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.filter[i][j] =
                            update_prob(&s->c, s->prob.p.filter[i][j]);

        for (i = 0; i < 4; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);

        if (s->s.h.allowcompinter) {
            s->s.h.comppredmode = vp8_rac_get(&s->c);
            if (s->s.h.comppredmode)
                s->s.h.comppredmode += vp8_rac_get(&s->c);
            if (s->s.h.comppredmode == PRED_SWITCHABLE)
                for (i = 0; i < 5; i++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                            update_prob(&s->c, s->prob.p.comp[i]);
            s->s.h.comppredmode = PRED_SINGLEREF;

        if (s->s.h.comppredmode != PRED_COMPREF) {
            for (i = 0; i < 5; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][0] =
                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][1] =
                        update_prob(&s->c, s->prob.p.single_ref[i][1]);

        if (s->s.h.comppredmode != PRED_SINGLEREF) {
            for (i = 0; i < 5; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.comp_ref[i] =
                        update_prob(&s->c, s->prob.p.comp_ref[i]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 9; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.y_mode[i][j] =
                        update_prob(&s->c, s->prob.p.y_mode[i][j]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.partition[3 - i][j][k] =
                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);

        // mv fields don't use the update_prob subexp model for some reason
        for (i = 0; i < 3; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].classes[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].bits[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.mv_comp[i].class0_fp[j][k] =
                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].fp[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        if (s->s.h.highprecisionmvs) {
            for (i = 0; i < 2; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].class0_hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

    return (data2 - data) + size2;
/* Clamp a candidate motion vector component-wise into the legal range
 * precomputed for the current block (s->min_mv / s->max_mv).
 * NOTE(review): this listing is incomplete — the embedded numbering jumps
 * 1045→1048, so the trailing parameter line (presumably "VP9Context *s)")
 * and the braces are missing from view. */
1045 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1048 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1049 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/* Build the motion-vector prediction (pmv) for reference index 'ref' of the
 * current block, scanning — in priority order — sub-block MVs already decoded
 * in this block (sb >= 1), spatial neighbours with the same reference, the
 * co-located MV of the previous frame, then neighbours/co-located MVs with a
 * *different* reference (sign-flipped when the reference sign bias differs).
 * Falls back to clamping whatever pmv already holds.
 * NOTE(review): the embedded numbering jumps throughout (e.g. 1087→1090,
 * 1142→1144) — macro bodies and several statements are missing from this
 * listing; do not treat it as the complete function. */
1052 static void find_ref_mvs(VP9Context *s,
1053 VP56mv *pmv, int ref, int z, int idx, int sb)
/* Per-block-size neighbour offsets (col, row deltas) scanned below; mirrors
 * libvpx's mv_ref_blocks ordering. */
1055 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1056 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1057 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1058 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1059 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1060 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1061 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1062 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1063 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1064 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1065 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1066 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1067 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1068 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1069 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1070 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1071 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1072 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1073 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1074 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1075 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1076 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1077 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1078 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1079 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1080 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1081 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1084 int row = s->row, col = s->col, row7 = s->row7;
1085 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
/* 0x8000,0x8000 cannot be a real MV, so it doubles as a "no candidate yet"
 * sentinel for the two memo slots below. */
1086 #define INVALID_MV 0x80008000U
1087 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
/* RETURN_DIRECT_MV: accept an already-decoded sub-block MV as the prediction
 * without clamping. NOTE(review): lines 1091, 1093-1095, 1097, 1099-1104 of
 * the macro body are missing here. */
1090 #define RETURN_DIRECT_MV(mv) \
1092 uint32_t m = AV_RN32A(&mv); \
1096 } else if (mem == INVALID_MV) { \
1098 } else if (m != mem) { \
/* sb encodes which sub-block (of a split 8x8) we are predicting for; later
 * sub-blocks may reuse earlier ones' MVs directly. */
1105 if (sb == 2 || sb == 1) {
1106 RETURN_DIRECT_MV(b->mv[0][z]);
1107 } else if (sb == 3) {
1108 RETURN_DIRECT_MV(b->mv[2][z]);
1109 RETURN_DIRECT_MV(b->mv[1][z]);
1110 RETURN_DIRECT_MV(b->mv[0][z]);
/* RETURN_MV: clamp a spatial/temporal candidate and accept it, with the
 * sub-8x8 special case memoised in mem_sub8x8 (see the in-tree BUG note). */
1113 #define RETURN_MV(mv) \
1118 av_assert2(idx == 1); \
1119 av_assert2(mem != INVALID_MV); \
1120 if (mem_sub8x8 == INVALID_MV) { \
1121 clamp_mv(&tmp, &mv, s); \
1122 m = AV_RN32A(&tmp); \
1127 mem_sub8x8 = AV_RN32A(&mv); \
1128 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1129 clamp_mv(&tmp, &mv, s); \
1130 m = AV_RN32A(&tmp); \
1134 /* BUG I'm pretty sure this isn't the intention */ \
1140 uint32_t m = AV_RN32A(&mv); \
1142 clamp_mv(pmv, &mv, s); \
1144 } else if (mem == INVALID_MV) { \
1146 } else if (m != mem) { \
1147 clamp_mv(pmv, &mv, s); \
/* Above neighbour (row - 1): reuse its cached per-edge MV context. */
1154 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1155 if (mv->ref[0] == ref) {
1156 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1157 } else if (mv->ref[1] == ref) {
1158 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
/* Left neighbour — only inside the current tile (tile_col_start bound). */
1161 if (col > s->tile_col_start) {
1162 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1163 if (mv->ref[0] == ref) {
1164 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1165 } else if (mv->ref[1] == ref) {
1166 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1174 // previously coded MVs in this neighbourhood, using same reference frame
1175 for (; i < 8; i++) {
1176 int c = p[i][0] + col, r = p[i][1] + row;
1178 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1179 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1181 if (mv->ref[0] == ref) {
1182 RETURN_MV(mv->mv[0]);
1183 } else if (mv->ref[1] == ref) {
1184 RETURN_MV(mv->mv[1]);
1189 // MV at this position in previous frame, using same reference frame
1190 if (s->s.h.use_last_frame_mvs) {
1191 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
/* Frame-threading: wait for the reference frame's row to be decoded before
 * reading its MVs (skip when the ref was decoded in 2-pass mode). */
1193 if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1194 ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1195 if (mv->ref[0] == ref) {
1196 RETURN_MV(mv->mv[0]);
1197 } else if (mv->ref[1] == ref) {
1198 RETURN_MV(mv->mv[1]);
/* RETURN_SCALE_MV: negate the candidate when the two references have
 * opposite sign bias, then accept it via RETURN_MV. */
1202 #define RETURN_SCALE_MV(mv, scale) \
1205 VP56mv mv_temp = { -mv.x, -mv.y }; \
1206 RETURN_MV(mv_temp); \
1212 // previously coded MVs in this neighbourhood, using different reference frame
1213 for (i = 0; i < 8; i++) {
1214 int c = p[i][0] + col, r = p[i][1] + row;
1216 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1217 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1219 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1220 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1222 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1223 // BUG - libvpx has this condition regardless of whether
1224 // we used the first ref MV and pre-scaling
1225 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1226 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1231 // MV at this position in previous frame, using different reference frame
1232 if (s->s.h.use_last_frame_mvs) {
1233 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1235 // no need to await_progress, because we already did that above
1236 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1237 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1239 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1240 // BUG - libvpx has this condition regardless of whether
1241 // we used the first ref MV and pre-scaling
1242 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1243 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
/* No candidate found anywhere: just clamp whatever pmv already contains. */
1248 clamp_mv(pmv, pmv, s);
1251 #undef RETURN_SCALE_MV
/* Decode one MV component (idx 0 = vertical/y, idx 1 = horizontal/x) from the
 * range coder: sign, magnitude class, then the class-dependent integer,
 * fractional-pel and (optionally, when hp) high-precision bits.  All reads
 * are mirrored into s->counts.mv_comp[idx] for backward adaptation.
 * Returns the signed component delta.
 * NOTE(review): interior lines are missing from this listing (numbering
 * jumps 1261→1265, 1294→1298 etc.) — conditionals guarding the counted
 * branches are not visible here. */
1254 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1256 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1257 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1258 s->prob.p.mv_comp[idx].classes);
1260 s->counts.mv_comp[idx].sign[sign]++;
1261 s->counts.mv_comp[idx].classes[c]++;
/* Class >= 1 path: read c integer bits, then 2 fractional bits, then hp. */
1265 for (n = 0, m = 0; m < c; m++) {
1266 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1268 s->counts.mv_comp[idx].bits[m][bit]++;
1271 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1273 s->counts.mv_comp[idx].fp[bit]++;
1275 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1276 s->counts.mv_comp[idx].hp[bit]++;
1280 // bug in libvpx - we count for bw entropy purposes even if the
1282 s->counts.mv_comp[idx].hp[1]++;
/* Class 0 path: one integer bit, fp tree conditioned on it, optional hp. */
1286 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1287 s->counts.mv_comp[idx].class0[n]++;
1288 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1289 s->prob.p.mv_comp[idx].class0_fp[n]);
1290 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1291 n = (n << 3) | (bit << 1);
1293 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1294 s->counts.mv_comp[idx].class0_hp[bit]++;
1298 // bug in libvpx - we count for bw entropy purposes even if the
1300 s->counts.mv_comp[idx].class0_hp[1]++;
/* Magnitude is stored biased by 1; apply the decoded sign. */
1304 return sign ? -(n + 1) : (n + 1);
/* Fill mv[0] (and mv[1] when the block is compound-predicted) for sub-block
 * 'sb' according to 'mode' (ZEROMV/NEARESTMV/NEARMV/NEWMV): predict via
 * find_ref_mvs(), then for NEWMV add joint-coded component deltas read with
 * read_mv_component().  'hp' selects high-precision deltas and is cleared
 * when the prediction magnitude disallows it (|comp| >= 64).
 * NOTE(review): listing incomplete — e.g. lines 1323-1335 (the body run when
 * hp must be squashed) and 1345-1347 are missing. */
1307 static void fill_mv(VP9Context *s,
1308 VP56mv *mv, int mode, int sb)
1312 if (mode == ZEROMV) {
/* First (or only) reference. */
1317 // FIXME cache this value and reuse for other subblocks
1318 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1319 mode == NEWMV ? -1 : sb);
1320 // FIXME maybe move this code into find_ref_mvs()
1321 if ((mode == NEWMV || sb == -1) &&
1322 !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1336 if (mode == NEWMV) {
1337 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1338 s->prob.p.mv_joint);
1340 s->counts.mv_joint[j]++;
/* MV_JOINT_* flags: vertical first (component idx 0), then horizontal. */
1341 if (j >= MV_JOINT_V)
1342 mv[0].y += read_mv_component(s, 0, hp);
1344 mv[0].x += read_mv_component(s, 1, hp);
/* Second reference of a compound block — same sequence as above. */
1348 // FIXME cache this value and reuse for other subblocks
1349 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1350 mode == NEWMV ? -1 : sb);
1351 if ((mode == NEWMV || sb == -1) &&
1352 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1366 if (mode == NEWMV) {
1367 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1368 s->prob.p.mv_joint);
1370 s->counts.mv_joint[j]++;
1371 if (j >= MV_JOINT_V)
1372 mv[1].y += read_mv_component(s, 0, hp);
1374 mv[1].x += read_mv_component(s, 1, hp);
/* Fill a w x h byte region at 'ptr' (row pitch 'stride') with value 'v',
 * using replicated 16/32/64-bit stores selected by width.  Used to splat
 * per-block context values (e.g. segment ids) into 2-D context maps.
 * NOTE(review): listing incomplete — the width switch/loop structure between
 * the replicated-constant setups (lines 1382-1390, 1392-1398, …) is missing;
 * only the per-width constants and one store survive here. */
1380 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1381 ptrdiff_t stride, int v)
1391 int v16 = v * 0x0101;
1399 uint32_t v32 = v * 0x01010101;
1408 uint64_t v64 = v * 0x0101010101010101ULL;
1414 uint32_t v32 = v * 0x01010101;
1417 AV_WN32A(ptr + 4, v32);
/* Decode the per-block mode information for the current block (position in
 * s->row/s->col, size in b->bs): segment id, skip flag, intra/inter flag,
 * transform size, intra modes or (for inter) references, inter modes, filter
 * and motion vectors; then propagate everything into the above/left context
 * arrays and the per-frame MV/ref storage used by later blocks and frames.
 * NOTE(review): this listing is incomplete — the embedded numbering jumps in
 * many places (1436→1438, 1466→1468, 1494→1497, …), so else-branches,
 * closing braces and some statements are not visible.  Comments below only
 * describe what the surviving lines show. */
1426 static void decode_mode(AVCodecContext *ctx)
/* Partition context nibbles propagated to left/above neighbours, per size. */
1428 static const uint8_t left_ctx[N_BS_SIZES] = {
1429 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1431 static const uint8_t above_ctx[N_BS_SIZES] = {
1432 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
/* Largest transform size permitted for each block size. */
1434 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1435 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1436 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1438 VP9Context *s = ctx->priv_data;
1440 int row = s->row, col = s->col, row7 = s->row7;
1441 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
/* w4/h4: block extent in 8x8 units, clipped to the frame edge. */
1442 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1443 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1444 int have_a = row > 0, have_l = col > s->tile_col_start;
1445 int vref, filter_id;
/* --- segment id ----------------------------------------------------- */
1447 if (!s->s.h.segmentation.enabled) {
1449 } else if (s->s.h.keyframe || s->s.h.intraonly) {
1450 b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1451 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
/* Temporal prediction: reuse the minimum segment id of the co-located area
 * in the reference segmentation map. */
1452 } else if (!s->s.h.segmentation.update_map ||
1453 (s->s.h.segmentation.temporal &&
1454 vp56_rac_get_prob_branchy(&s->c,
1455 s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1456 s->left_segpred_ctx[row7]]))) {
1457 if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1459 uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
/* Frame-threading: make sure the referenced rows are already decoded. */
1461 if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1462 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1463 for (y = 0; y < h4; y++) {
1464 int idx_base = (y + row) * 8 * s->sb_cols + col;
1465 for (x = 0; x < w4; x++)
1466 pred = FFMIN(pred, refsegmap[idx_base + x]);
1468 av_assert1(pred < 8);
1474 memset(&s->above_segpred_ctx[col], 1, w4);
1475 memset(&s->left_segpred_ctx[row7], 1, h4);
1477 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1478 s->s.h.segmentation.prob);
1480 memset(&s->above_segpred_ctx[col], 0, w4);
1481 memset(&s->left_segpred_ctx[row7], 0, h4);
/* Persist the decoded segment id into the current frame's map. */
1483 if (s->s.h.segmentation.enabled &&
1484 (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1485 setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1486 bw4, bh4, 8 * s->sb_cols, b->seg_id);
/* --- skip flag ------------------------------------------------------ */
1489 b->skip = s->s.h.segmentation.enabled &&
1490 s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1492 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1493 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1494 s->counts.skip[c][b->skip]++;
/* --- intra/inter flag ----------------------------------------------- */
1497 if (s->s.h.keyframe || s->s.h.intraonly) {
1499 } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1500 b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1504 if (have_a && have_l) {
1505 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1508 c = have_a ? 2 * s->above_intra_ctx[col] :
1509 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1511 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1512 s->counts.intra[c][bit]++;
/* --- transform size ------------------------------------------------- */
1516 if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1520 c = (s->above_skip_ctx[col] ? max_tx :
1521 s->above_txfm_ctx[col]) +
1522 (s->left_skip_ctx[row7] ? max_tx :
1523 s->left_txfm_ctx[row7]) > max_tx;
1525 c = s->above_skip_ctx[col] ? 1 :
1526 (s->above_txfm_ctx[col] * 2 > max_tx);
1528 } else if (have_l) {
1529 c = s->left_skip_ctx[row7] ? 1 :
1530 (s->left_txfm_ctx[row7] * 2 > max_tx);
/* Unary-coded tx size, capped by max_tx (switch body partly missing). */
1536 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1538 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1540 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1542 s->counts.tx32p[c][b->tx]++;
1545 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1547 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1548 s->counts.tx16p[c][b->tx]++;
1551 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1552 s->counts.tx8p[c][b->tx]++;
1559 b->tx = FFMIN(max_tx, s->s.h.txfmmode);
/* --- intra modes (keyframe path: modes contexted on above/left) ------ */
1562 if (s->s.h.keyframe || s->s.h.intraonly) {
1563 uint8_t *a = &s->above_mode_ctx[col * 2];
1564 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1567 if (b->bs > BS_8x8) {
1568 // FIXME the memory storage intermediates here aren't really
1569 // necessary, they're just there to make the code slightly
1571 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1572 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1573 if (b->bs != BS_8x4) {
1574 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1575 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1576 l[0] = a[1] = b->mode[1];
1578 l[0] = a[1] = b->mode[1] = b->mode[0];
1580 if (b->bs != BS_4x8) {
1581 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1582 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1583 if (b->bs != BS_8x4) {
1584 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1585 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1586 l[1] = a[1] = b->mode[3];
1588 l[1] = a[1] = b->mode[3] = b->mode[2];
1591 b->mode[2] = b->mode[0];
1592 l[1] = a[1] = b->mode[3] = b->mode[1];
1595 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1596 vp9_default_kf_ymode_probs[*a][*l]);
1597 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1598 // FIXME this can probably be optimized
1599 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1600 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1602 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1603 vp9_default_kf_uvmode_probs[b->mode[3]]);
/* --- intra modes (inter-frame path: adaptive y_mode probs) ----------- */
1604 } else if (b->intra) {
1606 if (b->bs > BS_8x8) {
1607 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1608 s->prob.p.y_mode[0]);
1609 s->counts.y_mode[0][b->mode[0]]++;
1610 if (b->bs != BS_8x4) {
1611 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1612 s->prob.p.y_mode[0]);
1613 s->counts.y_mode[0][b->mode[1]]++;
1615 b->mode[1] = b->mode[0];
1617 if (b->bs != BS_4x8) {
1618 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1619 s->prob.p.y_mode[0]);
1620 s->counts.y_mode[0][b->mode[2]]++;
1621 if (b->bs != BS_8x4) {
1622 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 s->prob.p.y_mode[0]);
1624 s->counts.y_mode[0][b->mode[3]]++;
1626 b->mode[3] = b->mode[2];
1629 b->mode[2] = b->mode[0];
1630 b->mode[3] = b->mode[1];
/* y_mode probability set selected by block-size group for >= 8x8 blocks. */
1633 static const uint8_t size_group[10] = {
1634 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1636 int sz = size_group[b->bs];
1638 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1639 s->prob.p.y_mode[sz]);
1640 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1641 s->counts.y_mode[sz][b->mode[3]]++;
1643 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1644 s->prob.p.uv_mode[b->mode[3]]);
1645 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
/* --- inter path ------------------------------------------------------ */
/* Context LUT indexed by (above_mode_ctx, left_mode_ctx); rows/cols >= 10
 * correspond to inter modes (NEWMV etc. are enum values 10..13). */
1647 static const uint8_t inter_mode_ctx_lut[14][14] = {
1648 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1649 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1650 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1651 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1652 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1653 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1654 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1655 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1656 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1657 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1658 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1659 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1660 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1661 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
/* Segment-pinned reference overrides the bitstream read. */
1664 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1665 av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1667 b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1669 // read comp_pred flag
1670 if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1671 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1675 // FIXME add intra as ref=0xff (or -1) to make these easier?
1678 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1680 } else if (s->above_comp_ctx[col]) {
1681 c = 2 + (s->left_intra_ctx[row7] ||
1682 s->left_ref_ctx[row7] == s->s.h.fixcompref);
1683 } else if (s->left_comp_ctx[row7]) {
1684 c = 2 + (s->above_intra_ctx[col] ||
1685 s->above_ref_ctx[col] == s->s.h.fixcompref);
1687 c = (!s->above_intra_ctx[col] &&
1688 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1689 (!s->left_intra_ctx[row7] &&
/* NOTE(review): "row & 7" equals row7 elsewhere in this function — verify
 * this is intentional and merely an inconsistency, not a different index. */
1690 s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1693 c = s->above_comp_ctx[col] ? 3 :
1694 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1696 } else if (have_l) {
1697 c = s->left_comp_ctx[row7] ? 3 :
1698 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1702 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1703 s->counts.comp[c][b->comp]++;
1706 // read actual references
1707 // FIXME probably cache a few variables here to prevent repetitive
1708 // memory accesses below
1709 if (b->comp) /* two references */ {
1710 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1712 b->ref[fix_idx] = s->s.h.fixcompref;
1713 // FIXME can this codeblob be replaced by some sort of LUT?
1716 if (s->above_intra_ctx[col]) {
1717 if (s->left_intra_ctx[row7]) {
1720 c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1722 } else if (s->left_intra_ctx[row7]) {
1723 c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1725 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1727 if (refl == refa && refa == s->s.h.varcompref[1]) {
1729 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1730 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1731 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1734 c = (refa == refl) ? 3 : 1;
1736 } else if (!s->left_comp_ctx[row7]) {
1737 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1740 c = (refl == s->s.h.varcompref[1] &&
1741 refa != s->s.h.varcompref[1]) ? 2 : 4;
1743 } else if (!s->above_comp_ctx[col]) {
1744 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1747 c = (refa == s->s.h.varcompref[1] &&
1748 refl != s->s.h.varcompref[1]) ? 2 : 4;
1751 c = (refl == refa) ? 4 : 2;
1755 if (s->above_intra_ctx[col]) {
1757 } else if (s->above_comp_ctx[col]) {
1758 c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1760 c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1763 } else if (have_l) {
1764 if (s->left_intra_ctx[row7]) {
1766 } else if (s->left_comp_ctx[row7]) {
1767 c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1769 c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1774 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1775 b->ref[var_idx] = s->s.h.varcompref[bit];
1776 s->counts.comp_ref[c][bit]++;
1777 } else /* single reference */ {
/* First single_ref bit: LAST vs (GOLDEN/ALTREF). */
1780 if (have_a && !s->above_intra_ctx[col]) {
1781 if (have_l && !s->left_intra_ctx[row7]) {
1782 if (s->left_comp_ctx[row7]) {
1783 if (s->above_comp_ctx[col]) {
1784 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1785 !s->above_ref_ctx[col]);
1787 c = (3 * !s->above_ref_ctx[col]) +
1788 (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1790 } else if (s->above_comp_ctx[col]) {
1791 c = (3 * !s->left_ref_ctx[row7]) +
1792 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1794 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1796 } else if (s->above_intra_ctx[col]) {
1798 } else if (s->above_comp_ctx[col]) {
1799 c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1801 c = 4 * (!s->above_ref_ctx[col]);
1803 } else if (have_l && !s->left_intra_ctx[row7]) {
/* NOTE(review): given the guard on the previous line, this inner test is
 * always false here, so its (not visible) taken-branch looks unreachable —
 * confirm against upstream before changing. */
1804 if (s->left_intra_ctx[row7]) {
1806 } else if (s->left_comp_ctx[row7]) {
1807 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1809 c = 4 * (!s->left_ref_ctx[row7]);
1814 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1815 s->counts.single_ref[c][0][bit]++;
/* Second single_ref bit: GOLDEN vs ALTREF. */
1819 // FIXME can this codeblob be replaced by some sort of LUT?
1822 if (s->left_intra_ctx[row7]) {
1823 if (s->above_intra_ctx[col]) {
1825 } else if (s->above_comp_ctx[col]) {
1826 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1827 s->above_ref_ctx[col] == 1);
1828 } else if (!s->above_ref_ctx[col]) {
1831 c = 4 * (s->above_ref_ctx[col] == 1);
1833 } else if (s->above_intra_ctx[col]) {
1834 if (s->left_intra_ctx[row7]) {
1836 } else if (s->left_comp_ctx[row7]) {
1837 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1838 s->left_ref_ctx[row7] == 1);
1839 } else if (!s->left_ref_ctx[row7]) {
1842 c = 4 * (s->left_ref_ctx[row7] == 1);
1844 } else if (s->above_comp_ctx[col]) {
1845 if (s->left_comp_ctx[row7]) {
1846 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1847 c = 3 * (s->s.h.fixcompref == 1 ||
1848 s->left_ref_ctx[row7] == 1);
1852 } else if (!s->left_ref_ctx[row7]) {
1853 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1854 s->above_ref_ctx[col] == 1);
1856 c = 3 * (s->left_ref_ctx[row7] == 1) +
1857 (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1859 } else if (s->left_comp_ctx[row7]) {
1860 if (!s->above_ref_ctx[col]) {
1861 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1862 s->left_ref_ctx[row7] == 1);
1864 c = 3 * (s->above_ref_ctx[col] == 1) +
1865 (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1867 } else if (!s->above_ref_ctx[col]) {
1868 if (!s->left_ref_ctx[row7]) {
1871 c = 4 * (s->left_ref_ctx[row7] == 1);
1873 } else if (!s->left_ref_ctx[row7]) {
1874 c = 4 * (s->above_ref_ctx[col] == 1);
1876 c = 2 * (s->left_ref_ctx[row7] == 1) +
1877 2 * (s->above_ref_ctx[col] == 1);
1880 if (s->above_intra_ctx[col] ||
1881 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1883 } else if (s->above_comp_ctx[col]) {
1884 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1886 c = 4 * (s->above_ref_ctx[col] == 1);
1889 } else if (have_l) {
1890 if (s->left_intra_ctx[row7] ||
1891 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1893 } else if (s->left_comp_ctx[row7]) {
1894 c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1896 c = 4 * (s->left_ref_ctx[row7] == 1);
1901 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1902 s->counts.single_ref[c][1][bit]++;
1903 b->ref[0] = 1 + bit;
/* --- inter mode + filter for <= 8x8 blocks -------------------------- */
1908 if (b->bs <= BS_8x8) {
1909 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1910 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
/* Per-size offset into the neighbour mode-ctx arrays. */
1912 static const uint8_t off[10] = {
1913 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1916 // FIXME this needs to use the LUT tables from find_ref_mvs
1917 // because not all are -1,0/0,-1
1918 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1919 [s->left_mode_ctx[row7 + off[b->bs]]];
1921 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1922 s->prob.p.mv_mode[c]);
1923 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
/* Inter modes are enum values 10..13, hence the -10 count offset. */
1924 s->counts.mv_mode[c][b->mode[0] - 10]++;
/* --- interpolation filter ------------------------------------------- */
1928 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1931 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1932 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1933 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1934 s->left_filter_ctx[row7] : 3;
1936 c = s->above_filter_ctx[col];
1938 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1939 c = s->left_filter_ctx[row7];
1944 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1945 s->prob.p.filter[c]);
1946 s->counts.filter[c][filter_id]++;
1947 b->filter = vp9_filter_lut[filter_id];
1949 b->filter = s->s.h.filtermode;
/* --- inter modes + MVs for > 8x8 / split blocks --------------------- */
1952 if (b->bs > BS_8x8) {
1953 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1955 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1956 s->prob.p.mv_mode[c]);
1957 s->counts.mv_mode[c][b->mode[0] - 10]++;
1958 fill_mv(s, b->mv[0], b->mode[0], 0);
1960 if (b->bs != BS_8x4) {
1961 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1962 s->prob.p.mv_mode[c]);
1963 s->counts.mv_mode[c][b->mode[1] - 10]++;
1964 fill_mv(s, b->mv[1], b->mode[1], 1);
1966 b->mode[1] = b->mode[0];
1967 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1968 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1971 if (b->bs != BS_4x8) {
1972 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1973 s->prob.p.mv_mode[c]);
1974 s->counts.mv_mode[c][b->mode[2] - 10]++;
1975 fill_mv(s, b->mv[2], b->mode[2], 2);
1977 if (b->bs != BS_8x4) {
1978 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1979 s->prob.p.mv_mode[c]);
1980 s->counts.mv_mode[c][b->mode[3] - 10]++;
1981 fill_mv(s, b->mv[3], b->mode[3], 3);
1983 b->mode[3] = b->mode[2];
1984 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1985 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1988 b->mode[2] = b->mode[0];
1989 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1990 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1991 b->mode[3] = b->mode[1];
1992 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1993 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
/* Non-split block: decode once, replicate to all four sub-slots. */
1996 fill_mv(s, b->mv[0], b->mode[0], -1);
1997 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1998 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1999 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2000 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2001 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2002 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2005 vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
/* --- splat decoded fields into above/left context arrays -------------
 * Two SPLAT_CTX variants (64-bit-store vs 32-bit-store host) — the #if/#else
 * lines selecting them are missing from this listing. */
2009 #define SPLAT_CTX(var, val, n) \
2011 case 1: var = val; break; \
2012 case 2: AV_WN16A(&var, val * 0x0101); break; \
2013 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2014 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2016 uint64_t v64 = val * 0x0101010101010101ULL; \
2017 AV_WN64A( &var, v64); \
2018 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2023 #define SPLAT_CTX(var, val, n) \
2025 case 1: var = val; break; \
2026 case 2: AV_WN16A(&var, val * 0x0101); break; \
2027 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2029 uint32_t v32 = val * 0x01010101; \
2030 AV_WN32A( &var, v32); \
2031 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2035 uint32_t v32 = val * 0x01010101; \
2036 AV_WN32A( &var, v32); \
2037 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2038 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2039 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2045 switch (bwh_tab[1][b->bs][0]) {
2046 #define SET_CTXS(dir, off, n) \
2048 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2049 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2050 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2051 if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2052 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2053 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2054 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2056 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2057 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2058 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2063 case 1: SET_CTXS(above, col, 1); break;
2064 case 2: SET_CTXS(above, col, 2); break;
2065 case 4: SET_CTXS(above, col, 4); break;
2066 case 8: SET_CTXS(above, col, 8); break;
2068 switch (bwh_tab[1][b->bs][1]) {
2069 case 1: SET_CTXS(left, row7, 1); break;
2070 case 2: SET_CTXS(left, row7, 2); break;
2071 case 4: SET_CTXS(left, row7, 4); break;
2072 case 8: SET_CTXS(left, row7, 8); break;
/* --- MV edge caches consumed by find_ref_mvs() of later blocks -------- */
2077 if (!s->s.h.keyframe && !s->s.h.intraonly) {
2078 if (b->bs > BS_8x8) {
2079 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2081 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2082 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2083 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2084 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2085 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2086 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2087 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2088 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2090 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2092 for (n = 0; n < w4 * 2; n++) {
2093 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2094 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2096 for (n = 0; n < h4 * 2; n++) {
2097 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2098 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
/* --- persist refs/MVs into the frame-wide array (also used as temporal
 * MV predictor for the next frame) ----------------------------------- */
2104 for (y = 0; y < h4; y++) {
2105 int x, o = (row + y) * s->sb_cols * 8 + col;
2106 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2109 for (x = 0; x < w4; x++) {
2113 } else if (b->comp) {
2114 for (x = 0; x < w4; x++) {
2115 mv[x].ref[0] = b->ref[0];
2116 mv[x].ref[1] = b->ref[1];
2117 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2118 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2121 for (x = 0; x < w4; x++) {
2122 mv[x].ref[0] = b->ref[0];
2124 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2130 // FIXME merge cnt/eob arguments?
/*
 * Decode the coefficients of one transform block from the range coder.
 * NOTE(review): this excerpt is elided -- several original lines (including
 * the loop/scope closers and the final return) are missing between the
 * numbered lines; code tokens below are left byte-identical.
 *
 * coef        output coefficient buffer (16- or 32-bit storage depending on
 *             is8bitsperpixel -- see STORE_COEF below)
 * n_coeffs    total coefficients in this transform size
 * is_tx32x32  nonzero for 32x32 blocks (qmul halving path at tp[...] use)
 * cnt / eob   adaptation counters, bumped per decoded token / eob decision
 * p           token probability model, indexed [band][nnz ctx][token]
 * nnz         initial nonzero context from the above/left neighbours
 * scan, nb    coefficient scan order and neighbour table for ctx derivation
 * band_counts coefficients remaining per probability band
 * qmul        dequant factors: qmul[0] for DC (i==0), qmul[1] for AC
 * Returns the end-of-block position (return statement not visible here).
 */
2131 static av_always_inline int
2132 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2133 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2134 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2135 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2136 const int16_t *band_counts, const int16_t *qmul)
2138 int i = 0, band = 0, band_left = band_counts[band];
2139 uint8_t *tp = p[0][nnz];
// cache[] holds per-position token magnitudes so the nnz context of later
// scan positions can be derived from already-decoded neighbours (nb[][]).
2140 uint8_t cache[1024];
2145 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2146 eob[band][nnz][val]++;
2151 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2152 cnt[band][nnz][0]++;
2154 band_left = band_counts[++band];
// new context = rounded average of the two scan neighbours' cached values
2156 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2158 if (++i == n_coeffs)
2159 break; //invalid input; blocks should end with EOB
2164 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2165 cnt[band][nnz][1]++;
2169 // fill in p[3-10] (model fill) - only once per frame for each pos
2171 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2173 cnt[band][nnz][2]++;
2174 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2175 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2176 cache[rc] = val = 2;
2178 val = 3 + vp56_rac_get_prob(c, tp[5]);
2181 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2183 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
// the fixed probabilities (159, 165, ...) below are the VP9 spec's
// hard-coded extra-bit probabilities for the coefficient categories
2184 val = 5 + vp56_rac_get_prob(c, 159);
2186 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2187 val += vp56_rac_get_prob(c, 145);
2191 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2192 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2193 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2194 val += (vp56_rac_get_prob(c, 148) << 1);
2195 val += vp56_rac_get_prob(c, 140);
2197 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2198 val += (vp56_rac_get_prob(c, 155) << 2);
2199 val += (vp56_rac_get_prob(c, 140) << 1);
2200 val += vp56_rac_get_prob(c, 135);
2202 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2203 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2204 val += (vp56_rac_get_prob(c, 157) << 3);
2205 val += (vp56_rac_get_prob(c, 141) << 2);
2206 val += (vp56_rac_get_prob(c, 134) << 1);
2207 val += vp56_rac_get_prob(c, 130);
// cat6: high bit depth streams carry extra top bits (<<17/<<16) so the
// magnitude can cover the wider coefficient range
2210 if (!is8bitsperpixel) {
2212 val += vp56_rac_get_prob(c, 255) << 17;
2213 val += vp56_rac_get_prob(c, 255) << 16;
2215 val += (vp56_rac_get_prob(c, 255) << 15);
2216 val += (vp56_rac_get_prob(c, 255) << 14);
2218 val += (vp56_rac_get_prob(c, 254) << 13);
2219 val += (vp56_rac_get_prob(c, 254) << 12);
2220 val += (vp56_rac_get_prob(c, 254) << 11);
2221 val += (vp56_rac_get_prob(c, 252) << 10);
2222 val += (vp56_rac_get_prob(c, 249) << 9);
2223 val += (vp56_rac_get_prob(c, 243) << 8);
2224 val += (vp56_rac_get_prob(c, 230) << 7);
2225 val += (vp56_rac_get_prob(c, 196) << 6);
2226 val += (vp56_rac_get_prob(c, 177) << 5);
2227 val += (vp56_rac_get_prob(c, 153) << 4);
2228 val += (vp56_rac_get_prob(c, 140) << 3);
2229 val += (vp56_rac_get_prob(c, 133) << 2);
2230 val += (vp56_rac_get_prob(c, 130) << 1);
2231 val += vp56_rac_get_prob(c, 129);
// 8bpp stores one int16 per coefficient; otherwise two 16-bit halves are
// written as a 32-bit aligned store (branch partially elided here)
2235 #define STORE_COEF(c, i, v) do { \
2236 if (is8bitsperpixel) { \
2239 AV_WN32A(&c[i * 2], v); \
2243 band_left = band_counts[++band];
// is_tx32x32 path: dequantized value is halved, matching the spec's
// 32x32 dequant; sign bit is read raw via vp8_rac_get()
2245 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2247 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2248 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2250 } while (++i < n_coeffs);
/* 8bpp, non-32x32 wrapper: is_tx32x32=0, is8bitsperpixel=1, bpp=8. */
2255 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2256 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2257 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2258 const int16_t (*nb)[2], const int16_t *band_counts,
2259 const int16_t *qmul)
2261 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2262 nnz, scan, nb, band_counts, qmul);
/* 8bpp, 32x32 wrapper: is_tx32x32=1, is8bitsperpixel=1, bpp=8. */
2265 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2266 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2267 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2268 const int16_t (*nb)[2], const int16_t *band_counts,
2269 const int16_t *qmul)
2271 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2272 nnz, scan, nb, band_counts, qmul);
/* High bit depth, non-32x32 wrapper: passes the stream's actual bpp. */
2275 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2276 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2277 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2278 const int16_t (*nb)[2], const int16_t *band_counts,
2279 const int16_t *qmul)
2281 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2282 nnz, scan, nb, band_counts, qmul);
/* High bit depth, 32x32 wrapper: is_tx32x32=1, actual bpp from context. */
2285 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2286 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2287 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2288 const int16_t (*nb)[2], const int16_t *band_counts,
2289 const int16_t *qmul)
2291 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2292 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all residual coefficients for the current block (luma then both
 * chroma planes), maintaining the above/left nonzero-context arrays.
 * NOTE(review): excerpt is elided -- switch statements, some closers and
 * the final return are missing between the numbered lines below.
 * Returns (presumably) whether any nonzero coefficient was decoded
 * (total_coeff) -- return statement not visible here; confirm in full source.
 */
2295 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2297 VP9Context *s = ctx->priv_data;
2299 int row = s->row, col = s->col;
// probability / counter tables selected by tx size, plane (y here) and
// intra/inter mode of the block
2300 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2301 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2302 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2303 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
// clip the coefficient area to the visible frame edge
2304 int end_x = FFMIN(2 * (s->cols - col), w4);
2305 int end_y = FFMIN(2 * (s->rows - row), h4);
2306 int n, pl, x, y, res;
2307 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2308 int tx = 4 * s->s.h.lossless + b->tx;
2309 const int16_t * const *yscans = vp9_scans[tx];
2310 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2311 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2312 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2313 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2314 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
// per-tx-size band layout: {1,2,3,4,...} coefficients per probability band;
// last entry is the remainder of the block
2315 static const int16_t band_counts[4][8] = {
2316 { 1, 2, 3, 4, 3, 16 - 13 },
2317 { 1, 2, 3, 4, 11, 64 - 21 },
2318 { 1, 2, 3, 4, 11, 256 - 21 },
2319 { 1, 2, 3, 4, 11, 1024 - 21 },
2321 const int16_t *y_band_counts = band_counts[b->tx];
2322 const int16_t *uv_band_counts = band_counts[b->uvtx];
2323 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2324 int total_coeff = 0;
// MERGE/MERGE_CTX collapse multiple 4x4 nnz-context entries into one
// boolean per larger tx unit before decoding
2326 #define MERGE(la, end, step, rd) \
2327 for (n = 0; n < end; n += step) \
2328 la[n] = !!rd(&la[n])
2329 #define MERGE_CTX(step, rd) \
2331 MERGE(l, end_y, step, rd); \
2332 MERGE(a, end_x, step, rd); \
2335 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2336 for (n = 0, y = 0; y < end_y; y += step) { \
2337 for (x = 0; x < end_x; x += step, n += step * step) { \
2338 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2339 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2340 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2341 c, e, p, a[x] + l[y], yscans[txtp], \
2342 ynbs[txtp], y_band_counts, qmul[0]); \
2343 a[x] = l[y] = !!res; \
2344 total_coeff |= !!res; \
2346 AV_WN16A(&s->eob[n], res); \
// SPLAT re-expands a merged context value back over the covered 4x4 units
// (partially elided; fast 32/64-bit stores vs. memset fallback)
2353 #define SPLAT(la, end, step, cond) \
2355 for (n = 1; n < end; n += step) \
2356 la[n] = la[n - 1]; \
2357 } else if (step == 4) { \
2359 for (n = 0; n < end; n += step) \
2360 AV_WN32A(&la[n], la[n] * 0x01010101); \
2362 for (n = 0; n < end; n += step) \
2363 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2365 } else /* step == 8 */ { \
2367 if (HAVE_FAST_64BIT) { \
2368 for (n = 0; n < end; n += step) \
2369 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2371 for (n = 0; n < end; n += step) { \
2372 uint32_t v32 = la[n] * 0x01010101; \
2373 AV_WN32A(&la[n], v32); \
2374 AV_WN32A(&la[n + 4], v32); \
2378 for (n = 0; n < end; n += step) \
2379 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2382 #define SPLAT_CTX(step) \
2384 SPLAT(a, end_x, step, end_x == w4); \
2385 SPLAT(l, end_y, step, end_y == h4); \
// luma: one case per tx size (enclosing switch elided)
2391 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2394 MERGE_CTX(2, AV_RN16A);
2395 DECODE_Y_COEF_LOOP(2, 0,);
2399 MERGE_CTX(4, AV_RN32A);
2400 DECODE_Y_COEF_LOOP(4, 0,);
2404 MERGE_CTX(8, AV_RN64A);
2405 DECODE_Y_COEF_LOOP(8, 0, 32);
2410 #define DECODE_UV_COEF_LOOP(step, v) \
2411 for (n = 0, y = 0; y < end_y; y += step) { \
2412 for (x = 0; x < end_x; x += step, n += step * step) { \
2413 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2414 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2415 16 * step * step, c, e, p, a[x] + l[y], \
2416 uvscan, uvnb, uv_band_counts, qmul[1]); \
2417 a[x] = l[y] = !!res; \
2418 total_coeff |= !!res; \
2420 AV_WN16A(&s->uveob[pl][n], res); \
2422 s->uveob[pl][n] = res; \
// switch to chroma probability/counter tables
2427 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2428 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2429 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2434 for (pl = 0; pl < 2; pl++) {
2435 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2436 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2439 DECODE_UV_COEF_LOOP(1,);
2442 MERGE_CTX(2, AV_RN16A);
2443 DECODE_UV_COEF_LOOP(2,);
2447 MERGE_CTX(4, AV_RN32A);
2448 DECODE_UV_COEF_LOOP(4,);
2452 MERGE_CTX(8, AV_RN64A);
2453 DECODE_UV_COEF_LOOP(8, 32);
/* 8 bits-per-pixel specialization of decode_coeffs(). */
2462 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2464 return decode_coeffs(ctx, 1);
/* High bit depth (>8bpp) specialization of decode_coeffs(). */
2467 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2469 return decode_coeffs(ctx, 0);
/*
 * Prepare the top/left edge pixels for intra prediction of one transform
 * block and return the (possibly remapped) prediction mode.
 * NOTE(review): excerpt is elided -- several original lines and closers are
 * missing between the numbered lines; code tokens left byte-identical.
 *
 * a           in/out: pointer to the top-edge buffer pointer; edge pixels are
 *             written at (*a)[-1 .. n_px_need + topright)
 * dst_edge /  destination pixels at the block-edge (frame buffer) vs. inner
 * dst_inner   (sbrow-local) position, with their respective strides
 * l           left-edge pixel buffer to fill
 * col,x,row,y block position in 8x8 units (col/row) and 4x4 subunits (x/y)
 * tx          transform size; p plane index; ss_h/ss_v chroma subsampling
 * Returns the mode remapped via mode_conv[] for missing top/left edges.
 */
2472 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2473 uint8_t *dst_edge, ptrdiff_t stride_edge,
2474 uint8_t *dst_inner, ptrdiff_t stride_inner,
2475 uint8_t *l, int col, int x, int w,
2476 int row, int y, enum TxfmMode tx,
2477 int p, int ss_h, int ss_v, int bytesperpixel)
2479 int have_top = row > 0 || y > 0;
// left edge exists inside the tile, or for any non-first sub-block
2480 int have_left = col > s->tile_col_start || x > 0;
2481 int have_right = x < w - 1;
// remap each mode to a DC fallback when the needed edge is unavailable
2483 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2484 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2485 { DC_127_PRED, VERT_PRED } },
2486 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2487 { HOR_PRED, HOR_PRED } },
2488 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2489 { LEFT_DC_PRED, DC_PRED } },
2490 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2491 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2492 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2493 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2494 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2495 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2496 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2497 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2498 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2499 { DC_127_PRED, VERT_LEFT_PRED } },
2500 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2501 { HOR_UP_PRED, HOR_UP_PRED } },
2502 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2503 { HOR_PRED, TM_VP8_PRED } },
// which edges each (remapped) mode actually reads
2505 static const struct {
2506 uint8_t needs_left:1;
2507 uint8_t needs_top:1;
2508 uint8_t needs_topleft:1;
2509 uint8_t needs_topright:1;
2510 uint8_t invert_left:1;
2511 } edges[N_INTRA_PRED_MODES] = {
2512 [VERT_PRED] = { .needs_top = 1 },
2513 [HOR_PRED] = { .needs_left = 1 },
2514 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2515 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2516 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2517 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2518 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2519 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2520 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2521 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2522 [LEFT_DC_PRED] = { .needs_left = 1 },
2523 [TOP_DC_PRED] = { .needs_top = 1 },
2524 [DC_128_PRED] = { 0 },
2525 [DC_127_PRED] = { 0 },
2526 [DC_129_PRED] = { 0 }
2529 av_assert2(mode >= 0 && mode < 10);
2530 mode = mode_conv[mode][have_left][have_top];
2531 if (edges[mode].needs_top) {
2532 uint8_t *top, *topleft;
2533 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2534 int n_px_need_tr = 0;
2536 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2539 // if top of sb64-row, use s->intra_pred_data[] instead of
2540 // dst[-stride] for intra prediction (it contains pre- instead of
2541 // post-loopfilter data)
2543 top = !(row & 7) && !y ?
2544 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2545 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2547 topleft = !(row & 7) && !y ?
2548 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2549 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2550 &dst_inner[-stride_inner];
// fast path (condition head elided): edges fully available in place
2554 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2555 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2556 n_px_need + n_px_need_tr <= n_px_have) {
2560 if (n_px_need <= n_px_have) {
2561 memcpy(*a, top, n_px_need * bytesperpixel);
// bit-depth-agnostic memset: 8bpp bytes vs. 16-bit replicated stores
2563 #define memset_bpp(c, i1, v, i2, num) do { \
2564 if (bytesperpixel == 1) { \
2565 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2567 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2568 for (n = 0; n < (num); n++) { \
2569 AV_WN16A(&(c)[((i1) + n) * 2], val); \
// pad missing top pixels by repeating the last available one
2573 memcpy(*a, top, n_px_have * bytesperpixel);
2574 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2577 #define memset_val(c, val, num) do { \
2578 if (bytesperpixel == 1) { \
2579 memset((c), (val), (num)); \
2582 for (n = 0; n < (num); n++) { \
2583 AV_WN16A(&(c)[n * 2], (val)); \
// no top edge at all: fill with the bit-depth-scaled DC-127 constant
2587 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2589 if (edges[mode].needs_topleft) {
2590 if (have_left && have_top) {
2591 #define assign_bpp(c, i1, v, i2) do { \
2592 if (bytesperpixel == 1) { \
2593 (c)[(i1)] = (v)[(i2)]; \
2595 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2598 assign_bpp(*a, -1, topleft, -1);
2600 #define assign_val(c, i, v) do { \
2601 if (bytesperpixel == 1) { \
2604 AV_WN16A(&(c)[(i) * 2], (v)); \
2607 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2610 if (tx == TX_4X4 && edges[mode].needs_topright) {
2611 if (have_top && have_right &&
2612 n_px_need + n_px_need_tr <= n_px_have) {
2613 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2615 memset_bpp(*a, 4, *a, 3, 4);
2620 if (edges[mode].needs_left) {
2622 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2623 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2624 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
// invert_left (HOR_UP): left column copied top-to-bottom vs. reversed
2626 if (edges[mode].invert_left) {
2627 if (n_px_need <= n_px_have) {
2628 for (i = 0; i < n_px_need; i++)
2629 assign_bpp(l, i, &dst[i * stride], -1);
2631 for (i = 0; i < n_px_have; i++)
2632 assign_bpp(l, i, &dst[i * stride], -1);
2633 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2636 if (n_px_need <= n_px_have) {
2637 for (i = 0; i < n_px_need; i++)
2638 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2640 for (i = 0; i < n_px_have; i++)
2641 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2642 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
// no left edge: fill with the DC-129 constant for this bit depth
2646 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Reconstruct an intra-coded block: per transform sub-block, build the edge
 * pixels (check_intra_mode), run the dsp intra predictor, then add the
 * inverse-transformed residual when the sub-block has coefficients.
 * Luma loop first, then both chroma planes.
 * NOTE(review): excerpt is elided; some lines/closers are missing between
 * the numbered lines below.
 */
2653 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2654 ptrdiff_t uv_off, int bytesperpixel)
2656 VP9Context *s = ctx->priv_data;
2658 int row = s->row, col = s->col;
2659 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2660 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2661 int end_x = FFMIN(2 * (s->cols - col), w4);
2662 int end_y = FFMIN(2 * (s->rows - row), h4);
2663 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2664 int uvstep1d = 1 << b->uvtx, p;
// dst: sbrow-local buffer; dst_r: the actual frame buffer (edge source)
2665 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2666 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2667 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2669 for (n = 0, y = 0; y < end_y; y += step1d) {
2670 uint8_t *ptr = dst, *ptr_r = dst_r;
2671 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2672 ptr_r += 4 * step1d * bytesperpixel, n += step) {
// sub-8x8 blocks carry one mode per 4x4; larger blocks a single mode
2673 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2675 uint8_t *a = &a_buf[32];
2676 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2677 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2679 mode = check_intra_mode(s, mode, &a, ptr_r,
2680 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2681 ptr, s->y_stride, l,
2682 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2683 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2685 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2686 s->block + 16 * n * bytesperpixel, eob);
2688 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2689 dst += 4 * step1d * s->y_stride;
// chroma: same walk with uv tx size; chroma always uses DCT_DCT
2696 step = 1 << (b->uvtx * 2);
2697 for (p = 0; p < 2; p++) {
2698 dst = s->dst[1 + p];
2699 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2700 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2701 uint8_t *ptr = dst, *ptr_r = dst_r;
2702 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2703 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2704 int mode = b->uvmode;
2705 uint8_t *a = &a_buf[32];
2706 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2708 mode = check_intra_mode(s, mode, &a, ptr_r,
2709 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2710 ptr, s->uv_stride, l, col, x, w4, row, y,
2711 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2712 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2714 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2715 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2717 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2718 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp wrapper around intra_recon() (bytesperpixel = 1). */
2723 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2725 intra_recon(ctx, y_off, uv_off, 1);
/* High bit depth wrapper around intra_recon() (bytesperpixel = 2). */
2728 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2730 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation when reference and current frame have the same
 * dimensions. Waits for the referenced rows to be decoded (frame-threading),
 * falls back to emulated_edge_mc when the filter taps would read outside
 * the reference frame. NOTE(review): excerpt is elided (mv scaling of x/y
 * and some lines are missing between the numbered lines).
 */
2733 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2734 uint8_t *dst, ptrdiff_t dst_stride,
2735 const uint8_t *ref, ptrdiff_t ref_stride,
2736 ThreadFrame *ref_frame,
2737 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2738 int bw, int bh, int w, int h, int bytesperpixel)
2740 int mx = mv->x, my = mv->y, th;
2744 ref += y * ref_stride + x * bytesperpixel;
2747 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2748 // we use +7 because the last 7 pixels of each sbrow can be changed in
2749 // the longest loopfilter of the next sbrow
2750 th = (y + bh + 4 * !!my + 7) >> 6;
2751 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
// 8-tap filter reads 3 px before / 4 px after: spill => edge emulation
2752 if (x < !!mx * 3 || y < !!my * 3 ||
2753 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2754 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2755 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2757 bw + !!mx * 7, bh + !!my * 7,
2758 x - !!mx * 3, y - !!my * 3, w, h);
// 160 is the edge_emu_buffer stride used for the luma emulation path
2759 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2762 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma motion compensation (both U and V planes), unscaled reference.
 * Same structure as mc_luma_unscaled() but the mv is pre-scaled by the
 * chroma subsampling factors and each plane gets its own edge emulation.
 * NOTE(review): excerpt is elided between the numbered lines.
 */
2765 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2766 uint8_t *dst_u, uint8_t *dst_v,
2767 ptrdiff_t dst_stride,
2768 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2769 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2770 ThreadFrame *ref_frame,
2771 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2772 int bw, int bh, int w, int h, int bytesperpixel)
// double the mv component when the corresponding axis is NOT subsampled
2774 int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2778 ref_u += y * src_stride_u + x * bytesperpixel;
2779 ref_v += y * src_stride_v + x * bytesperpixel;
2782 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2783 // we use +7 because the last 7 pixels of each sbrow can be changed in
2784 // the longest loopfilter of the next sbrow
2785 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2786 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2787 if (x < !!mx * 3 || y < !!my * 3 ||
2788 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2789 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2790 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2792 bw + !!mx * 7, bh + !!my * 7,
2793 x - !!mx * 3, y - !!my * 3, w, h);
2794 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2795 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
// V plane reuses the same edge_emu_buffer after the U-plane mc is done
2797 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2798 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2800 bw + !!mx * 7, bh + !!my * 7,
2801 x - !!mx * 3, y - !!my * 3, w, h);
2802 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2803 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2805 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2806 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
// Glue for the unscaled inter-prediction template: mc_luma_dir/mc_chroma_dir
// map the template's generic call sites onto the unscaled mc helpers (the
// px/py/pw/ph prediction-area args are unused in the unscaled case), then
// vp9_mc_template.c is included twice to instantiate 8bpp and 16bpp
// versions via the FN()/BYTES_PER_PIXEL macros.
2810 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2811 px, py, pw, ph, bw, bh, w, h, i) \
2812 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2813 mv, bw, bh, w, h, bytesperpixel)
2814 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2815 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2816 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2817 row, col, mv, bw, bh, w, h, bytesperpixel)
2819 #define FN(x) x##_8bpp
2820 #define BYTES_PER_PIXEL 1
2821 #include "vp9_mc_template.c"
2823 #undef BYTES_PER_PIXEL
2824 #define FN(x) x##_16bpp
2825 #define BYTES_PER_PIXEL 2
2826 #include "vp9_mc_template.c"
2828 #undef mc_chroma_dir
2830 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation when the reference frame has a different size
 * than the current frame: scales the motion vector (reproducing libvpx's
 * per-component rounding, see BUG note) and uses the scaled-mc dsp function.
 * Falls back to mc_luma_unscaled() when dimensions actually match.
 * NOTE(review): excerpt is elided between the numbered lines.
 */
2833 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2834 vp9_mc_func (*mc)[2],
2835 uint8_t *dst, ptrdiff_t dst_stride,
2836 const uint8_t *ref, ptrdiff_t ref_stride,
2837 ThreadFrame *ref_frame,
2838 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2839 int px, int py, int pw, int ph,
2840 int bw, int bh, int w, int h, int bytesperpixel,
2841 const uint16_t *scale, const uint8_t *step)
2843 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2844 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2845 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2846 y, x, in_mv, bw, bh, w, h, bytesperpixel);
// 14-bit fixed-point reference scaling factor per dimension
2848 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2850 int refbw_m1, refbh_m1;
// clamp the mv so the prediction area stays within a bounded margin of
// the visible frame
2854 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2855 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2856 // BUG libvpx seems to scale the two components separately. This introduces
2857 // rounding errors but we have to reproduce them to be exactly compatible
2858 // with the output from libvpx...
2859 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2860 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2864 ref += y * ref_stride + x * bytesperpixel;
// reference area actually touched after scaled stepping, minus one
2867 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2868 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2869 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2870 // we use +7 because the last 7 pixels of each sbrow can be changed in
2871 // the longest loopfilter of the next sbrow
2872 th = (y + refbh_m1 + 4 + 7) >> 6;
2873 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2874 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2875 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2876 ref - 3 * ref_stride - 3 * bytesperpixel,
2878 refbw_m1 + 8, refbh_m1 + 8,
2879 x - 3, y - 3, w, h);
// 288 is the edge_emu_buffer stride used by the scaled-mc path
2880 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2883 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma motion compensation with a scaled reference frame (U and V).
 * Mirrors mc_luma_scaled(), including the libvpx mv-clamping bug
 * (webm issue 820) reproduced for bit-exact output on subsampled axes.
 * NOTE(review): excerpt is elided between the numbered lines.
 */
2887 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2888 vp9_mc_func (*mc)[2],
2889 uint8_t *dst_u, uint8_t *dst_v,
2890 ptrdiff_t dst_stride,
2891 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2892 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2893 ThreadFrame *ref_frame,
2894 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2895 int px, int py, int pw, int ph,
2896 int bw, int bh, int w, int h, int bytesperpixel,
2897 const uint16_t *scale, const uint8_t *step)
2899 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2900 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2901 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2902 ref_v, src_stride_v, ref_frame,
2903 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2906 int refbw_m1, refbh_m1;
// subsampled-x branch: clamp/scale in chroma units (libvpx-compatible)
2911 // BUG https://code.google.com/p/webm/issues/detail?id=820
2912 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2913 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2915 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2916 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2919 // BUG https://code.google.com/p/webm/issues/detail?id=820
2920 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2921 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2923 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2924 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2929 ref_u += y * src_stride_u + x * bytesperpixel;
2930 ref_v += y * src_stride_v + x * bytesperpixel;
2933 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2934 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2935 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2936 // we use +7 because the last 7 pixels of each sbrow can be changed in
2937 // the longest loopfilter of the next sbrow
2938 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2939 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2940 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2941 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2942 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2944 refbw_m1 + 8, refbh_m1 + 8,
2945 x - 3, y - 3, w, h);
2946 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2947 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
// V plane reuses the same edge_emu_buffer after U is predicted
2949 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2950 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2952 refbw_m1 + 8, refbh_m1 + 8,
2953 x - 3, y - 3, w, h);
2954 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2955 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2957 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2958 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
// Glue for the scaled inter-prediction template: same template file, but the
// mc_luma_dir/mc_chroma_dir macros now route to the scaled helpers and pass
// the per-reference mvscale/mvstep tables; instantiated for 8bpp and 16bpp.
2963 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2964 px, py, pw, ph, bw, bh, w, h, i) \
2965 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2966 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2967 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2968 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2969 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2970 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2971 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2972 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2974 #define FN(x) x##_scaled_8bpp
2975 #define BYTES_PER_PIXEL 1
2976 #include "vp9_mc_template.c"
2978 #undef BYTES_PER_PIXEL
2979 #define FN(x) x##_scaled_16bpp
2980 #define BYTES_PER_PIXEL 2
2981 #include "vp9_mc_template.c"
2983 #undef mc_chroma_dir
2985 #undef BYTES_PER_PIXEL
/*
 * Reconstruct an inter-coded block: dispatch inter prediction (scaled vs.
 * unscaled, 8bpp vs. high bit depth), then add the inverse-transformed
 * residuals for luma and both chroma planes, mirroring intra_recon()'s
 * loop structure. NOTE(review): excerpt is elided between numbered lines.
 */
2988 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2990 VP9Context *s = ctx->priv_data;
2992 int row = s->row, col = s->col;
// a nonzero mvscale[ref][0] marks a reference with different dimensions
2994 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2995 if (bytesperpixel == 1) {
2996 inter_pred_scaled_8bpp(ctx);
2998 inter_pred_scaled_16bpp(ctx);
3001 if (bytesperpixel == 1) {
3002 inter_pred_8bpp(ctx);
3004 inter_pred_16bpp(ctx);
3008 /* mostly copied intra_recon() */
3010 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3011 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3012 int end_x = FFMIN(2 * (s->cols - col), w4);
3013 int end_y = FFMIN(2 * (s->rows - row), h4);
3014 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3015 int uvstep1d = 1 << b->uvtx, p;
3016 uint8_t *dst = s->dst[0];
// luma residual add; inter blocks always use DCT_DCT
3019 for (n = 0, y = 0; y < end_y; y += step1d) {
3021 for (x = 0; x < end_x; x += step1d,
3022 ptr += 4 * step1d * bytesperpixel, n += step) {
3023 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3026 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3027 s->block + 16 * n * bytesperpixel, eob);
3029 dst += 4 * s->y_stride * step1d;
// chroma residual add for both planes
3035 step = 1 << (b->uvtx * 2);
3036 for (p = 0; p < 2; p++) {
3037 dst = s->dst[p + 1];
3038 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3040 for (x = 0; x < end_x; x += uvstep1d,
3041 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3042 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3045 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3046 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3048 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp wrapper around inter_recon() (bytesperpixel = 1). */
3054 static void inter_recon_8bpp(AVCodecContext *ctx)
3056 inter_recon(ctx, 1);
/* High bit depth wrapper around inter_recon() (bytesperpixel = 2). */
3059 static void inter_recon_16bpp(AVCodecContext *ctx)
3061 inter_recon(ctx, 2);
/*
 * Build the loopfilter edge masks for one block: for every 8-pixel row of
 * the superblock, set which column bits need 16-, 8-, 4-wide or inner-4
 * filtering (mask[0] = column edges, mask[1] = row edges), as a function
 * of transform size, chroma subsampling and skip state.
 * NOTE(review): excerpt is elided -- several lines and closers are missing
 * between the numbered lines below; code tokens left byte-identical.
 *
 * row_and_7 / col_and_7: block position within the 64x64 superblock, in
 * 8-px units. w/h: block size in the same units. col_end/row_end: whether
 * the block touches the right/bottom picture edge (odd-size handling).
 */
3064 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3065 int row_and_7, int col_and_7,
3066 int w, int h, int col_end, int row_end,
3067 enum TxfmMode tx, int skip_inter)
3069 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3070 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3072 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3073 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3074 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3075 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3077 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3078 // edges. This means that for UV, we work on two subsampled blocks at
3079 // a time, and we only use the topleft block's mode information to set
3080 // things like block strength. Thus, for any block size smaller than
3081 // 16x16, ignore the odd portion of the block.
3082 if (tx == TX_4X4 && (ss_v | ss_h)) {
3097 if (tx == TX_4X4 && !skip_inter) {
// t = bit for this column; m_col = contiguous bits covering the block
3098 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3099 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3100 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3102 for (y = row_and_7; y < h + row_and_7; y++) {
3103 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3105 mask[0][y][1] |= m_row_8;
3106 mask[0][y][2] |= m_row_4;
3107 // for odd lines, if the odd col is not being filtered,
3108 // skip odd row also:
3115 // if a/c are even row/col and b/d are odd, and d is skipped,
3116 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3117 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3118 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3120 mask[1][y][col_mask_id] |= m_col;
3123 mask[0][y][3] |= m_col;
3125 if (ss_h && (col_end & 1))
3126 mask[1][y][3] |= (t << (w - 1)) - t;
3128 mask[1][y][3] |= m_col;
// non-4x4 or skipped-inter path (branch head partially elided)
3132 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3135 int mask_id = (tx == TX_8X8);
3136 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3137 int l2 = tx + ss_h - 1, step1d;
3138 int m_row = m_col & masks[l2];
3140 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3141 // 8wd loopfilter to prevent going off the visible edge.
3142 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3143 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3144 int m_row_8 = m_row - m_row_16;
3146 for (y = row_and_7; y < h + row_and_7; y++) {
3147 mask[0][y][0] |= m_row_16;
3148 mask[0][y][1] |= m_row_8;
3151 for (y = row_and_7; y < h + row_and_7; y++)
3152 mask[0][y][mask_id] |= m_row;
3157 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3158 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3159 mask[1][y][0] |= m_col;
3160 if (y - row_and_7 == h - 1)
3161 mask[1][y][1] |= m_col;
3163 for (y = row_and_7; y < h + row_and_7; y += step1d)
3164 mask[1][y][mask_id] |= m_col;
3166 } else if (tx != TX_4X4) {
3169 mask_id = (tx == TX_8X8) || (h == ss_v);
3170 mask[1][row_and_7][mask_id] |= m_col;
3171 mask_id = (tx == TX_8X8) || (w == ss_h);
3172 for (y = row_and_7; y < h + row_and_7; y++)
3173 mask[0][y][mask_id] |= t;
3175 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3177 for (y = row_and_7; y < h + row_and_7; y++) {
3178 mask[0][y][2] |= t4;
3179 mask[0][y][1] |= t8;
3181 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/* Decode and reconstruct one block at mi position (row, col).
 *
 * NOTE(review): this excerpt is missing physical lines (the embedded
 * original numbering is non-contiguous), so comments describe only what
 * the visible code shows.
 *
 * Visible responsibilities: clamp the MV search range, derive the chroma
 * transform size, decode residual coefficients (8bpp/16bpp paths), advance
 * the per-block coefficient/EOB pointers, set up edge-emulation temp
 * buffers, run intra/inter reconstruction, copy emulated pixels back into
 * the frame, and record loop-filter levels and edge masks. */
3186 static void decode_b(AVCodecContext *ctx, int row, int col,
3187 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3188 enum BlockLevel bl, enum BlockPartition bp)
3190 VP9Context *s = ctx->priv_data;
3192 enum BlockSize bs = bl * 3 + bp;
3193 int bytesperpixel = s->bytesperpixel;
3194 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3196 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
/* Clamp the MV range so vectors stay within 128 units of the frame edge
 * around this block (units appear to be 1/8-pel given the *64 scaling of
 * 8px mi units — TODO confirm against MV decoding code). */
3202 s->min_mv.x = -(128 + col * 64);
3203 s->min_mv.y = -(128 + row * 64);
3204 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3205 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
/* Chroma tx is one step smaller than luma tx when subsampling makes the
 * chroma block smaller than the luma transform in that dimension. */
3211 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3212 (s->ss_v && h4 * 2 == (1 << b->tx)));
/* Residual coefficient decoding, separate 8bpp/16bpp implementations. */
3217 if (bytesperpixel == 1) {
3218 has_coeffs = decode_coeffs_8bpp(ctx);
3220 has_coeffs = decode_coeffs_16bpp(ctx);
/* Small inter block with no coefficients: mark the whole area as skipped
 * in the above/left skip contexts. */
3222 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3224 memset(&s->above_skip_ctx[col], 1, w4);
3225 memset(&s->left_skip_ctx[s->row7], 1, h4);
/* Zero n bytes of non-zero-coefficient context with the widest store
 * available for that size. */
3230 #define SPLAT_ZERO_CTX(v, n) \
3232 case 1: v = 0; break; \
3233 case 2: AV_ZERO16(&v); break; \
3234 case 4: AV_ZERO32(&v); break; \
3235 case 8: AV_ZERO64(&v); break; \
3236 case 16: AV_ZERO128(&v); break; \
/* Zero luma + both chroma nnz contexts for one direction (above/left);
 * chroma spans are halved when subsampled in that direction. */
3238 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3240 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3241 if (s->ss_##dir2) { \
3242 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3243 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3245 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3246 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3251 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3252 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3253 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3254 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3257 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3258 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3259 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3260 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
/* Advance the per-block coefficient and EOB pointers past this block
 * (chroma scaled down by the subsampling shifts). */
3266 s->block += w4 * h4 * 64 * bytesperpixel;
3267 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3268 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3269 s->eob += 4 * w4 * h4;
3270 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3271 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3277 // emulated overhangs if the stride of the target buffer can't hold. This
3278 // makes it possible to support emu-edge and so on even if we have large block
/* emu[0]/emu[1]: luma/chroma writes would overflow the frame stride or
 * run past the bottom → reconstruct into s->tmp_* instead. */
3280 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3281 (row + h4) > s->rows;
3282 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3283 (row + h4) > s->rows;
3285 s->dst[0] = s->tmp_y;
3288 s->dst[0] = f->data[0] + yoff;
3289 s->y_stride = f->linesize[0];
3292 s->dst[1] = s->tmp_uv[0];
3293 s->dst[2] = s->tmp_uv[1];
3296 s->dst[1] = f->data[1] + uvoff;
3297 s->dst[2] = f->data[2] + uvoff;
3298 s->uv_stride = f->linesize[1];
/* Reconstruction proper, intra vs inter, per bit depth. */
3302 intra_recon_16bpp(ctx, yoff, uvoff);
3304 intra_recon_8bpp(ctx, yoff, uvoff);
3308 inter_recon_16bpp(ctx);
3310 inter_recon_8bpp(ctx);
/* Copy the emulated luma reconstruction from tmp_y back into the frame,
 * in power-of-two-wide strips via the dsp copy (mc[..][0][0][0][0])
 * functions; tmp buffers use a fixed 128-byte stride. */
3314 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3316 for (n = 0; o < w; n++) {
3321 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3322 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
/* Same copy-back for both chroma planes, dimensions scaled by ss_h/ss_v. */
3328 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3329 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3331 for (n = s->ss_h; o < w; n++) {
3336 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3337 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3338 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3339 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3345 // pick filter level and find edges to apply filter to
3346 if (s->s.h.filter.level &&
3347 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3348 [b->mode[3] != ZEROMV]) > 0) {
3349 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3350 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
/* Record the filter level over the block area, then build the luma edge
 * masks; chroma masks only when either direction is subsampled, with
 * odd-edge adjustments at the right/bottom frame border. */
3352 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3353 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3354 if (s->ss_h || s->ss_v)
3355 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3356 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3357 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3358 b->uvtx, skip_inter);
/* Lazily fill the limit/mblimit LUT entries for this level, derived from
 * the sharpness setting (lim_lut[lvl]==0 means "not yet computed"). */
3360 if (!s->filter_lut.lim_lut[lvl]) {
3361 int sharp = s->s.h.filter.sharpness;
3365 limit >>= (sharp + 3) >> 2;
3366 limit = FFMIN(limit, 9 - sharp);
3368 limit = FFMAX(limit, 1);
3370 s->filter_lut.lim_lut[lvl] = limit;
3371 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
/* Second pointer-advance path (presumably the non-skip path mirroring the
 * advance above — the branch structure is not fully visible here). */
3377 s->block += w4 * h4 * 64 * bytesperpixel;
3378 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3379 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3380 s->eob += 4 * w4 * h4;
3381 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3382 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
/* Recursively decode a superblock subtree rooted at (row, col) at level bl.
 *
 * Reads the partition decision from the range coder (context c combines
 * the above/left partition contexts), then either decodes one block or
 * recurses into halves/quarters. Near the right/bottom frame edge, where
 * a full partition symbol cannot occur, only the legal split decisions are
 * coded (the vp56_rac_get_prob_branchy branches below).
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3386 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3387 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3389 VP9Context *s = ctx->priv_data;
/* Partition probability context from above/left partition bits at this level. */
3390 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3391 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
/* Keyframes/intra-only frames use the fixed default partition probs. */
3392 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3393 s->prob.p.partition[bl][c];
3394 enum BlockPartition bp;
3395 ptrdiff_t hbs = 4 >> bl;
3396 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3397 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3398 int bytesperpixel = s->bytesperpixel;
3401 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3402 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3403 } else if (col + hbs < s->cols) { // FIXME why not <=?
3404 if (row + hbs < s->rows) { // FIXME why not <=?
/* Fully inside the frame: read the full 4-way partition symbol. */
3405 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3407 case PARTITION_NONE:
3408 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* PARTITION_H: top half, then bottom half (offset by hbs rows of 8px). */
3411 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3412 yoff += hbs * 8 * y_stride;
3413 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3414 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
/* PARTITION_V: left half, then right half. */
3417 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3418 yoff += hbs * 8 * bytesperpixel;
3419 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3420 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3422 case PARTITION_SPLIT:
/* Recurse into the four quadrants at the next block level. */
3423 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3424 decode_sb(ctx, row, col + hbs, lflvl,
3425 yoff + 8 * hbs * bytesperpixel,
3426 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3427 yoff += hbs * 8 * y_stride;
3428 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3429 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3430 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3431 yoff + 8 * hbs * bytesperpixel,
3432 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
/* Bottom edge: only split-vs-horizontal is codable; one branch bit. */
3437 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3438 bp = PARTITION_SPLIT;
3439 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3440 decode_sb(ctx, row, col + hbs, lflvl,
3441 yoff + 8 * hbs * bytesperpixel,
3442 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3445 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* Right edge: only split-vs-vertical is codable. */
3447 } else if (row + hbs < s->rows) { // FIXME why not <=?
3448 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3449 bp = PARTITION_SPLIT;
3450 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3451 yoff += hbs * 8 * y_stride;
3452 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3453 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3456 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* Bottom-right corner: split is the only possibility. */
3459 bp = PARTITION_SPLIT;
3460 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
/* Count the chosen partition for backward probability adaptation. */
3462 s->counts.partition[bl][c][bp]++;
/* Second-pass variant of decode_sb(): instead of reading partition
 * decisions from the range coder, it replays the block structure stored
 * in s->b (b->bl / b->bp) from the first pass and re-runs decode_b()
 * over the same layout.
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3465 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3466 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3468 VP9Context *s = ctx->priv_data;
3470 ptrdiff_t hbs = 4 >> bl;
3471 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3472 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3473 int bytesperpixel = s->bytesperpixel;
/* Leaf at the smallest level: must be an 8x8 block. */
3476 av_assert2(b->bl == BL_8X8);
3477 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3478 } else if (s->b->bl == bl) {
/* Stored block sits exactly at this level: decode it, plus the second
 * half for H/V partitions when it fits inside the frame. */
3479 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3480 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3481 yoff += hbs * 8 * y_stride;
3482 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3483 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3484 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3485 yoff += hbs * 8 * bytesperpixel;
3486 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3487 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
/* Otherwise the stored block is deeper: recurse into the quadrants that
 * lie inside the frame (mirrors decode_sb's split handling). */
3490 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3491 if (col + hbs < s->cols) { // FIXME why not <=?
3492 if (row + hbs < s->rows) {
3493 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3494 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3495 yoff += hbs * 8 * y_stride;
3496 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3497 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3498 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3499 yoff + 8 * hbs * bytesperpixel,
3500 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3502 yoff += hbs * 8 * bytesperpixel;
3503 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3504 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3506 } else if (row + hbs < s->rows) {
3507 yoff += hbs * 8 * y_stride;
3508 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3509 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/* Apply the loop filter to vertical edges (edges between horizontally
 * adjacent blocks) of one plane of a 64x64 superblock.
 *
 * lvl holds per-8x8 filter levels (L, with H = L >> 4 as the high-edge
 * threshold), mask the per-row column-edge bitmasks built by mask_edges():
 * index [y][0..2] = 16/8/4-wide edges, [y][3] = inner 4px edges.
 * ls is the plane line stride. E/I thresholds come from the mblim/lim LUTs.
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3514 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3515 uint8_t *lvl, uint8_t (*mask)[4],
3516 uint8_t *dst, ptrdiff_t ls)
3518 int y, x, bytesperpixel = s->bytesperpixel;
3520 // filter edges between columns (e.g. block1 | block2)
/* Two mask rows (hmask1/hmask2) are consumed per iteration so that a pair
 * of vertically stacked 8px edges can be merged into one 16px filter call. */
3521 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3522 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3523 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3524 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3525 unsigned hm = hmask1[0] | hm1 | hm2 | hm13 | hm23, x;
3527 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3530 int L = *l, H = L >> 4;
3531 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
/* 16-wide edge in the top half; if the bottom half also has one at the
 * same x and the same level, use the combined 16px filter. */
3533 if (hmask1[0] & x) {
3534 if (hmask2[0] & x) {
3535 av_assert2(l[8 << ss_v] == L);
3536 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3538 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
/* Top-half 8/4 edge; pair with a bottom-half edge into a mix2 call when
 * present (second level's E/I packed into the high byte). */
3540 } else if (hm2 & x) {
3543 E |= s->filter_lut.mblim_lut[L] << 8;
3544 I |= s->filter_lut.lim_lut[L] << 8;
3545 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3547 [0](ptr, ls, E, I, H);
3549 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3550 [0](ptr, ls, E, I, H);
/* Edge only in the bottom half. */
3552 } else if (hm2 & x) {
3553 int L = l[8 << ss_v], H = L >> 4;
3554 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3556 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3557 [0](ptr + 8 * ls, ls, E, I, H);
/* Inner 4px edges (mask[..][3]), offset 4 pixels into the 8px column. */
3565 int L = *l, H = L >> 4;
3566 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3571 E |= s->filter_lut.mblim_lut[L] << 8;
3572 I |= s->filter_lut.lim_lut[L] << 8;
3573 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3575 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3577 } else if (hm23 & x) {
3578 int L = l[8 << ss_v], H = L >> 4;
3579 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3581 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
/* Apply the loop filter to horizontal edges (edges between vertically
 * adjacent blocks) of one plane of a 64x64 superblock — the row-direction
 * counterpart of filter_plane_cols(). mask[y] holds the row-edge bitmasks
 * for mi row y; horizontally adjacent edges are merged into wider filter
 * calls where both bits are set.
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3589 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3590 uint8_t *lvl, uint8_t (*mask)[4],
3591 uint8_t *dst, ptrdiff_t ls)
3593 int y, x, bytesperpixel = s->bytesperpixel;
3596 // filter edges between rows (e.g. ------)
3598 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3599 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3600 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
/* Step two 8px columns at a time so the neighbour bit (x << (1+ss_h))
 * can be inspected for wide/paired filtering. */
3602 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3605 int L = *l, H = L >> 4;
3606 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
/* 16-wide edge when the adjacent column carries the same edge and level. */
3609 if (vmask[0] & (x << (1 + ss_h))) {
3610 av_assert2(l[1 + ss_h] == L);
3611 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3613 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
/* Paired 8/4 edges across the two columns: mix2 with the second level's
 * E/I packed into the high byte. */
3615 } else if (vm & (x << (1 + ss_h))) {
3618 E |= s->filter_lut.mblim_lut[L] << 8;
3619 I |= s->filter_lut.lim_lut[L] << 8;
3620 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3621 [!!(vmask[1] & (x << (1 + ss_h)))]
3622 [1](ptr, ls, E, I, H);
3624 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3625 [1](ptr, ls, E, I, H);
/* Edge only in the second (right) column. */
3627 } else if (vm & (x << (1 + ss_h))) {
3628 int L = l[1 + ss_h], H = L >> 4;
3629 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3631 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3632 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
/* Inner 4px row edges (vmask[3]), 4 lines into the 8px row. */
3637 int L = *l, H = L >> 4;
3638 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3640 if (vm3 & (x << (1 + ss_h))) {
3643 E |= s->filter_lut.mblim_lut[L] << 8;
3644 I |= s->filter_lut.lim_lut[L] << 8;
3645 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3647 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3649 } else if (vm3 & (x << (1 + ss_h))) {
3650 int L = l[1 + ss_h], H = L >> 4;
3651 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3653 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
/* Loop-filter one 64x64 superblock: columns then rows for luma, then the
 * same pair for both chroma planes. Chroma uses lflvl->mask[1] whenever
 * either direction is subsampled (mask index s->ss_h | s->ss_v). */
3666 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3667 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3669 VP9Context *s = ctx->priv_data;
3670 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3671 uint8_t *dst = f->data[0] + yoff;
3672 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3673 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3676 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3677 // if you think of them as acting on a 8x8 block max, we can interleave
3678 // each v/h within the single x loop, but that only works if we work on
3679 // 8 pixel blocks, and we won't always do that (we want at least 16px
3680 // to use SSE2 optimizations, perhaps 32 for AVX2)
/* Luma: mask[0][0] = column edges, mask[0][1] = row edges. */
3682 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3683 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
/* Both chroma planes share masks and (per the stride use) line size. */
3685 for (p = 0; p < 2; p++) {
3686 dst = f->data[1 + p] + uvoff;
3687 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3688 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/* Compute the [start, end) mi-column/row range of tile idx, given the
 * total superblock count n and log2 of the tile count. The << 3 converts
 * superblock units to 8px mi units; FFMIN clamps at the frame edge. */
3692 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3694 int sb_start = ( idx * n) >> log2_n;
3695 int sb_end = ((idx + 1) * n) >> log2_n;
3696 *start = FFMIN(sb_start, n) << 3;
3697 *end = FFMIN(sb_end, n) << 3;
/* Backward-adapt one binary probability *p toward the observed counts
 * ct0 (symbol 0) / ct1 (symbol 1): blend the old probability p1 with the
 * empirical probability p2, weighting by how many events were seen
 * (capped at max_count) scaled by update_factor (out of 256).
 *
 * NOTE(review): the excerpt is missing a few lines (p1 load, early exit
 * on zero counts — TODO confirm against the full source). */
3700 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3701 int max_count, int update_factor)
3703 unsigned ct = ct0 + ct1, p2, p1;
/* Empirical probability of symbol 0, rounded, clipped to [1, 255]. */
3709 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3710 p2 = av_clip(p2, 1, 255);
3711 ct = FFMIN(ct, max_count);
3712 update_factor = FASTDIV(update_factor * ct, max_count);
3714 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3715 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/* End-of-frame backward adaptation of all probability tables in the
 * current frame context, from the symbol counts gathered during decoding
 * (s->counts). Coefficient probs use a faster update factor (112) after
 * key/intra-only frames; everything else adapts with factor 128.
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3718 static void adapt_probs(VP9Context *s)
3721 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3722 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
/* Coefficient probs: [tx size][plane type][ref][band][coef context]. */
3725 for (i = 0; i < 4; i++)
3726 for (j = 0; j < 2; j++)
3727 for (k = 0; k < 2; k++)
3728 for (l = 0; l < 6; l++)
3729 for (m = 0; m < 6; m++) {
3730 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3731 unsigned *e = s->counts.eob[i][j][k][l][m];
3732 unsigned *c = s->counts.coef[i][j][k][l][m];
3734 if (l == 0 && m >= 3) // dc only has 3 pt
3737 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3738 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3739 adapt_prob(&pp[2], c[1], c[2], 24, uf);
/* Key/intra-only frames: inter-related probs are not adapted, only the
 * frame-local copies are stored back. */
3742 if (s->s.h.keyframe || s->s.h.intraonly) {
3743 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3744 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3745 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3746 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
/* skip flag */
3751 for (i = 0; i < 3; i++)
3752 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
/* intra/inter flag */
3755 for (i = 0; i < 4; i++)
3756 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
/* compound prediction mode (only when switchable per block) */
3759 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3760 for (i = 0; i < 5; i++)
3761 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
/* compound reference selection */
3765 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3766 for (i = 0; i < 5; i++)
3767 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3768 s->counts.comp_ref[i][1], 20, 128);
/* single reference selection (two binary decisions) */
3771 if (s->s.h.comppredmode != PRED_COMPREF) {
3772 for (i = 0; i < 5; i++) {
3773 uint8_t *pp = p->single_ref[i];
3774 unsigned (*c)[2] = s->counts.single_ref[i];
3776 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3777 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3781 // block partitioning
3782 for (i = 0; i < 4; i++)
3783 for (j = 0; j < 4; j++) {
3784 uint8_t *pp = p->partition[i][j];
3785 unsigned *c = s->counts.partition[i][j];
3787 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3788 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3789 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* transform size trees (8/16/32), only when tx mode is switchable */
3793 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3794 for (i = 0; i < 2; i++) {
3795 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3797 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3798 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3799 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3800 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3801 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3802 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3806 // interpolation filter
3807 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3808 for (i = 0; i < 4; i++) {
3809 uint8_t *pp = p->filter[i];
3810 unsigned *c = s->counts.filter[i];
3812 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3813 adapt_prob(&pp[1], c[1], c[2], 20, 128);
/* inter prediction mode tree */
3818 for (i = 0; i < 7; i++) {
3819 uint8_t *pp = p->mv_mode[i];
3820 unsigned *c = s->counts.mv_mode[i];
3822 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3823 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3824 adapt_prob(&pp[2], c[1], c[3], 20, 128);
/* MV joint (which of x/y are non-zero) */
3829 uint8_t *pp = p->mv_joint;
3830 unsigned *c = s->counts.mv_joint;
3832 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3833 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3834 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* per-component MV probs: sign, class tree, class0, bits, fractional,
 * and (only when enabled in the header) high-precision bits */
3838 for (i = 0; i < 2; i++) {
3840 unsigned *c, (*c2)[2], sum;
3842 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3843 s->counts.mv_comp[i].sign[1], 20, 128);
3845 pp = p->mv_comp[i].classes;
3846 c = s->counts.mv_comp[i].classes;
3847 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3848 adapt_prob(&pp[0], c[0], sum, 20, 128);
3850 adapt_prob(&pp[1], c[1], sum, 20, 128);
3852 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3853 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3855 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3856 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3858 adapt_prob(&pp[6], c[6], sum, 20, 128);
3859 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3860 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3861 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3863 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3864 s->counts.mv_comp[i].class0[1], 20, 128);
3865 pp = p->mv_comp[i].bits;
3866 c2 = s->counts.mv_comp[i].bits;
3867 for (j = 0; j < 10; j++)
3868 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3870 for (j = 0; j < 2; j++) {
3871 pp = p->mv_comp[i].class0_fp[j];
3872 c = s->counts.mv_comp[i].class0_fp[j];
3873 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3874 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3875 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3877 pp = p->mv_comp[i].fp;
3878 c = s->counts.mv_comp[i].fp;
3879 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3880 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3881 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3883 if (s->s.h.highprecisionmvs) {
3884 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3885 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3886 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3887 s->counts.mv_comp[i].hp[1], 20, 128);
/* luma intra mode tree (peels modes off `sum` one at a time) */
3892 for (i = 0; i < 4; i++) {
3893 uint8_t *pp = p->y_mode[i];
3894 unsigned *c = s->counts.y_mode[i], sum, s2;
3896 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3897 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3898 sum -= c[TM_VP8_PRED];
3899 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3900 sum -= c[VERT_PRED];
3901 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3902 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3904 adapt_prob(&pp[3], s2, sum, 20, 128);
3906 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3907 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3908 sum -= c[DIAG_DOWN_LEFT_PRED];
3909 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3910 sum -= c[VERT_LEFT_PRED];
3911 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3912 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/* chroma intra mode tree — same structure, conditioned on the luma mode */
3916 for (i = 0; i < 10; i++) {
3917 uint8_t *pp = p->uv_mode[i];
3918 unsigned *c = s->counts.uv_mode[i], sum, s2;
3920 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3921 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3922 sum -= c[TM_VP8_PRED];
3923 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3924 sum -= c[VERT_PRED];
3925 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3926 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3928 adapt_prob(&pp[3], s2, sum, 20, 128);
3930 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3931 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3932 sum -= c[DIAG_DOWN_LEFT_PRED];
3933 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3934 sum -= c[VERT_LEFT_PRED];
3935 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3936 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/* Free the per-stream scratch allocations (intra prediction line buffer,
 * block structure array, coefficient block base). av_freep() also NULLs
 * the pointers, so this is safe to call repeatedly. */
3940 static void free_buffers(VP9Context *s)
3942 av_freep(&s->intra_pred_data[0]);
3943 av_freep(&s->b_base);
3944 av_freep(&s->block_base);
/* Codec close callback: release the three internal frames (cur/segmap/
 * mvpair), all 8 reference slots and their next_refs counterparts, then
 * free the AVFrame shells themselves. */
3947 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3949 VP9Context *s = ctx->priv_data;
3952 for (i = 0; i < 3; i++) {
3953 if (s->s.frames[i].tf.f->buf[0])
3954 vp9_unref_frame(ctx, &s->s.frames[i]);
3955 av_frame_free(&s->s.frames[i].tf.f);
3957 for (i = 0; i < 8; i++) {
3958 if (s->s.refs[i].f->buf[0])
3959 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3960 av_frame_free(&s->s.refs[i].f);
3961 if (s->next_refs[i].f->buf[0])
3962 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3963 av_frame_free(&s->next_refs[i].f);
/* Top-level per-packet decode entry point.
 *
 * Visible flow: parse the frame header; for "show existing frame" packets
 * (header returns 0) just re-output the referenced frame; otherwise manage
 * the segmap/mvpair/current internal frames, set up next_refs, run the
 * hwaccel path or the software tile/sbrow decode loop (with optional
 * two-pass mode for frame threading), loop-filter each sbrow, report
 * progress, rotate the reference slots, and output the frame if visible.
 *
 * NOTE(review): excerpt is missing lines; comments cover the visible code. */
3973 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3974 int *got_frame, AVPacket *pkt)
3976 const uint8_t *data = pkt->data;
3977 int size = pkt->size;
3978 VP9Context *s = ctx->priv_data;
3979 int res, tile_row, tile_col, i, ref, row, col;
/* The previous segmentation map can be reused only if segmentation is off
 * or its map is not being updated this frame. */
3980 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3981 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3982 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3986 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
/* res == 0: "show existing frame" — output refs[ref] directly, no decode. */
3988 } else if (res == 0) {
3989 if (!s->s.refs[ref].f->buf[0]) {
3990 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3991 return AVERROR_INVALIDDATA;
3993 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3995 ((AVFrame *)frame)->pkt_pts = pkt->pts;
3996 ((AVFrame *)frame)->pkt_dts = pkt->dts;
/* Even without decoding, next_refs must track the current refs. */
3997 for (i = 0; i < 8; i++) {
3998 if (s->next_refs[i].f->buf[0])
3999 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4000 if (s->s.refs[i].f->buf[0] &&
4001 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
/* Rotate the previous decoded frame into the segmap/mvpair history slots
 * (skipped on key/intra-only/error-resilient frames). */
4010 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4011 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4012 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4013 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4014 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4017 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4018 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4019 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4020 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4022 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4023 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4024 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4026 f = s->s.frames[CUR_FRAME].tf.f;
4027 f->key_frame = s->s.h.keyframe;
4028 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4029 ls_y = f->linesize[0];
4030 ls_uv =f->linesize[1];
/* Drop a stale segmap whose dimensions no longer match the current frame. */
4032 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4033 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4034 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4035 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
/* Build next_refs: slots selected by refreshrefmask get the new frame,
 * the rest keep their current reference. */
4039 for (i = 0; i < 8; i++) {
4040 if (s->next_refs[i].f->buf[0])
4041 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4042 if (s->s.h.refreshrefmask & (1 << i)) {
4043 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4044 } else if (s->s.refs[i].f->buf[0]) {
4045 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
/* Hardware acceleration path: hand the whole packet to the hwaccel. */
4052 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4055 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4058 res = ctx->hwaccel->end_frame(ctx);
4064 // main tile decode loop
4065 bytesperpixel = s->bytesperpixel;
/* Reset the "above" contexts for the whole frame width. */
4066 memset(s->above_partition_ctx, 0, s->cols);
4067 memset(s->above_skip_ctx, 0, s->cols);
4068 if (s->s.h.keyframe || s->s.h.intraonly) {
4069 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4071 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4073 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4074 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4075 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4076 memset(s->above_segpred_ctx, 0, s->cols);
/* Two-pass decoding when frame threading needs the entropy context
 * resolved before reconstruction (refreshctx without parallelmode). */
4077 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4078 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4079 if ((res = update_block_buffers(ctx)) < 0) {
4080 av_log(ctx, AV_LOG_ERROR,
4081 "Failed to allocate block buffers\n");
/* Parallel mode: commit the frame context up-front so other threads can
 * start; also signals setup completion to the frame-threading framework. */
4084 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4087 for (i = 0; i < 4; i++) {
4088 for (j = 0; j < 2; j++)
4089 for (k = 0; k < 2; k++)
4090 for (l = 0; l < 6; l++)
4091 for (m = 0; m < 6; m++)
4092 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4093 s->prob.coef[i][j][k][l][m], 3);
4094 if (s->s.h.txfmmode == i)
4097 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4098 ff_thread_finish_setup(ctx);
4099 } else if (!s->s.h.refreshctx) {
4100 ff_thread_finish_setup(ctx);
/* Rewind the coefficient/EOB cursors for this pass. */
4106 s->block = s->block_base;
4107 s->uvblock[0] = s->uvblock_base[0];
4108 s->uvblock[1] = s->uvblock_base[1];
4109 s->eob = s->eob_base;
4110 s->uveob[0] = s->uveob_base[0];
4111 s->uveob[1] = s->uveob_base[1];
/* Per-tile setup: each tile column gets its own range decoder (c_b[]);
 * the last tile uses the remaining packet bytes, others a 32-bit size. */
4113 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4114 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4115 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4117 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4120 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4121 tile_row == s->s.h.tiling.tile_rows - 1) {
4124 tile_size = AV_RB32(data);
4128 if (tile_size > size) {
4129 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4130 return AVERROR_INVALIDDATA;
4132 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4133 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4134 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4135 return AVERROR_INVALIDDATA;
/* Decode superblock rows (8 mi units = 64px tall each). */
4142 for (row = s->tile_row_start; row < s->tile_row_end;
4143 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4144 struct VP9Filter *lflvl_ptr = s->lflvl;
4145 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4147 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4148 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4149 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
/* Reset the "left" contexts at the start of each tile column. */
4152 memset(s->left_partition_ctx, 0, 8);
4153 memset(s->left_skip_ctx, 0, 8);
4154 if (s->s.h.keyframe || s->s.h.intraonly) {
4155 memset(s->left_mode_ctx, DC_PRED, 16);
4157 memset(s->left_mode_ctx, NEARESTMV, 8);
4159 memset(s->left_y_nnz_ctx, 0, 16);
4160 memset(s->left_uv_nnz_ctx, 0, 32);
4161 memset(s->left_segpred_ctx, 0, 8);
/* Swap in this tile's range decoder state. */
4163 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4166 for (col = s->tile_col_start;
4167 col < s->tile_col_end;
4168 col += 8, yoff2 += 64 * bytesperpixel,
4169 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4170 // FIXME integrate with lf code (i.e. zero after each
4171 // use, similar to invtxfm coefficients, or similar)
4173 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
/* Second pass replays stored block structure; first pass parses. */
4177 decode_sb_mem(ctx, row, col, lflvl_ptr,
4178 yoff2, uvoff2, BL_64X64);
4180 decode_sb(ctx, row, col, lflvl_ptr,
4181 yoff2, uvoff2, BL_64X64);
/* Save the range decoder state back for the next sbrow of this tile. */
4185 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4193 // backup pre-loopfilter reconstruction data for intra
4194 // prediction of next row of sb64s
4195 if (row + 8 < s->rows) {
4196 memcpy(s->intra_pred_data[0],
4197 f->data[0] + yoff + 63 * ls_y,
4198 8 * s->cols * bytesperpixel);
4199 memcpy(s->intra_pred_data[1],
4200 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4201 8 * s->cols * bytesperpixel >> s->ss_h);
4202 memcpy(s->intra_pred_data[2],
4203 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4204 8 * s->cols * bytesperpixel >> s->ss_h);
4207 // loopfilter one row
4208 if (s->s.h.filter.level) {
4211 lflvl_ptr = s->lflvl;
4212 for (col = 0; col < s->cols;
4213 col += 8, yoff2 += 64 * bytesperpixel,
4214 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4215 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4219 // FIXME maybe we can make this more finegrained by running the
4220 // loopfilter per-block instead of after each sbrow
4221 // In fact that would also make intra pred left preparation easier?
4222 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
/* After pass 1 of a two-pass decode: adapt probabilities and release
 * other threads before running pass 2. */
4226 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4228 ff_thread_finish_setup(ctx);
4230 } while (s->pass++ == 1);
4231 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
/* Commit next_refs into the active reference slots. */
4235 for (i = 0; i < 8; i++) {
4236 if (s->s.refs[i].f->buf[0])
4237 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4238 if (s->next_refs[i].f->buf[0] &&
4239 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
/* Output the decoded frame unless the header marked it invisible. */
4243 if (!s->s.h.invisible) {
4244 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
4252 static void vp9_decode_flush(AVCodecContext *ctx)
4254 VP9Context *s = ctx->priv_data;
4257 for (i = 0; i < 3; i++)
4258 vp9_unref_frame(ctx, &s->s.frames[i]);
4259 for (i = 0; i < 8; i++)
4260 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4263 static int init_frames(AVCodecContext *ctx)
4265 VP9Context *s = ctx->priv_data;
4268 for (i = 0; i < 3; i++) {
4269 s->s.frames[i].tf.f = av_frame_alloc();
4270 if (!s->s.frames[i].tf.f) {
4271 vp9_decode_free(ctx);
4272 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4273 return AVERROR(ENOMEM);
4276 for (i = 0; i < 8; i++) {
4277 s->s.refs[i].f = av_frame_alloc();
4278 s->next_refs[i].f = av_frame_alloc();
4279 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4280 vp9_decode_free(ctx);
4281 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4282 return AVERROR(ENOMEM);
4289 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4291 VP9Context *s = ctx->priv_data;
4293 ctx->internal->allocate_progress = 1;
4295 s->s.h.filter.sharpness = -1;
4297 return init_frames(ctx);
4301 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4303 return init_frames(avctx);
4306 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
/* Frame-threading hand-off: copy all decoder state the next frame on
 * `dst` depends on from the just-finished `src` thread context.
 * Returns 0, or a negative AVERROR propagated from the frame
 * re-referencing helpers. */
4309 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
/* Re-reference the three internal frames from src into dst, dropping
 * whatever dst still held in each slot first. */
4311 for (i = 0; i < 3; i++) {
4312 if (s->s.frames[i].tf.f->buf[0])
4313 vp9_unref_frame(dst, &s->s.frames[i]);
4314 if (ssrc->s.frames[i].tf.f->buf[0]) {
4315 if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
/* Refresh the 8 reference slots. Note: dst's s.refs[] are taken from
 * src's next_refs[] — i.e. the reference state *after* src's frame
 * completed, not src's own input references. */
4319 for (i = 0; i < 8; i++) {
4320 if (s->s.refs[i].f->buf[0])
4321 ff_thread_release_buffer(dst, &s->s.refs[i]);
4322 if (ssrc->next_refs[i].f->buf[0]) {
4323 if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
/* Scalar header/stream state the next frame header parse relies on
 * (visibility, frame type, chroma subsampling, segmentation flags,
 * pixel layout). */
4328 s->s.h.invisible = ssrc->s.h.invisible;
4329 s->s.h.keyframe = ssrc->s.h.keyframe;
4330 s->s.h.intraonly = ssrc->s.h.intraonly;
4331 s->ss_v = ssrc->ss_v;
4332 s->ss_h = ssrc->ss_h;
4333 s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4334 s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4335 s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4336 s->bytesperpixel = ssrc->bytesperpixel;
4337 s->gf_fmt = ssrc->gf_fmt;
4341 s->bpp_index = ssrc->bpp_index;
4342 s->pix_fmt = ssrc->pix_fmt;
/* Bulk-copy the adapted probability context, loop-filter deltas and
 * per-segment features. */
4343 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4344 memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4345 memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4346 sizeof(s->s.h.segmentation.feat));
4352 AVCodec ff_vp9_decoder = {
4354 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4355 .type = AVMEDIA_TYPE_VIDEO,
4356 .id = AV_CODEC_ID_VP9,
4357 .priv_data_size = sizeof(VP9Context),
4358 .init = vp9_decode_init,
4359 .close = vp9_decode_free,
4360 .decode = vp9_decode_frame,
4361 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4362 .flush = vp9_decode_flush,
4363 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4364 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4365 .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),