2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
37 #define VP9_SYNCCODE 0x498342
// Fragment of the per-superblock loop-filter mask structure (the enclosing
// struct declaration appears to have been elided from this extraction):
// one bitmask per plane (y/uv), per direction (col/row), per row, per
// filter size (16/8/4/inner-4).
41 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block decode state: segment id, intra/inter decision, compound-pred
// flag, reference indices, per-sub-block modes, chroma mode and skip flag.
45 typedef struct VP9Block {
46 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
47 enum FilterMode filter;
// one motion vector per 8x8 sub-block index, per reference (for compound)
48 VP56mv mv[4 /* b_idx */][2 /* ref */];
50 enum TxfmMode tx, uvtx;
52 enum BlockPartition bp;
// Main per-decoder-instance state. NOTE(review): this extraction elides
// interior lines (the embedded numbers are the original file's line
// numbers), so several nested struct declarations below are missing their
// opening/closing braces; code left byte-identical.
55 typedef struct VP9Context {
// current block position, in 8x8 units; *7 variants are (pos & 7)
66 int row, row7, col, col7;
68 ptrdiff_t y_stride, uv_stride;
// bitstream bit depth bookkeeping; bytesperpixel = (bpp + 7) >> 3
71 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
72 uint8_t last_keyframe;
73 // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
74 // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
75 // and are therefore per-stream. pix_fmt represents the value in the header
76 // of the currently processed frame.
78 enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
79 unsigned sb_cols, sb_rows, rows, cols;
80 ThreadFrame next_refs[8];
84 uint8_t mblim_lut[64];
// tile bounds of the tile currently being decoded, in block units
86 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
// NOTE(review): the two coef[] fields below have different inner sizes
// (3 vs 11 probabilities); they presumably belong to two different nested
// structs (saved frame contexts vs. the current frame's working probs)
// whose declarations were elided here — confirm against the full file.
89 uint8_t coef[4][2][2][6][6][3];
93 uint8_t coef[4][2][2][6][6][11];
// symbol-occurrence counters used for backward probability adaptation
96 unsigned y_mode[4][10];
97 unsigned uv_mode[10][10];
98 unsigned filter[4][3];
99 unsigned mv_mode[7][4];
100 unsigned intra[4][2];
102 unsigned single_ref[5][2][2];
103 unsigned comp_ref[5][2];
104 unsigned tx32p[2][4];
105 unsigned tx16p[2][3];
108 unsigned mv_joint[4];
111 unsigned classes[11];
113 unsigned bits[10][2];
114 unsigned class0_fp[2][4];
116 unsigned class0_hp[2];
119 unsigned partition[4][4][4];
120 unsigned coef[4][2][2][6][6][3];
121 unsigned eob[4][2][2][6][6][2];
124 // contextual (left/above) cache
125 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
126 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
127 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
128 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
129 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
130 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
131 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
132 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
133 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
134 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
135 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
136 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// "above" context rows span the whole frame width; allocated in update_size()
137 uint8_t *above_partition_ctx;
138 uint8_t *above_mode_ctx;
139 // FIXME maybe merge some of the below in a flags field?
140 uint8_t *above_y_nnz_ctx;
141 uint8_t *above_uv_nnz_ctx[2];
142 uint8_t *above_skip_ctx; // 1bit
143 uint8_t *above_txfm_ctx; // 2bit
144 uint8_t *above_segpred_ctx; // 1bit
145 uint8_t *above_intra_ctx; // 1bit
146 uint8_t *above_comp_ctx; // 1bit
147 uint8_t *above_ref_ctx; // 2bit
148 uint8_t *above_filter_ctx;
149 VP56mv (*above_mv_ctx)[2];
// whole-frame cache: bottom row of each superblock row, for intra prediction
152 uint8_t *intra_pred_data[3];
153 struct VP9Filter *lflvl;
154 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
156 // block reconstruction intermediates
157 int block_alloc_using_2pass;
158 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
159 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
// motion-vector clamping range for the current block (see clamp_mv())
160 struct { int x, y; } min_mv, max_mv;
161 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
162 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
// per-reference 14-bit fixed-point scaling factors and integer step sizes
// for scaled-reference motion compensation (set in decode_frame_header())
163 uint16_t mvscale[3][2];
164 uint8_t mvstep[3][2];
// Block width/height per block size. Judging by the values (64x64 -> 16
// and 8 respectively), the first sub-table is in 4-pixel units and the
// second in 8-pixel units. NOTE(review): the table's closing lines appear
// elided from this extraction.
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
169 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
172 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
// Release all buffers owned by a VP9Frame: the threaded frame buffer, the
// extradata buffer (which backs segmentation_map and the mv array), and any
// hwaccel private buffer. Pointers into freed buffers are NULLed so the
// frame can be safely re-allocated or unref'd again.
177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
179 ff_thread_release_buffer(ctx, &f->tf);
180 av_buffer_unref(&f->extradata);
181 av_buffer_unref(&f->hwaccel_priv_buf);
182 f->segmentation_map = NULL;
183 f->hwaccel_picture_private = NULL;
// Allocate a VP9Frame: the picture buffer itself, one extradata buffer
// holding both the per-8x8-block segmentation map (sz bytes) and the
// mv-ref array right after it, plus the hwaccel private buffer if needed.
// On any failure everything is rolled back via vp9_unref_frame() and
// AVERROR(ENOMEM) is returned (the failure labels are elided here).
186 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
188 VP9Context *s = ctx->priv_data;
191 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
// 64 blocks of 8x8 per superblock: one segmentation-map byte per block
193 sz = 64 * s->sb_cols * s->sb_rows;
194 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
198 f->segmentation_map = f->extradata->data;
199 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
202 const AVHWAccel *hwaccel = ctx->hwaccel;
203 av_assert0(!f->hwaccel_picture_private);
204 if (hwaccel->frame_priv_data_size) {
205 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
206 if (!f->hwaccel_priv_buf)
208 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
// common error path: undo partial allocation
215 vp9_unref_frame(ctx, f);
216 return AVERROR(ENOMEM);
// Make dst an additional reference to src: ref the threaded frame and the
// shared extradata buffer, then copy the derived pointers/flags. The
// hwaccel private buffer is also ref'd when present. On failure, dst is
// cleaned up with vp9_unref_frame() and ENOMEM returned (error labels
// elided in this extraction).
219 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
223 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
225 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
// pointers below alias into the shared extradata buffer, so a plain copy is safe
229 dst->segmentation_map = src->segmentation_map;
231 dst->uses_2pass = src->uses_2pass;
233 if (src->hwaccel_picture_private) {
234 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
235 if (!dst->hwaccel_priv_buf)
237 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
243 vp9_unref_frame(ctx, dst);
244 return AVERROR(ENOMEM);
// (Re)configure the decoder for a new frame size / pixel format:
// negotiate the output format (offering hwaccel formats for 4:2:0 8-bit),
// recompute superblock/block grid dimensions, and (re)allocate the
// frame-width "above" context arrays, intra-pred cache and loop-filter
// level array from a single malloc carved up by the assign() macro.
// Returns 0 on success or a negative AVERROR. NOTE(review): several
// early-return and brace lines are elided in this extraction.
247 static int update_size(AVCodecContext *ctx, int w, int h)
249 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
250 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
251 VP9Context *s = ctx->priv_data;
253 int bytesperpixel = s->bytesperpixel, res, cols, rows;
255 av_assert0(w > 0 && h > 0);
// only renegotiate the format when size or format actually changed
257 if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
258 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
// hwaccels are only offered for 8-bit 4:2:0
261 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
262 #if CONFIG_VP9_DXVA2_HWACCEL
263 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
265 #if CONFIG_VP9_D3D11VA_HWACCEL
266 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
268 #if CONFIG_VP9_VAAPI_HWACCEL
269 *fmtp++ = AV_PIX_FMT_VAAPI;
// software format is always the last candidate before the terminator
273 *fmtp++ = s->pix_fmt;
274 *fmtp = AV_PIX_FMT_NONE;
276 res = ff_thread_get_format(ctx, pix_fmts);
281 s->gf_fmt = s->pix_fmt;
// skip re-allocation if the grid and format are unchanged
289 if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
292 s->last_fmt = s->pix_fmt;
// superblocks are 64x64, blocks 8x8
293 s->sb_cols = (w + 63) >> 6;
294 s->sb_rows = (h + 63) >> 6;
295 s->cols = (w + 7) >> 3;
296 s->rows = (h + 7) >> 3;
298 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
299 av_freep(&s->intra_pred_data[0]);
300 // FIXME we slightly over-allocate here for subsampled chroma, but a little
301 // bit of padding shouldn't affect performance...
302 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
303 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
305 return AVERROR(ENOMEM);
// carve the single allocation into the per-column context arrays;
// sizes below are per superblock column
306 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
307 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
308 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
309 assign(s->above_y_nnz_ctx, uint8_t *, 16);
310 assign(s->above_mode_ctx, uint8_t *, 16);
311 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
312 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
313 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
314 assign(s->above_partition_ctx, uint8_t *, 8);
315 assign(s->above_skip_ctx, uint8_t *, 8);
316 assign(s->above_txfm_ctx, uint8_t *, 8);
317 assign(s->above_segpred_ctx, uint8_t *, 8);
318 assign(s->above_intra_ctx, uint8_t *, 8);
319 assign(s->above_comp_ctx, uint8_t *, 8);
320 assign(s->above_ref_ctx, uint8_t *, 8);
321 assign(s->above_filter_ctx, uint8_t *, 8);
322 assign(s->lflvl, struct VP9Filter *, 1);
325 // these will be re-allocated a little later
326 av_freep(&s->b_base);
327 av_freep(&s->block_base);
// bit depth changed: re-init the bit-depth-dependent DSP function tables
329 if (s->bpp != s->last_bpp) {
330 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
331 ff_videodsp_init(&s->vdsp, s->bpp);
332 s->last_bpp = s->bpp;
// (Re)allocate the coefficient/EOB scratch buffers used during block
// reconstruction. In 2-pass (frame-threaded) mode the buffers must cover
// every superblock of the frame (pass 1 stores, pass 2 consumes); in
// 1-pass mode a single superblock's worth suffices and is reused.
// Returns 0 or AVERROR(ENOMEM).
338 static int update_block_buffers(AVCodecContext *ctx)
340 VP9Context *s = ctx->priv_data;
341 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
// nothing to do if buffers exist and the 2-pass mode didn't change
343 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
347 av_free(s->block_base);
// chroma sizes shrink with each active subsampling dimension
348 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
349 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
350 if (s->s.frames[CUR_FRAME].uses_2pass) {
351 int sbs = s->sb_cols * s->sb_rows;
// one VP9Block per 8x8 block, plus coef+eob space for every superblock
353 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
354 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
355 16 * 16 + 2 * chroma_eobs) * sbs);
356 if (!s->b_base || !s->block_base)
357 return AVERROR(ENOMEM);
// carve block_base into luma coefs, two chroma coef planes, then eobs
358 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
359 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
360 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
361 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
362 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
// 1-pass: single reusable superblock's worth of scratch
364 s->b_base = av_malloc(sizeof(VP9Block));
365 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
366 16 * 16 + 2 * chroma_eobs);
367 if (!s->b_base || !s->block_base)
368 return AVERROR(ENOMEM);
369 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
370 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
371 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
372 s->uveob_base[0] = s->eob_base + 16 * 16;
373 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
// remember which layout we allocated so a mode switch triggers re-alloc
375 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
380 // for some reason the sign bit is at the end, not the start, of a bit sequence
// Read an n-bit magnitude followed by a sign bit; returns the signed value.
381 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
383 int v = get_bits(gb, n);
384 return get_bits1(gb) ? -v : v;
// Inverse of the "recentering" transform used by the subexponential prob
// update: maps a coded offset v back to a value near m. For v <= 2*m, odd v
// decodes below m and even v above; larger v passes through unchanged.
387 static av_always_inline int inv_recenter_nonneg(int v, int m)
389 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
392 // differential forward probability updates
// Decode a subexponentially-coded probability delta from the range coder
// and apply it to the current probability p (1..255), returning the new
// probability. NOTE(review): inv_map_table is declared with 255 entries but
// its final initializer line appears elided from this extraction.
393 static int update_prob(VP56RangeCoder *c, int p)
395 static const int inv_map_table[255] = {
396 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
397 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
398 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
399 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
400 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
401 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
402 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
403 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
404 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
405 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
406 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
407 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
408 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
409 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
410 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
411 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
412 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
413 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
418 /* This code is trying to do a differential probability update. For a
419 * current probability A in the range [1, 255], the difference to a new
420 * probability of any value can be expressed differentially as 1-A,255-A
421 * where some part of this (absolute range) exists both in positive as
422 * well as the negative part, whereas another part only exists in one
423 * half. We're trying to code this shared part differentially, i.e.
424 * times two where the value of the lowest bit specifies the sign, and
425 * the single part is then coded on top of this. This absolute difference
426 * then again has a value of [0,254], but a bigger value in this range
427 * indicates that we're further away from the original value A, so we
428 * can code this as a VLC code, since higher values are increasingly
429 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
430 * updates vs. the 'fine, exact' updates further down the range, which
431 * adds one extra dimension to this differential update model. */
// four VLC size classes: 4+4+5-bit offsets, then a 7-bit escape class
432 if (!vp8_rac_get(c)) {
434 d = vp8_rac_get_uint(c, 4) + 0;
435 } else if (!vp8_rac_get(c)) {
436 d = vp8_rac_get_uint(c, 4) + 16;
437 } else if (!vp8_rac_get(c)) {
438 d = vp8_rac_get_uint(c, 5) + 32;
440 d = vp8_rac_get_uint(c, 7);
// escape class carries one extra low bit: d in [65, 254]
442 d = (d << 1) - 65 + vp8_rac_get(c);
444 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
// mirror the update around 128 so the result stays within [1, 255]
447 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
448 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse the color-config portion of the frame header: bit depth (profiles
// 2/3 only), colorspace, and for non-RGB streams the color range and
// chroma subsampling, setting s->bpp/bytesperpixel/ss_h/ss_v/pix_fmt and
// the AVCodecContext color properties. Returns 0 or AVERROR_INVALIDDATA
// on reserved-bit / unsupported-combination errors.
451 static int read_colorspace_details(AVCodecContext *ctx)
453 static const enum AVColorSpace colorspaces[8] = {
454 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
455 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
457 VP9Context *s = ctx->priv_data;
458 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
461 s->bpp = 8 + bits * 2;
462 s->bytesperpixel = (7 + s->bpp) >> 3;
463 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
464 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
465 static const enum AVPixelFormat pix_fmt_rgb[3] = {
466 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
// RGB is always 4:4:4 full range
468 s->ss_h = s->ss_v = 0;
469 ctx->color_range = AVCOL_RANGE_JPEG;
470 s->pix_fmt = pix_fmt_rgb[bits];
471 if (ctx->profile & 1) {
472 if (get_bits1(&s->gb)) {
473 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
474 return AVERROR_INVALIDDATA;
477 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
479 return AVERROR_INVALIDDATA;
// YUV: pick the pixel format from bit depth and subsampling
482 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
483 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
484 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
485 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
486 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
487 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
488 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
490 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
// odd profiles (1/3) signal subsampling explicitly; even ones are 4:2:0
491 if (ctx->profile & 1) {
492 s->ss_h = get_bits1(&s->gb);
493 s->ss_v = get_bits1(&s->gb);
494 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
495 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
496 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
498 return AVERROR_INVALIDDATA;
499 } else if (get_bits1(&s->gb)) {
500 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
502 return AVERROR_INVALIDDATA;
505 s->ss_h = s->ss_v = 1;
506 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
// Parse the complete VP9 frame header: the uncompressed part (frame
// marker, profile, frame type, size, loop-filter, quantizer, segmentation
// and tiling fields) followed by the range-coded "compressed" header that
// carries the forward probability updates for this frame. On success
// returns the total header size in bytes ((data2 - data) + size2); on
// error a negative AVERROR. NOTE(review): this extraction elides many
// interior lines (else branches, goto/error lines, closing braces); the
// original line numbers embedded at the start of each line show the gaps.
513 static int decode_frame_header(AVCodecContext *ctx,
514 const uint8_t *data, int size, int *ref)
516 VP9Context *s = ctx->priv_data;
517 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
519 const uint8_t *data2;
522 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
523 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
526 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
527 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
528 return AVERROR_INVALIDDATA;
// profile is 2 bits, plus a third bit reserved for profile > 3
530 ctx->profile = get_bits1(&s->gb);
531 ctx->profile |= get_bits1(&s->gb) << 1;
532 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
533 if (ctx->profile > 3) {
534 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
535 return AVERROR_INVALIDDATA;
537 s->s.h.profile = ctx->profile;
// show_existing_frame: directly display a previously decoded reference
538 if (get_bits1(&s->gb)) {
539 *ref = get_bits(&s->gb, 3);
542 s->last_keyframe = s->s.h.keyframe;
543 s->s.h.keyframe = !get_bits1(&s->gb);
544 last_invisible = s->s.h.invisible;
545 s->s.h.invisible = !get_bits1(&s->gb);
546 s->s.h.errorres = get_bits1(&s->gb);
547 s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
548 if (s->s.h.keyframe) {
549 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
550 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
551 return AVERROR_INVALIDDATA;
553 if ((res = read_colorspace_details(ctx)) < 0)
555 // for profile 1, here follows the subsampling bits
556 s->s.h.refreshrefmask = 0xff;
557 w = get_bits(&s->gb, 16) + 1;
558 h = get_bits(&s->gb, 16) + 1;
559 if (get_bits1(&s->gb)) // display size
560 skip_bits(&s->gb, 32);
// non-keyframe path: intra-only or inter frame
562 s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
563 s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
564 if (s->s.h.intraonly) {
565 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
566 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
567 return AVERROR_INVALIDDATA;
569 if (ctx->profile >= 1) {
570 if ((res = read_colorspace_details(ctx)) < 0)
// profile-0 intra-only frames are implicitly 8-bit 4:2:0 BT.470BG
573 s->ss_h = s->ss_v = 1;
576 s->bytesperpixel = 1;
577 s->pix_fmt = AV_PIX_FMT_YUV420P;
578 ctx->colorspace = AVCOL_SPC_BT470BG;
579 ctx->color_range = AVCOL_RANGE_JPEG;
581 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
582 w = get_bits(&s->gb, 16) + 1;
583 h = get_bits(&s->gb, 16) + 1;
584 if (get_bits1(&s->gb)) // display size
585 skip_bits(&s->gb, 32);
// inter frame: refresh mask, three reference slots with sign biases
587 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
588 s->s.h.refidx[0] = get_bits(&s->gb, 3);
589 s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
590 s->s.h.refidx[1] = get_bits(&s->gb, 3);
591 s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
592 s->s.h.refidx[2] = get_bits(&s->gb, 3);
593 s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
594 if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
595 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
596 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
597 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
598 return AVERROR_INVALIDDATA;
// frame size may be inherited from one of the references or coded explicitly
600 if (get_bits1(&s->gb)) {
601 w = s->s.refs[s->s.h.refidx[0]].f->width;
602 h = s->s.refs[s->s.h.refidx[0]].f->height;
603 } else if (get_bits1(&s->gb)) {
604 w = s->s.refs[s->s.h.refidx[1]].f->width;
605 h = s->s.refs[s->s.h.refidx[1]].f->height;
606 } else if (get_bits1(&s->gb)) {
607 w = s->s.refs[s->s.h.refidx[2]].f->width;
608 h = s->s.refs[s->s.h.refidx[2]].f->height;
610 w = get_bits(&s->gb, 16) + 1;
611 h = get_bits(&s->gb, 16) + 1;
613 // Note that in this code, "CUR_FRAME" is actually before we
614 // have formally allocated a frame, and thus actually represents
616 s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
617 s->s.frames[CUR_FRAME].tf.f->height == h;
618 if (get_bits1(&s->gb)) // display size
619 skip_bits(&s->gb, 32);
620 s->s.h.highprecisionmvs = get_bits1(&s->gb);
621 s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction is only allowed when the references disagree in
// sign bias; the odd one out becomes the fixed compound reference
623 s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
624 s->s.h.signbias[0] != s->s.h.signbias[2];
625 if (s->s.h.allowcompinter) {
626 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
627 s->s.h.fixcompref = 2;
628 s->s.h.varcompref[0] = 0;
629 s->s.h.varcompref[1] = 1;
630 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
631 s->s.h.fixcompref = 1;
632 s->s.h.varcompref[0] = 0;
633 s->s.h.varcompref[1] = 2;
635 s->s.h.fixcompref = 0;
636 s->s.h.varcompref[0] = 1;
637 s->s.h.varcompref[1] = 2;
642 s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
643 s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
644 s->s.h.framectxid = c = get_bits(&s->gb, 2);
645 if (s->s.h.keyframe || s->s.h.intraonly)
646 s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes
648 /* loopfilter header data */
649 if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
650 // reset loopfilter defaults
651 s->s.h.lf_delta.ref[0] = 1;
652 s->s.h.lf_delta.ref[1] = 0;
653 s->s.h.lf_delta.ref[2] = -1;
654 s->s.h.lf_delta.ref[3] = -1;
655 s->s.h.lf_delta.mode[0] = 0;
656 s->s.h.lf_delta.mode[1] = 0;
657 memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
659 s->s.h.filter.level = get_bits(&s->gb, 6);
660 sharp = get_bits(&s->gb, 3);
661 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
662 // the old cache values since they are still valid
663 if (s->s.h.filter.sharpness != sharp)
664 memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
665 s->s.h.filter.sharpness = sharp;
666 if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
667 if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
668 for (i = 0; i < 4; i++)
669 if (get_bits1(&s->gb))
670 s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
671 for (i = 0; i < 2; i++)
672 if (get_bits1(&s->gb))
673 s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
677 /* quantization header data */
678 s->s.h.yac_qi = get_bits(&s->gb, 8);
679 s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
680 s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
681 s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
682 s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
683 s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
685 ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
687 /* segmentation header info */
688 if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
689 if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
690 for (i = 0; i < 7; i++)
691 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
692 get_bits(&s->gb, 8) : 255;
693 if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
694 for (i = 0; i < 3; i++)
695 s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
696 get_bits(&s->gb, 8) : 255;
// per-segment feature data: quantizer, loop-filter, reference and skip
700 if (get_bits1(&s->gb)) {
701 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
702 for (i = 0; i < 8; i++) {
703 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
704 s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
705 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
706 s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
707 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
708 s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
709 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
714 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
715 for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
716 int qyac, qydc, quvac, quvdc, lflvl, sh;
718 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
719 if (s->s.h.segmentation.absolute_vals)
720 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
722 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
724 qyac = s->s.h.yac_qi;
726 qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
727 quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
728 quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
729 qyac = av_clip_uintp2(qyac, 8);
731 s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
732 s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
733 s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
734 s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
// loop-filter deltas are scaled x2 when the base filter level >= 32
736 sh = s->s.h.filter.level >= 32;
737 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
738 if (s->s.h.segmentation.absolute_vals)
739 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
741 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
743 lflvl = s->s.h.filter.level;
745 if (s->s.h.lf_delta.enabled) {
746 s->s.h.segmentation.feat[i].lflvl[0][0] =
747 s->s.h.segmentation.feat[i].lflvl[0][1] =
748 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
749 for (j = 1; j < 4; j++) {
750 s->s.h.segmentation.feat[i].lflvl[j][0] =
751 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
752 s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
753 s->s.h.segmentation.feat[i].lflvl[j][1] =
754 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
755 s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
758 memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
759 sizeof(s->s.h.segmentation.feat[i].lflvl));
/* dimensions are now known; (re)allocate size-dependent state */
764 if ((res = update_size(ctx, w, h)) < 0) {
765 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
/* tiling info: min/max log2 tile columns derive from sb_cols */
769 for (s->s.h.tiling.log2_tile_cols = 0;
770 s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
771 s->s.h.tiling.log2_tile_cols++) ;
772 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
773 max = FFMAX(0, max - 1);
// increment-by-bit coding of log2_tile_cols up to the maximum
774 while (max > s->s.h.tiling.log2_tile_cols) {
775 if (get_bits1(&s->gb))
776 s->s.h.tiling.log2_tile_cols++;
780 s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
781 s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
782 if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
783 s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
// one range coder per tile column
784 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
785 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
787 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
788 return AVERROR(ENOMEM);
792 /* check reference frames */
793 if (!s->s.h.keyframe && !s->s.h.intraonly) {
794 for (i = 0; i < 3; i++) {
795 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
796 int refw = ref->width, refh = ref->height;
798 if (ref->format != ctx->pix_fmt) {
799 av_log(ctx, AV_LOG_ERROR,
800 "Ref pixfmt (%s) did not match current frame (%s)",
801 av_get_pix_fmt_name(ref->format),
802 av_get_pix_fmt_name(ctx->pix_fmt));
803 return AVERROR_INVALIDDATA;
804 } else if (refw == w && refh == h) {
// same size: no scaling needed for this reference
805 s->mvscale[i][0] = s->mvscale[i][1] = 0;
// scaled reference: ratio must be within [1/16x, 2x] per dimension
807 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
808 av_log(ctx, AV_LOG_ERROR,
809 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
811 return AVERROR_INVALIDDATA;
// 14-bit fixed-point scale factors and per-16-pel integer steps
813 s->mvscale[i][0] = (refw << 14) / w;
814 s->mvscale[i][1] = (refh << 14) / h;
815 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
816 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
/* reset probability contexts to defaults where the header demands it */
821 if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
822 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
823 s->prob_ctx[3].p = vp9_default_probs;
824 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
825 sizeof(vp9_default_coef_probs));
826 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
827 sizeof(vp9_default_coef_probs));
828 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
829 sizeof(vp9_default_coef_probs));
830 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
831 sizeof(vp9_default_coef_probs));
832 } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
833 s->prob_ctx[c].p = vp9_default_probs;
834 memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
835 sizeof(vp9_default_coef_probs));
838 // next 16 bits is size of the rest of the header (arith-coded)
839 s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
840 s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
842 data2 = align_get_bits(&s->gb);
843 if (size2 > size - (data2 - data)) {
844 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
845 return AVERROR_INVALIDDATA;
847 ff_vp56_init_range_decoder(&s->c, data2, size2);
848 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
849 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
850 return AVERROR_INVALIDDATA;
/* reset symbol counters for backward adaptation */
853 if (s->s.h.keyframe || s->s.h.intraonly) {
854 memset(s->counts.coef, 0, sizeof(s->counts.coef));
855 memset(s->counts.eob, 0, sizeof(s->counts.eob));
857 memset(&s->counts, 0, sizeof(s->counts));
859 // FIXME is it faster to not copy here, but do it down in the fw updates
860 // as explicit copies if the fw update is missing (and skip the copy upon
862 s->prob.p = s->prob_ctx[c].p;
// txfm updates
865 if (s->s.h.lossless) {
866 s->s.h.txfmmode = TX_4X4;
868 s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
869 if (s->s.h.txfmmode == 3)
870 s->s.h.txfmmode += vp8_rac_get(&s->c);
872 if (s->s.h.txfmmode == TX_SWITCHABLE) {
873 for (i = 0; i < 2; i++)
874 if (vp56_rac_get_prob_branchy(&s->c, 252))
875 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
876 for (i = 0; i < 2; i++)
877 for (j = 0; j < 2; j++)
878 if (vp56_rac_get_prob_branchy(&s->c, 252))
879 s->prob.p.tx16p[i][j] =
880 update_prob(&s->c, s->prob.p.tx16p[i][j]);
881 for (i = 0; i < 2; i++)
882 for (j = 0; j < 3; j++)
883 if (vp56_rac_get_prob_branchy(&s->c, 252))
884 s->prob.p.tx32p[i][j] =
885 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// coefficient probability updates, per transform size i: either read
// forward updates into s->prob.coef, or copy the saved context as-is
890 for (i = 0; i < 4; i++) {
891 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
892 if (vp8_rac_get(&s->c)) {
893 for (j = 0; j < 2; j++)
894 for (k = 0; k < 2; k++)
895 for (l = 0; l < 6; l++)
896 for (m = 0; m < 6; m++) {
897 uint8_t *p = s->prob.coef[i][j][k][l][m];
898 uint8_t *r = ref[j][k][l][m];
899 if (m >= 3 && l == 0) // dc only has 3 pt
901 for (n = 0; n < 3; n++) {
902 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
903 p[n] = update_prob(&s->c, r[n]);
911 for (j = 0; j < 2; j++)
912 for (k = 0; k < 2; k++)
913 for (l = 0; l < 6; l++)
914 for (m = 0; m < 6; m++) {
915 uint8_t *p = s->prob.coef[i][j][k][l][m];
916 uint8_t *r = ref[j][k][l][m];
// NOTE(review): '>' here vs '>=' in the update branch above. The
// asymmetry is likely harmless (copying the m==3, l==0 entry copies
// data the decoder never reads) but confirm against upstream before
// "fixing" it to match the branch above.
917 if (m > 3 && l == 0) // dc only has 3 pt
923 if (s->s.h.txfmmode == i)
// mode/filter/ref probability updates
928 for (i = 0; i < 3; i++)
929 if (vp56_rac_get_prob_branchy(&s->c, 252))
930 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
931 if (!s->s.h.keyframe && !s->s.h.intraonly) {
932 for (i = 0; i < 7; i++)
933 for (j = 0; j < 3; j++)
934 if (vp56_rac_get_prob_branchy(&s->c, 252))
935 s->prob.p.mv_mode[i][j] =
936 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
938 if (s->s.h.filtermode == FILTER_SWITCHABLE)
939 for (i = 0; i < 4; i++)
940 for (j = 0; j < 2; j++)
941 if (vp56_rac_get_prob_branchy(&s->c, 252))
942 s->prob.p.filter[i][j] =
943 update_prob(&s->c, s->prob.p.filter[i][j]);
945 for (i = 0; i < 4; i++)
946 if (vp56_rac_get_prob_branchy(&s->c, 252))
947 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
949 if (s->s.h.allowcompinter) {
950 s->s.h.comppredmode = vp8_rac_get(&s->c);
951 if (s->s.h.comppredmode)
952 s->s.h.comppredmode += vp8_rac_get(&s->c);
953 if (s->s.h.comppredmode == PRED_SWITCHABLE)
954 for (i = 0; i < 5; i++)
955 if (vp56_rac_get_prob_branchy(&s->c, 252))
957 update_prob(&s->c, s->prob.p.comp[i]);
959 s->s.h.comppredmode = PRED_SINGLEREF;
962 if (s->s.h.comppredmode != PRED_COMPREF) {
963 for (i = 0; i < 5; i++) {
964 if (vp56_rac_get_prob_branchy(&s->c, 252))
965 s->prob.p.single_ref[i][0] =
966 update_prob(&s->c, s->prob.p.single_ref[i][0]);
967 if (vp56_rac_get_prob_branchy(&s->c, 252))
968 s->prob.p.single_ref[i][1] =
969 update_prob(&s->c, s->prob.p.single_ref[i][1]);
973 if (s->s.h.comppredmode != PRED_SINGLEREF) {
974 for (i = 0; i < 5; i++)
975 if (vp56_rac_get_prob_branchy(&s->c, 252))
976 s->prob.p.comp_ref[i] =
977 update_prob(&s->c, s->prob.p.comp_ref[i]);
980 for (i = 0; i < 4; i++)
981 for (j = 0; j < 9; j++)
982 if (vp56_rac_get_prob_branchy(&s->c, 252))
983 s->prob.p.y_mode[i][j] =
984 update_prob(&s->c, s->prob.p.y_mode[i][j]);
986 for (i = 0; i < 4; i++)
987 for (j = 0; j < 4; j++)
988 for (k = 0; k < 3; k++)
989 if (vp56_rac_get_prob_branchy(&s->c, 252))
990 s->prob.p.partition[3 - i][j][k] =
991 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
993 // mv fields don't use the update_prob subexp model for some reason
994 for (i = 0; i < 3; i++)
995 if (vp56_rac_get_prob_branchy(&s->c, 252))
996 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
998 for (i = 0; i < 2; i++) {
999 if (vp56_rac_get_prob_branchy(&s->c, 252))
1000 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1002 for (j = 0; j < 10; j++)
1003 if (vp56_rac_get_prob_branchy(&s->c, 252))
1004 s->prob.p.mv_comp[i].classes[j] =
1005 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1007 if (vp56_rac_get_prob_branchy(&s->c, 252))
1008 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1010 for (j = 0; j < 10; j++)
1011 if (vp56_rac_get_prob_branchy(&s->c, 252))
1012 s->prob.p.mv_comp[i].bits[j] =
1013 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1016 for (i = 0; i < 2; i++) {
1017 for (j = 0; j < 2; j++)
1018 for (k = 0; k < 3; k++)
1019 if (vp56_rac_get_prob_branchy(&s->c, 252))
1020 s->prob.p.mv_comp[i].class0_fp[j][k] =
1021 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1023 for (j = 0; j < 3; j++)
1024 if (vp56_rac_get_prob_branchy(&s->c, 252))
1025 s->prob.p.mv_comp[i].fp[j] =
1026 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// hp bits are only coded when high-precision MVs are enabled
1029 if (s->s.h.highprecisionmvs) {
1030 for (i = 0; i < 2; i++) {
1031 if (vp56_rac_get_prob_branchy(&s->c, 252))
1032 s->prob.p.mv_comp[i].class0_hp =
1033 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1035 if (vp56_rac_get_prob_branchy(&s->c, 252))
1036 s->prob.p.mv_comp[i].hp =
1037 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total bytes consumed: uncompressed header + compressed header
1042 return (data2 - data) + size2;
// Clamp a motion vector into the currently valid MV range tracked in the
// decoder context (s->min_mv / s->max_mv), component-wise via av_clip().
// NOTE(review): the trailing VP9Context parameter and the function braces
// are on lines not visible in this chunk.
1045 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1048 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1049 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Derive the predicted motion vector *pmv for reference index `ref` of the
// current block. Candidates are ranked, as visible below: already-decoded
// sibling sub-8x8 MVs of this block (sb/z selectors), the above/left
// spatial neighbours that used the same reference, the co-located MV from
// the previous frame, then neighbour / previous-frame MVs that used a
// *different* reference (sign-flipped when the two references' sign biases
// differ). The result is finally clamped into the valid MV range.
1052 static void find_ref_mvs(VP9Context *s,
1053 VP56mv *pmv, int ref, int z, int idx, int sb)
// Per-block-size table of up to 8 neighbour positions to scan; each entry
// is a {col_delta, row_delta} pair added to (col, row) below.
1055 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1056 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1057 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1058 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1059 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1060 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1061 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1062 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1063 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1064 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1065 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1066 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1067 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1068 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1069 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1070 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1071 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1072 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1073 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1074 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1075 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1076 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1077 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1078 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1079 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1080 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1081 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1084 int row = s->row, col = s->col, row7 = s->row7;
1085 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
// Sentinel: packed 32-bit MV value that cannot occur as a real MV.
1086 #define INVALID_MV 0x80008000U
1087 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
// Return an MV candidate as-is (no clamping), remembering the first match
// in `mem` so that a second, different candidate terminates the search.
// NOTE(review): several continuation lines of this macro are not visible
// in this chunk.
1090 #define RETURN_DIRECT_MV(mv) \
1092 uint32_t m = AV_RN32A(&mv); \
1096 } else if (mem == INVALID_MV) { \
1098 } else if (m != mem) { \
// Sub-8x8 blocks first consult the MVs of their already-decoded siblings.
1105 if (sb == 2 || sb == 1) {
1106 RETURN_DIRECT_MV(b->mv[0][z]);
1107 } else if (sb == 3) {
1108 RETURN_DIRECT_MV(b->mv[2][z]);
1109 RETURN_DIRECT_MV(b->mv[1][z]);
1110 RETURN_DIRECT_MV(b->mv[0][z]);
// Like RETURN_DIRECT_MV but clamps the candidate; for idx == 1 (sub-8x8)
// the unclamped value is also deduplicated via mem_sub8x8. NOTE(review):
// parts of this macro body are not visible in this chunk.
1113 #define RETURN_MV(mv) \
1118 av_assert2(idx == 1); \
1119 av_assert2(mem != INVALID_MV); \
1120 if (mem_sub8x8 == INVALID_MV) { \
1121 clamp_mv(&tmp, &mv, s); \
1122 m = AV_RN32A(&tmp); \
1127 mem_sub8x8 = AV_RN32A(&mv); \
1128 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1129 clamp_mv(&tmp, &mv, s); \
1130 m = AV_RN32A(&tmp); \
1134 /* BUG I'm pretty sure this isn't the intention */ \
1140 uint32_t m = AV_RN32A(&mv); \
1142 clamp_mv(pmv, &mv, s); \
1144 } else if (mem == INVALID_MV) { \
1146 } else if (m != mem) { \
1147 clamp_mv(pmv, &mv, s); \
// Above neighbour (same reference): use the cached above-row MV context.
1154 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1155 if (mv->ref[0] == ref) {
1156 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1157 } else if (mv->ref[1] == ref) {
1158 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
// Left neighbour (same reference), only when inside the current tile.
1161 if (col > s->tile_col_start) {
1162 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1163 if (mv->ref[0] == ref) {
1164 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1165 } else if (mv->ref[1] == ref) {
1166 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1174 // previously coded MVs in this neighbourhood, using same reference frame
1175 for (; i < 8; i++) {
1176 int c = p[i][0] + col, r = p[i][1] + row;
1178 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1179 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1181 if (mv->ref[0] == ref) {
1182 RETURN_MV(mv->mv[0]);
1183 } else if (mv->ref[1] == ref) {
1184 RETURN_MV(mv->mv[1]);
1189 // MV at this position in previous frame, using same reference frame
1190 if (s->s.h.use_last_frame_mvs) {
1191 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
// Frame-threading: wait until the reference frame's decoder has produced
// this row before reading its MVs (not needed in 2-pass mode).
1193 if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1194 ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1195 if (mv->ref[0] == ref) {
1196 RETURN_MV(mv->mv[0]);
1197 } else if (mv->ref[1] == ref) {
1198 RETURN_MV(mv->mv[1]);
// Candidate from a different reference: negate (sign-flip) the MV when the
// references disagree in sign bias, then feed it through RETURN_MV.
1202 #define RETURN_SCALE_MV(mv, scale) \
1205 VP56mv mv_temp = { -mv.x, -mv.y }; \
1206 RETURN_MV(mv_temp); \
1212 // previously coded MVs in this neighbourhood, using different reference frame
1213 for (i = 0; i < 8; i++) {
1214 int c = p[i][0] + col, r = p[i][1] + row;
1216 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1217 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1219 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1220 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1222 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1223 // BUG - libvpx has this condition regardless of whether
1224 // we used the first ref MV and pre-scaling
1225 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1226 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1231 // MV at this position in previous frame, using different reference frame
1232 if (s->s.h.use_last_frame_mvs) {
1233 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1235 // no need to await_progress, because we already did that above
1236 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1237 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1239 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1240 // BUG - libvpx has this condition regardless of whether
1241 // we used the first ref MV and pre-scaling
1242 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1243 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
// Fallback: clamp whatever prediction survived into the valid range.
1248 clamp_mv(pmv, pmv, s);
1251 #undef RETURN_SCALE_MV
// Decode a single motion-vector component delta for comp `idx` (0 = y,
// 1 = x, per the callers in fill_mv): sign bit, magnitude class, then the
// class-dependent integer bits plus fractional (fp) and, if `hp` is set,
// high-precision bits. Updates the per-frame entropy counters as it goes
// and returns the signed component value.
1254 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1256 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1257 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1258 s->prob.p.mv_comp[idx].classes);
1260 s->counts.mv_comp[idx].sign[sign]++;
1261 s->counts.mv_comp[idx].classes[c]++;
// Classes >= 1: read c integer magnitude bits, then the shared fractional
// and (optionally) high-precision bits.
1265 for (n = 0, m = 0; m < c; m++) {
1266 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1268 s->counts.mv_comp[idx].bits[m][bit]++;
1271 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1273 s->counts.mv_comp[idx].fp[bit]++;
1275 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1276 s->counts.mv_comp[idx].hp[bit]++;
1280 // bug in libvpx - we count for bw entropy purposes even if the
1282 s->counts.mv_comp[idx].hp[1]++;
// Class 0: one integer bit plus class0-specific fractional/hp parts.
1286 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1287 s->counts.mv_comp[idx].class0[n]++;
1288 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1289 s->prob.p.mv_comp[idx].class0_fp[n]);
1290 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1291 n = (n << 3) | (bit << 1);
1293 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1294 s->counts.mv_comp[idx].class0_hp[bit]++;
1298 // bug in libvpx - we count for bw entropy purposes even if the
1300 s->counts.mv_comp[idx].class0_hp[1]++;
// Apply the decoded sign; magnitude is biased by +1 (a zero delta is
// signalled through the joint, not here).
1304 return sign ? -(n + 1) : (n + 1);
// Fill mv[0] (and mv[1] for compound blocks) for the current (sub-)block:
// ZEROMV short-circuits; otherwise the prediction comes from
// find_ref_mvs(), and for NEWMV a decoded joint + per-component delta from
// read_mv_component() is added on top. `sb` is the sub-block index
// (-1 for whole-block NEWMV, as passed below).
1307 static void fill_mv(VP9Context *s,
1308 VP56mv *mv, int mode, int sb)
1312 if (mode == ZEROMV) {
1317 // FIXME cache this value and reuse for other subblocks
1318 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1319 mode == NEWMV ? -1 : sb);
1320 // FIXME maybe move this code into find_ref_mvs()
// hp (use of the extra high-precision bit) is only allowed when the
// header enables it AND the predicted MV is small (|x|,|y| < 64).
1321 if ((mode == NEWMV || sb == -1) &&
1322 !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1336 if (mode == NEWMV) {
1337 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1338 s->prob.p.mv_joint);
1340 s->counts.mv_joint[j]++;
1341 if (j >= MV_JOINT_V)
1342 mv[0].y += read_mv_component(s, 0, hp);
1344 mv[0].x += read_mv_component(s, 1, hp);
// Second reference of a compound block: same procedure for mv[1].
1348 // FIXME cache this value and reuse for other subblocks
1349 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1350 mode == NEWMV ? -1 : sb);
1351 if ((mode == NEWMV || sb == -1) &&
1352 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1366 if (mode == NEWMV) {
1367 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1368 s->prob.p.mv_joint);
1370 s->counts.mv_joint[j]++;
1371 if (j >= MV_JOINT_V)
1372 mv[1].y += read_mv_component(s, 0, hp);
1374 mv[1].x += read_mv_component(s, 1, hp);
// Splat the byte value v over a w x h region of a context plane at the
// given stride, dispatching on w to replicated 16/32/64-bit stores
// (v replicated across each lane via the 0x0101... multipliers).
// NOTE(review): the switch/loop scaffolding around these stores is on
// lines not visible in this chunk.
1380 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1381 ptrdiff_t stride, int v)
1391 int v16 = v * 0x0101;
1399 uint32_t v32 = v * 0x01010101;
1408 uint64_t v64 = v * 0x0101010101010101ULL;
1414 uint32_t v32 = v * 0x01010101;
1417 AV_WN32A(ptr + 4, v32);
// Decode all mode information for the current block: segment id, skip
// flag, intra/inter flag, transform size, intra prediction modes (or
// inter reference(s), interpolation filter, inter modes and motion
// vectors), then propagate everything into the above/left context arrays
// and the per-4x4 MV/ref store used by later blocks and the next frame.
1426 static void decode_mode(AVCodecContext *ctx)
// Partition-context bit patterns keyed by block size, splatted into the
// left/above partition context below via SET_CTXS.
1428 static const uint8_t left_ctx[N_BS_SIZES] = {
1429 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1431 static const uint8_t above_ctx[N_BS_SIZES] = {
1432 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
// Largest transform size permitted for each block size.
1434 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1435 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1436 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1438 VP9Context *s = ctx->priv_data;
1440 int row = s->row, col = s->col, row7 = s->row7;
1441 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
// w4/h4 are the block dimensions in 4x4 units, clipped at the frame edge.
1442 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1443 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1444 int have_a = row > 0, have_l = col > s->tile_col_start;
1445 int vref, filter_id;
// --- segment id: explicit, or temporally predicted from the reference
// frame's segmentation map (minimum over the covered 4x4 cells).
1447 if (!s->s.h.segmentation.enabled) {
1449 } else if (s->s.h.keyframe || s->s.h.intraonly) {
1450 b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1451 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1452 } else if (!s->s.h.segmentation.update_map ||
1453 (s->s.h.segmentation.temporal &&
1454 vp56_rac_get_prob_branchy(&s->c,
1455 s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1456 s->left_segpred_ctx[row7]]))) {
1457 if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1459 uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
// Frame-threading: wait for the segmap reference to reach this row.
1461 if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1462 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1463 for (y = 0; y < h4; y++) {
1464 int idx_base = (y + row) * 8 * s->sb_cols + col;
1465 for (x = 0; x < w4; x++)
1466 pred = FFMIN(pred, refsegmap[idx_base + x]);
1468 av_assert1(pred < 8);
1474 memset(&s->above_segpred_ctx[col], 1, w4);
1475 memset(&s->left_segpred_ctx[row7], 1, h4);
1477 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1478 s->s.h.segmentation.prob);
1480 memset(&s->above_segpred_ctx[col], 0, w4);
1481 memset(&s->left_segpred_ctx[row7], 0, h4);
1483 if (s->s.h.segmentation.enabled &&
1484 (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1485 setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1486 bw4, bh4, 8 * s->sb_cols, b->seg_id);
// --- skip flag: forced by the segment feature, otherwise coded with a
// context built from the left/above skip contexts.
1489 b->skip = s->s.h.segmentation.enabled &&
1490 s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1492 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1493 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1494 s->counts.skip[c][b->skip]++;
// --- intra/inter flag.
1497 if (s->s.h.keyframe || s->s.h.intraonly) {
1499 } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1500 b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1504 if (have_a && have_l) {
1505 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1508 c = have_a ? 2 * s->above_intra_ctx[col] :
1509 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1511 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1512 s->counts.intra[c][bit]++;
// --- transform size: coded only in TX_SWITCHABLE mode (context from the
// neighbours' tx/skip state), otherwise capped at max_tx.
1516 if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1520 c = (s->above_skip_ctx[col] ? max_tx :
1521 s->above_txfm_ctx[col]) +
1522 (s->left_skip_ctx[row7] ? max_tx :
1523 s->left_txfm_ctx[row7]) > max_tx;
1525 c = s->above_skip_ctx[col] ? 1 :
1526 (s->above_txfm_ctx[col] * 2 > max_tx);
1528 } else if (have_l) {
1529 c = s->left_skip_ctx[row7] ? 1 :
1530 (s->left_txfm_ctx[row7] * 2 > max_tx);
1536 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1538 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1540 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1542 s->counts.tx32p[c][b->tx]++;
1545 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1547 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1548 s->counts.tx16p[c][b->tx]++;
1551 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1552 s->counts.tx8p[c][b->tx]++;
1559 b->tx = FFMIN(max_tx, s->s.h.txfmmode);
// --- intra modes, keyframe/intra-only path: fixed keyframe probability
// tables contexted on the above/left mode arrays.
1562 if (s->s.h.keyframe || s->s.h.intraonly) {
1563 uint8_t *a = &s->above_mode_ctx[col * 2];
1564 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1567 if (b->bs > BS_8x8) {
1568 // FIXME the memory storage intermediates here aren't really
1569 // necessary, they're just there to make the code slightly
1571 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1572 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1573 if (b->bs != BS_8x4) {
1574 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1575 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1576 l[0] = a[1] = b->mode[1];
1578 l[0] = a[1] = b->mode[1] = b->mode[0];
1580 if (b->bs != BS_4x8) {
1581 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1582 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1583 if (b->bs != BS_8x4) {
1584 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1585 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1586 l[1] = a[1] = b->mode[3];
1588 l[1] = a[1] = b->mode[3] = b->mode[2];
1591 b->mode[2] = b->mode[0];
1592 l[1] = a[1] = b->mode[3] = b->mode[1];
1595 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1596 vp9_default_kf_ymode_probs[*a][*l]);
1597 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1598 // FIXME this can probably be optimized
1599 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1600 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1602 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1603 vp9_default_kf_uvmode_probs[b->mode[3]]);
// --- intra modes, inter-frame path: adaptive y_mode/uv_mode tables.
1604 } else if (b->intra) {
1606 if (b->bs > BS_8x8) {
1607 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1608 s->prob.p.y_mode[0]);
1609 s->counts.y_mode[0][b->mode[0]]++;
1610 if (b->bs != BS_8x4) {
1611 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1612 s->prob.p.y_mode[0]);
1613 s->counts.y_mode[0][b->mode[1]]++;
1615 b->mode[1] = b->mode[0];
1617 if (b->bs != BS_4x8) {
1618 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1619 s->prob.p.y_mode[0]);
1620 s->counts.y_mode[0][b->mode[2]]++;
1621 if (b->bs != BS_8x4) {
1622 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 s->prob.p.y_mode[0]);
1624 s->counts.y_mode[0][b->mode[3]]++;
1626 b->mode[3] = b->mode[2];
1629 b->mode[2] = b->mode[0];
1630 b->mode[3] = b->mode[1];
1633 static const uint8_t size_group[10] = {
1634 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1636 int sz = size_group[b->bs];
1638 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1639 s->prob.p.y_mode[sz]);
1640 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1641 s->counts.y_mode[sz][b->mode[3]]++;
1643 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1644 s->prob.p.uv_mode[b->mode[3]]);
1645 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
// --- inter path: context LUT mapping (above_mode, left_mode) pairs to an
// inter-mode probability context.
1647 static const uint8_t inter_mode_ctx_lut[14][14] = {
1648 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1649 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1650 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1651 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1652 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1653 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1654 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1655 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1656 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1657 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1658 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1659 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1660 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1661 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
// Reference forced by the segment feature (ref_val - 1).
1664 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1665 av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1667 b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1669 // read comp_pred flag
1670 if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1671 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1675 // FIXME add intra as ref=0xff (or -1) to make these easier?
1678 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1680 } else if (s->above_comp_ctx[col]) {
1681 c = 2 + (s->left_intra_ctx[row7] ||
1682 s->left_ref_ctx[row7] == s->s.h.fixcompref);
1683 } else if (s->left_comp_ctx[row7]) {
1684 c = 2 + (s->above_intra_ctx[col] ||
1685 s->above_ref_ctx[col] == s->s.h.fixcompref);
1687 c = (!s->above_intra_ctx[col] &&
1688 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
// NOTE(review): this line indexes left_ref_ctx with `row & 7` while every
// sibling branch uses `row7` — presumably equivalent; confirm row7 == row & 7.
1689 (!s->left_intra_ctx[row7] &&
1690 s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1693 c = s->above_comp_ctx[col] ? 3 :
1694 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1696 } else if (have_l) {
1697 c = s->left_comp_ctx[row7] ? 3 :
1698 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1702 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1703 s->counts.comp[c][b->comp]++;
1706 // read actual references
1707 // FIXME probably cache a few variables here to prevent repetitive
1708 // memory accesses below
1709 if (b->comp) /* two references */ {
1710 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1712 b->ref[fix_idx] = s->s.h.fixcompref;
1713 // FIXME can this codeblob be replaced by some sort of LUT?
// Context derivation for the *variable* compound reference, from the
// neighbours' intra/comp/ref state.
1716 if (s->above_intra_ctx[col]) {
1717 if (s->left_intra_ctx[row7]) {
1720 c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1722 } else if (s->left_intra_ctx[row7]) {
1723 c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1725 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1727 if (refl == refa && refa == s->s.h.varcompref[1]) {
1729 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1730 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1731 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1734 c = (refa == refl) ? 3 : 1;
1736 } else if (!s->left_comp_ctx[row7]) {
1737 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1740 c = (refl == s->s.h.varcompref[1] &&
1741 refa != s->s.h.varcompref[1]) ? 2 : 4;
1743 } else if (!s->above_comp_ctx[col]) {
1744 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1747 c = (refa == s->s.h.varcompref[1] &&
1748 refl != s->s.h.varcompref[1]) ? 2 : 4;
1751 c = (refl == refa) ? 4 : 2;
1755 if (s->above_intra_ctx[col]) {
1757 } else if (s->above_comp_ctx[col]) {
1758 c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1760 c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1763 } else if (have_l) {
1764 if (s->left_intra_ctx[row7]) {
1766 } else if (s->left_comp_ctx[row7]) {
1767 c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1769 c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1774 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1775 b->ref[var_idx] = s->s.h.varcompref[bit];
1776 s->counts.comp_ref[c][bit]++;
1777 } else /* single reference */ {
// First single_ref bit: "is the reference non-LAST" decision, context
// from the neighbours.
1780 if (have_a && !s->above_intra_ctx[col]) {
1781 if (have_l && !s->left_intra_ctx[row7]) {
1782 if (s->left_comp_ctx[row7]) {
1783 if (s->above_comp_ctx[col]) {
1784 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1785 !s->above_ref_ctx[col]);
1787 c = (3 * !s->above_ref_ctx[col]) +
1788 (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1790 } else if (s->above_comp_ctx[col]) {
1791 c = (3 * !s->left_ref_ctx[row7]) +
1792 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1794 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1796 } else if (s->above_intra_ctx[col]) {
1798 } else if (s->above_comp_ctx[col]) {
1799 c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1801 c = 4 * (!s->above_ref_ctx[col]);
1803 } else if (have_l && !s->left_intra_ctx[row7]) {
1804 if (s->left_intra_ctx[row7]) {
1806 } else if (s->left_comp_ctx[row7]) {
1807 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1809 c = 4 * (!s->left_ref_ctx[row7]);
1814 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1815 s->counts.single_ref[c][0][bit]++;
1819 // FIXME can this codeblob be replaced by some sort of LUT?
// Second single_ref bit: picks between the two non-LAST references.
1822 if (s->left_intra_ctx[row7]) {
1823 if (s->above_intra_ctx[col]) {
1825 } else if (s->above_comp_ctx[col]) {
1826 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1827 s->above_ref_ctx[col] == 1);
1828 } else if (!s->above_ref_ctx[col]) {
1831 c = 4 * (s->above_ref_ctx[col] == 1);
1833 } else if (s->above_intra_ctx[col]) {
1834 if (s->left_intra_ctx[row7]) {
1836 } else if (s->left_comp_ctx[row7]) {
1837 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1838 s->left_ref_ctx[row7] == 1);
1839 } else if (!s->left_ref_ctx[row7]) {
1842 c = 4 * (s->left_ref_ctx[row7] == 1);
1844 } else if (s->above_comp_ctx[col]) {
1845 if (s->left_comp_ctx[row7]) {
1846 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1847 c = 3 * (s->s.h.fixcompref == 1 ||
1848 s->left_ref_ctx[row7] == 1);
1852 } else if (!s->left_ref_ctx[row7]) {
1853 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1854 s->above_ref_ctx[col] == 1);
1856 c = 3 * (s->left_ref_ctx[row7] == 1) +
1857 (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1859 } else if (s->left_comp_ctx[row7]) {
1860 if (!s->above_ref_ctx[col]) {
1861 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1862 s->left_ref_ctx[row7] == 1);
1864 c = 3 * (s->above_ref_ctx[col] == 1) +
1865 (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1867 } else if (!s->above_ref_ctx[col]) {
1868 if (!s->left_ref_ctx[row7]) {
1871 c = 4 * (s->left_ref_ctx[row7] == 1);
1873 } else if (!s->left_ref_ctx[row7]) {
1874 c = 4 * (s->above_ref_ctx[col] == 1);
1876 c = 2 * (s->left_ref_ctx[row7] == 1) +
1877 2 * (s->above_ref_ctx[col] == 1);
1880 if (s->above_intra_ctx[col] ||
1881 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1883 } else if (s->above_comp_ctx[col]) {
1884 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1886 c = 4 * (s->above_ref_ctx[col] == 1);
1889 } else if (have_l) {
1890 if (s->left_intra_ctx[row7] ||
1891 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1893 } else if (s->left_comp_ctx[row7]) {
1894 c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1896 c = 4 * (s->left_ref_ctx[row7] == 1);
1901 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1902 s->counts.single_ref[c][1][bit]++;
1903 b->ref[0] = 1 + bit;
// --- inter mode(s): whole-block for <= 8x8 (forced ZEROMV under the
// segment skip feature), then interpolation filter, then per-sub-block
// modes and motion vectors via fill_mv().
1908 if (b->bs <= BS_8x8) {
1909 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1910 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1912 static const uint8_t off[10] = {
1913 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1916 // FIXME this needs to use the LUT tables from find_ref_mvs
1917 // because not all are -1,0/0,-1
1918 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1919 [s->left_mode_ctx[row7 + off[b->bs]]];
1921 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1922 s->prob.p.mv_mode[c]);
1923 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1924 s->counts.mv_mode[c][b->mode[0] - 10]++;
// Interpolation filter: coded only in FILTER_SWITCHABLE mode, context
// from the neighbours' filter choices.
1928 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1931 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1932 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1933 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1934 s->left_filter_ctx[row7] : 3;
1936 c = s->above_filter_ctx[col];
1938 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1939 c = s->left_filter_ctx[row7];
1944 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1945 s->prob.p.filter[c]);
1946 s->counts.filter[c][filter_id]++;
1947 b->filter = vp9_filter_lut[filter_id];
1949 b->filter = s->s.h.filtermode;
1952 if (b->bs > BS_8x8) {
1953 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1955 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1956 s->prob.p.mv_mode[c]);
1957 s->counts.mv_mode[c][b->mode[0] - 10]++;
1958 fill_mv(s, b->mv[0], b->mode[0], 0);
1960 if (b->bs != BS_8x4) {
1961 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1962 s->prob.p.mv_mode[c]);
1963 s->counts.mv_mode[c][b->mode[1] - 10]++;
1964 fill_mv(s, b->mv[1], b->mode[1], 1);
1966 b->mode[1] = b->mode[0];
1967 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1968 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1971 if (b->bs != BS_4x8) {
1972 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1973 s->prob.p.mv_mode[c]);
1974 s->counts.mv_mode[c][b->mode[2] - 10]++;
1975 fill_mv(s, b->mv[2], b->mode[2], 2);
1977 if (b->bs != BS_8x4) {
1978 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1979 s->prob.p.mv_mode[c]);
1980 s->counts.mv_mode[c][b->mode[3] - 10]++;
1981 fill_mv(s, b->mv[3], b->mode[3], 3);
1983 b->mode[3] = b->mode[2];
1984 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1985 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1988 b->mode[2] = b->mode[0];
1989 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1990 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1991 b->mode[3] = b->mode[1];
1992 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1993 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
// >= 8x8: one MV (pair) decoded once and replicated to all 4 slots.
1996 fill_mv(s, b->mv[0], b->mode[0], -1);
1997 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1998 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1999 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2000 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2001 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2002 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2005 vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
// SPLAT_CTX: replicate an 8-bit value across n bytes of a context array
// with aligned wide stores. NOTE(review): the two definitions below are
// alternate branches of a preprocessor conditional whose #if/#else lines
// are not visible in this chunk.
2009 #define SPLAT_CTX(var, val, n) \
2011 case 1: var = val; break; \
2012 case 2: AV_WN16A(&var, val * 0x0101); break; \
2013 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2014 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2016 uint64_t v64 = val * 0x0101010101010101ULL; \
2017 AV_WN64A( &var, v64); \
2018 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2023 #define SPLAT_CTX(var, val, n) \
2025 case 1: var = val; break; \
2026 case 2: AV_WN16A(&var, val * 0x0101); break; \
2027 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2029 uint32_t v32 = val * 0x01010101; \
2030 AV_WN32A( &var, v32); \
2031 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2035 uint32_t v32 = val * 0x01010101; \
2036 AV_WN32A( &var, v32); \
2037 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2038 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2039 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2045 switch (bwh_tab[1][b->bs][0]) {
// SET_CTXS: splat all decoded per-block state into one direction's
// (above/left) context arrays over the block's extent.
2046 #define SET_CTXS(dir, off, n) \
2048 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2049 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2050 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2051 if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2052 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2053 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2054 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2056 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2057 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2058 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2063 case 1: SET_CTXS(above, col, 1); break;
2064 case 2: SET_CTXS(above, col, 2); break;
2065 case 4: SET_CTXS(above, col, 4); break;
2066 case 8: SET_CTXS(above, col, 8); break;
2068 switch (bwh_tab[1][b->bs][1]) {
2069 case 1: SET_CTXS(left, row7, 1); break;
2070 case 2: SET_CTXS(left, row7, 2); break;
2071 case 4: SET_CTXS(left, row7, 4); break;
2072 case 8: SET_CTXS(left, row7, 8); break;
// Update the above/left MV context caches consumed by find_ref_mvs().
2077 if (!s->s.h.keyframe && !s->s.h.intraonly) {
2078 if (b->bs > BS_8x8) {
2079 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2081 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2082 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2083 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2084 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2085 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2086 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2087 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2088 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2090 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2092 for (n = 0; n < w4 * 2; n++) {
2093 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2094 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2096 for (n = 0; n < h4 * 2; n++) {
2097 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2098 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
// Store per-4x4 refs/MVs into the current frame's MV plane; this is what
// the co-located prediction in the next frame reads back.
2104 for (y = 0; y < h4; y++) {
2105 int x, o = (row + y) * s->sb_cols * 8 + col;
2106 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2109 for (x = 0; x < w4; x++) {
2113 } else if (b->comp) {
2114 for (x = 0; x < w4; x++) {
2115 mv[x].ref[0] = b->ref[0];
2116 mv[x].ref[1] = b->ref[1];
2117 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2118 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2121 for (x = 0; x < w4; x++) {
2122 mv[x].ref[0] = b->ref[0];
2124 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2130 // FIXME merge cnt/eob arguments?
/*
 * Template for decoding the residual coefficients of one transform block
 * from the range coder, specialized at compile time by the four thin
 * wrappers below (is_tx32x32 / is8bitsperpixel are constants there).
 *
 * NOTE(review): this excerpt is a lossy extraction — each line starts with
 * its original file line number and some intervening lines (braces, the
 * declarations of val/rc, the loop header, some else-lines) are omitted;
 * comments describe only what the visible lines establish.
 *
 * c           - range coder to read from
 * coef        - output coefficient buffer (16-bit stores for 8 bpp; 32-bit
 *               aligned stores via AV_WN32A otherwise, see STORE_COEF)
 * n_coeffs    - maximum coefficient count for this transform size
 * is_tx32x32  - nonzero for the 32x32 transform (visible "/ 2" store below)
 * bpp         - bits per pixel; only relevant when is8bitsperpixel == 0
 * cnt, eob    - adaptation counters, indexed [band][nonzero-ctx][token]
 * p           - token probabilities, indexed [band][nonzero-ctx][node]
 * nnz         - initial nonzero context; re-derived from neighbours per coeff
 * scan, nb    - scan order and per-position neighbour pairs for context
 * band_counts - number of scan positions in each probability band
 * qmul        - dequant factors: qmul[0] for DC (i == 0), qmul[1] for AC
 */
2131 static av_always_inline int
2132 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2133 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2134 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2135 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2136 const int16_t *band_counts, const int16_t *qmul)
2138 int i = 0, band = 0, band_left = band_counts[band];
2139 uint8_t *tp = p[0][nnz];
// cache[] holds the clamped magnitude of each already-decoded position; the
// two neighbours nb[i][0..1] are averaged (rounding up) to form the next
// nonzero context.
2140 uint8_t cache[1024];
// End-of-block decision at probability tp[0]; the outcome feeds the eob
// adaptation counter.
2145 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2146 eob[band][nnz][val]++;
// Zero token: bump the counter, advance band bookkeeping and context, and
// continue with the next scan position.
2151 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2152 cnt[band][nnz][0]++;
2154 band_left = band_counts[++band];
2156 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2158 if (++i == n_coeffs)
2159 break; //invalid input; blocks should end with EOB
// "one" token (magnitude exactly 1).
2164 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2165 cnt[band][nnz][1]++;
2169 // fill in p[3-10] (model fill) - only once per frame for each pos
2171 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
// Magnitude > 1: walk the token tree tp[3..10] (nodes filled from the
// Pareto model above) to pick a value category, then read its extra bits
// at fixed probabilities.
2173 cnt[band][nnz][2]++;
2174 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2175 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2176 cache[rc] = val = 2;
2178 val = 3 + vp56_rac_get_prob(c, tp[5]);
2181 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2183 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2184 val = 5 + vp56_rac_get_prob(c, 159);
2186 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2187 val += vp56_rac_get_prob(c, 145);
// cat3/cat4: 11..18 and 19..34 ranges built from fixed-probability bits.
2191 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2192 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2193 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2194 val += (vp56_rac_get_prob(c, 148) << 1);
2195 val += vp56_rac_get_prob(c, 140);
2197 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2198 val += (vp56_rac_get_prob(c, 155) << 2);
2199 val += (vp56_rac_get_prob(c, 140) << 1);
2200 val += vp56_rac_get_prob(c, 135);
2202 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2203 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2204 val += (vp56_rac_get_prob(c, 157) << 3);
2205 val += (vp56_rac_get_prob(c, 141) << 2);
2206 val += (vp56_rac_get_prob(c, 134) << 1);
2207 val += vp56_rac_get_prob(c, 130);
// Largest category: for >8 bpp streams extra high-order bits (shifts 17/16
// vs 15/14) are read first — presumably selected by a bpp check on an
// omitted line; verify against the full source.
2210 if (!is8bitsperpixel) {
2212 val += vp56_rac_get_prob(c, 255) << 17;
2213 val += vp56_rac_get_prob(c, 255) << 16;
2215 val += (vp56_rac_get_prob(c, 255) << 15);
2216 val += (vp56_rac_get_prob(c, 255) << 14);
2218 val += (vp56_rac_get_prob(c, 254) << 13);
2219 val += (vp56_rac_get_prob(c, 254) << 12);
2220 val += (vp56_rac_get_prob(c, 254) << 11);
2221 val += (vp56_rac_get_prob(c, 252) << 10);
2222 val += (vp56_rac_get_prob(c, 249) << 9);
2223 val += (vp56_rac_get_prob(c, 243) << 8);
2224 val += (vp56_rac_get_prob(c, 230) << 7);
2225 val += (vp56_rac_get_prob(c, 196) << 6);
2226 val += (vp56_rac_get_prob(c, 177) << 5);
2227 val += (vp56_rac_get_prob(c, 153) << 4);
2228 val += (vp56_rac_get_prob(c, 140) << 3);
2229 val += (vp56_rac_get_prob(c, 133) << 2);
2230 val += (vp56_rac_get_prob(c, 130) << 1);
2231 val += vp56_rac_get_prob(c, 129);
// Store one dequantized coefficient: 16-bit store for 8 bpp, 32-bit
// aligned store otherwise (coef is then treated as pairs of int16_t).
2235 #define STORE_COEF(c, i, v) do { \
2236 if (is8bitsperpixel) { \
2239 AV_WN32A(&c[i * 2], v); \
2243 band_left = band_counts[++band];
// Sign bit via vp8_rac_get; qmul[!!i] picks DC vs AC dequant. The "/ 2"
// variant is presumably the is_tx32x32 branch (the selector line is among
// the omitted lines) — 32x32 coefficients are stored halved.
2245 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2247 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
// Derive the nonzero context for the next scan position from the two
// cached neighbours.
2248 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2250 } while (++i < n_coeffs);
/*
 * Specialization wrapper: decode one 4x4..16x16 transform block
 * (is_tx32x32 = 0) at 8 bits per pixel. Forces compile-time constant
 * folding in decode_coeffs_b_generic.
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded.
 */
2255 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2256 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2257 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2258 const int16_t (*nb)[2], const int16_t *band_counts,
2259 const int16_t *qmul)
2261 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2262 nnz, scan, nb, band_counts, qmul);
/*
 * Specialization wrapper: decode one 32x32 transform block
 * (is_tx32x32 = 1) at 8 bits per pixel.
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded.
 */
2265 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2266 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2267 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2268 const int16_t (*nb)[2], const int16_t *band_counts,
2269 const int16_t *qmul)
2271 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2272 nnz, scan, nb, band_counts, qmul);
/*
 * Specialization wrapper: decode one 4x4..16x16 transform block
 * (is_tx32x32 = 0) for high-bitdepth streams; passes the stream's actual
 * bpp (s->bpp) instead of the constant 8.
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded.
 */
2275 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2276 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2277 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2278 const int16_t (*nb)[2], const int16_t *band_counts,
2279 const int16_t *qmul)
2281 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2282 nnz, scan, nb, band_counts, qmul);
/*
 * Specialization wrapper: decode one 32x32 transform block
 * (is_tx32x32 = 1) for high-bitdepth streams (bpp taken from s->bpp).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded.
 */
2285 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2286 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2287 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2288 const int16_t (*nb)[2], const int16_t *band_counts,
2289 const int16_t *qmul)
2291 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2292 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all residual coefficients (luma then both chroma planes) for the
 * current block (s->row/s->col), dispatching per transform size to the
 * decode_coeffs_b* wrappers and maintaining the above/left nonzero-context
 * arrays. Returns (per the callers' use) whether any coefficients were
 * present — accumulated in total_coeff; the return statement itself is
 * among the omitted lines.
 *
 * NOTE(review): lossy extraction — lines carry their original file line
 * numbers and braces/else-lines/switch headers are missing; comments are
 * limited to what the visible lines show.
 */
2295 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2297 VP9Context *s = ctx->priv_data;
2299 int row = s->row, col = s->col;
2300 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2301 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2302 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
// Block size in 4x4 units (w4/h4), clipped to the visible frame area.
2303 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2304 int end_x = FFMIN(2 * (s->cols - col), w4);
2305 int end_y = FFMIN(2 * (s->rows - row), h4);
2306 int n, pl, x, y, res;
2307 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2308 int tx = 4 * s->s.h.lossless + b->tx;
2309 const int16_t * const *yscans = vp9_scans[tx];
2310 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2311 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2312 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2313 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2314 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
// Coefficients per probability band, one row per transform size
// (4x4: 16 total, 8x8: 64, 16x16: 256, 32x32: 1024).
2315 static const int16_t band_counts[4][8] = {
2316 { 1, 2, 3, 4, 3, 16 - 13 },
2317 { 1, 2, 3, 4, 11, 64 - 21 },
2318 { 1, 2, 3, 4, 11, 256 - 21 },
2319 { 1, 2, 3, 4, 11, 1024 - 21 },
2321 const int16_t *y_band_counts = band_counts[b->tx];
2322 const int16_t *uv_band_counts = band_counts[b->uvtx];
2323 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2324 int total_coeff = 0;
// MERGE/MERGE_CTX collapse groups of per-4x4 nonzero contexts into one
// value per transform unit before decoding; SPLAT/SPLAT_CTX expand the
// decoded per-unit context back over the covered 4x4 positions (with
// fast 32/64-bit stores when the span is fully inside the frame).
// DECODE_Y_COEF_LOOP decodes one luma transform unit per iteration.
2326 #define MERGE(la, end, step, rd) \
2327 for (n = 0; n < end; n += step) \
2328 la[n] = !!rd(&la[n])
2329 #define MERGE_CTX(step, rd) \
2331 MERGE(l, end_y, step, rd); \
2332 MERGE(a, end_x, step, rd); \
2335 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2336 for (n = 0, y = 0; y < end_y; y += step) { \
2337 for (x = 0; x < end_x; x += step, n += step * step) { \
2338 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2339 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2340 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2341 c, e, p, a[x] + l[y], yscans[txtp], \
2342 ynbs[txtp], y_band_counts, qmul[0]); \
2343 a[x] = l[y] = !!res; \
2344 total_coeff |= !!res; \
2346 AV_WN16A(&s->eob[n], res); \
2353 #define SPLAT(la, end, step, cond) \
2355 for (n = 1; n < end; n += step) \
2356 la[n] = la[n - 1]; \
2357 } else if (step == 4) { \
2359 for (n = 0; n < end; n += step) \
2360 AV_WN32A(&la[n], la[n] * 0x01010101); \
2362 for (n = 0; n < end; n += step) \
2363 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2365 } else /* step == 8 */ { \
2367 if (HAVE_FAST_64BIT) { \
2368 for (n = 0; n < end; n += step) \
2369 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2371 for (n = 0; n < end; n += step) { \
2372 uint32_t v32 = la[n] * 0x01010101; \
2373 AV_WN32A(&la[n], v32); \
2374 AV_WN32A(&la[n + 4], v32); \
2378 for (n = 0; n < end; n += step) \
2379 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2382 #define SPLAT_CTX(step) \
2384 SPLAT(a, end_x, step, end_x == w4); \
2385 SPLAT(l, end_y, step, end_y == h4); \
2391 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2394 MERGE_CTX(2, AV_RN16A);
2395 DECODE_Y_COEF_LOOP(2, 0,);
2399 MERGE_CTX(4, AV_RN32A);
2400 DECODE_Y_COEF_LOOP(4, 0,);
2404 MERGE_CTX(8, AV_RN64A);
2405 DECODE_Y_COEF_LOOP(8, 0, 32);
// Chroma variant: writes into s->uvblock[pl]/s->uveob[pl], uses the single
// DCT_DCT scan and the AC dequant pair qmul[1].
2410 #define DECODE_UV_COEF_LOOP(step, v) \
2411 for (n = 0, y = 0; y < end_y; y += step) { \
2412 for (x = 0; x < end_x; x += step, n += step * step) { \
2413 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2414 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2415 16 * step * step, c, e, p, a[x] + l[y], \
2416 uvscan, uvnb, uv_band_counts, qmul[1]); \
2417 a[x] = l[y] = !!res; \
2418 total_coeff |= !!res; \
2420 AV_WN16A(&s->uveob[pl][n], res); \
2422 s->uveob[pl][n] = res; \
2427 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2428 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2429 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
// Both chroma planes use the same probabilities/counters but their own
// above/left nonzero-context arrays (subsampling-aware indexing).
2434 for (pl = 0; pl < 2; pl++) {
2435 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2436 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2439 DECODE_UV_COEF_LOOP(1,);
2442 MERGE_CTX(2, AV_RN16A);
2443 DECODE_UV_COEF_LOOP(2,);
2447 MERGE_CTX(4, AV_RN32A);
2448 DECODE_UV_COEF_LOOP(4,);
2452 MERGE_CTX(8, AV_RN64A);
2453 DECODE_UV_COEF_LOOP(8, 32);
/* 8 bpp entry point: instantiate decode_coeffs() with is8bitsperpixel = 1
 * so the per-bpp branches fold at compile time.
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
2462 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2464 return decode_coeffs(ctx, 1);
/* High-bitdepth entry point: instantiate decode_coeffs() with
 * is8bitsperpixel = 0 (2 bytes per pixel, 32-bit coefficient stores).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
2467 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2469 return decode_coeffs(ctx, 0);
/*
 * Prepare the edge-pixel arrays needed for one intra prediction call and
 * return the (possibly substituted) prediction mode.
 *
 * Fills the top edge into *a (and may repoint *a directly at frame data
 * when a contiguous, fully-available edge exists) and the left edge into
 * l. Modes whose required edges are unavailable are mapped to a DC
 * fallback via mode_conv. Unavailable pixels are synthesized by
 * replication or by the spec's 127/128/129 constants (shifted by bpp-8
 * for high bitdepth).
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces, else-lines and a few statements (e.g. the bpp declaration,
 * n_px_need_tr assignment) are omitted. Comments stick to visible lines.
 *
 * dst_edge/stride_edge  - pixels from the frame buffer (block edge)
 * dst_inner/stride_inner- pixels from the per-thread working buffer
 * col,x / row,y         - block and sub-block position, used for edge
 *                         availability tests
 * tx                    - transform size; edge length is 4 << tx
 * p                     - plane index (0 = luma), ss_h/ss_v chroma subsampling
 */
2472 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2473 uint8_t *dst_edge, ptrdiff_t stride_edge,
2474 uint8_t *dst_inner, ptrdiff_t stride_inner,
2475 uint8_t *l, int col, int x, int w,
2476 int row, int y, enum TxfmMode tx,
2477 int p, int ss_h, int ss_v, int bytesperpixel)
2479 int have_top = row > 0 || y > 0;
2480 int have_left = col > s->tile_col_start || x > 0;
2481 int have_right = x < w - 1;
// Map each mode to its replacement when top and/or left edges are missing
// (indexed [mode][have_left][have_top]).
2483 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2484 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2485 { DC_127_PRED, VERT_PRED } },
2486 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2487 { HOR_PRED, HOR_PRED } },
2488 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2489 { LEFT_DC_PRED, DC_PRED } },
2490 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2491 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2492 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2493 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2494 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2495 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2496 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2497 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2498 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2499 { DC_127_PRED, VERT_LEFT_PRED } },
2500 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2501 { HOR_UP_PRED, HOR_UP_PRED } },
2502 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2503 { HOR_PRED, TM_VP8_PRED } },
// Per-mode edge requirements (after substitution above), driving which
// edge arrays get filled below.
2505 static const struct {
2506 uint8_t needs_left:1;
2507 uint8_t needs_top:1;
2508 uint8_t needs_topleft:1;
2509 uint8_t needs_topright:1;
2510 uint8_t invert_left:1;
2511 } edges[N_INTRA_PRED_MODES] = {
2512 [VERT_PRED] = { .needs_top = 1 },
2513 [HOR_PRED] = { .needs_left = 1 },
2514 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2515 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2516 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2517 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2518 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2519 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2520 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2521 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2522 [LEFT_DC_PRED] = { .needs_left = 1 },
2523 [TOP_DC_PRED] = { .needs_top = 1 },
2524 [DC_128_PRED] = { 0 },
2525 [DC_127_PRED] = { 0 },
2526 [DC_129_PRED] = { 0 }
// Substitute the mode according to edge availability, then build edges.
2529 av_assert2(mode >= 0 && mode < 10);
2530 mode = mode_conv[mode][have_left][have_top];
2531 if (edges[mode].needs_top) {
2532 uint8_t *top, *topleft;
2533 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2534 int n_px_need_tr = 0;
2536 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2539 // if top of sb64-row, use s->intra_pred_data[] instead of
2540 // dst[-stride] for intra prediction (it contains pre- instead of
2541 // post-loopfilter data)
2543 top = !(row & 7) && !y ?
2544 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2545 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2547 topleft = !(row & 7) && !y ?
2548 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2549 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2550 &dst_inner[-stride_inner];
// Fast path (condition partially omitted): when the whole required edge is
// contiguous and available, *a can reference it in place.
2554 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2555 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2556 n_px_need + n_px_need_tr <= n_px_have) {
// Otherwise copy what exists and extend the last available pixel.
2560 if (n_px_need <= n_px_have) {
2561 memcpy(*a, top, n_px_need * bytesperpixel);
// Bpp-generic helpers: operate on bytes at 8 bpp, on aligned 16-bit
// values otherwise.
2563 #define memset_bpp(c, i1, v, i2, num) do { \
2564 if (bytesperpixel == 1) { \
2565 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2567 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2568 for (n = 0; n < (num); n++) { \
2569 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2573 memcpy(*a, top, n_px_have * bytesperpixel);
2574 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
// No top edge at all: fill with the spec's 127 constant (bpp-scaled).
2577 #define memset_val(c, val, num) do { \
2578 if (bytesperpixel == 1) { \
2579 memset((c), (val), (num)); \
2582 for (n = 0; n < (num); n++) { \
2583 AV_WN16A(&(c)[n * 2], (val)); \
2587 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
// Top-left corner: real pixel if both edges exist, else 129/127 constant.
2589 if (edges[mode].needs_topleft) {
2590 if (have_left && have_top) {
2591 #define assign_bpp(c, i1, v, i2) do { \
2592 if (bytesperpixel == 1) { \
2593 (c)[(i1)] = (v)[(i2)]; \
2595 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2598 assign_bpp(*a, -1, topleft, -1);
2600 #define assign_val(c, i, v) do { \
2601 if (bytesperpixel == 1) { \
2604 AV_WN16A(&(c)[(i) * 2], (v)); \
2607 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
// Top-right 4 pixels (4x4 only): copy when available, else replicate the
// last top-edge pixel.
2610 if (tx == TX_4X4 && edges[mode].needs_topright) {
2611 if (have_top && have_right &&
2612 n_px_need + n_px_need_tr <= n_px_have) {
2613 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2615 memset_bpp(*a, 4, *a, 3, 4);
// Left edge: gathered column-wise from dst[-1] (stride walk); stored
// bottom-up unless invert_left (HOR_UP), with replication for missing rows.
2620 if (edges[mode].needs_left) {
2622 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2623 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2624 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2626 if (edges[mode].invert_left) {
2627 if (n_px_need <= n_px_have) {
2628 for (i = 0; i < n_px_need; i++)
2629 assign_bpp(l, i, &dst[i * stride], -1);
2631 for (i = 0; i < n_px_have; i++)
2632 assign_bpp(l, i, &dst[i * stride], -1);
2633 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2636 if (n_px_need <= n_px_have) {
2637 for (i = 0; i < n_px_need; i++)
2638 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2640 for (i = 0; i < n_px_have; i++)
2641 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2642 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2646 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Reconstruct the current intra block: for every transform unit, build the
 * prediction edges (check_intra_mode), run the intra predictor, and add
 * the inverse-transformed residual — luma first, then both chroma planes.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers
 * and braces/else-lines are omitted.
 *
 * y_off/uv_off  - byte offsets of this block inside the frame planes
 * bytesperpixel - 1 (8 bpp) or 2 (high bitdepth); callers below pass a
 *                 compile-time constant
 */
2653 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2654 ptrdiff_t uv_off, int bytesperpixel)
2656 VP9Context *s = ctx->priv_data;
2658 int row = s->row, col = s->col;
2659 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2660 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2661 int end_x = FFMIN(2 * (s->cols - col), w4);
2662 int end_y = FFMIN(2 * (s->rows - row), h4);
2663 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2664 int uvstep1d = 1 << b->uvtx, p;
// dst walks the per-thread working buffer, dst_r the actual frame buffer;
// check_intra_mode needs both (pre-loopfilter vs. current pixels).
2665 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2666 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2667 LOCAL_ALIGNED_32(uint8_t, l, [64]);
// Luma: per-4x4 sub-modes only apply when bs > 8x8 with 4x4 transforms.
2669 for (n = 0, y = 0; y < end_y; y += step1d) {
2670 uint8_t *ptr = dst, *ptr_r = dst_r;
2671 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2672 ptr_r += 4 * step1d * bytesperpixel, n += step) {
2673 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2675 uint8_t *a = &a_buf[32];
2676 enum TxfmType txtp = vp9_intra_txfm_type[mode];
// eob spans two bytes for >8x8 transforms (see AV_WN16A in decode_coeffs).
2677 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2679 mode = check_intra_mode(s, mode, &a, ptr_r,
2680 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2681 ptr, s->y_stride, l,
2682 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2683 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2685 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2686 s->block + 16 * n * bytesperpixel, eob);
2688 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2689 dst += 4 * step1d * s->y_stride;
// Chroma: single uvmode for the whole block, DCT_DCT only.
2696 step = 1 << (b->uvtx * 2);
2697 for (p = 0; p < 2; p++) {
2698 dst = s->dst[1 + p];
2699 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2700 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2701 uint8_t *ptr = dst, *ptr_r = dst_r;
2702 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2703 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2704 int mode = b->uvmode;
2705 uint8_t *a = &a_buf[32];
2706 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2708 mode = check_intra_mode(s, mode, &a, ptr_r,
2709 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2710 ptr, s->uv_stride, l, col, x, w4, row, y,
2711 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2712 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2714 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2715 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2717 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2718 dst += 4 * uvstep1d * s->uv_stride;
/* 8 bpp instantiation of intra_recon (bytesperpixel folded to 1).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
2723 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2725 intra_recon(ctx, y_off, uv_off, 1);
/* High-bitdepth instantiation of intra_recon (bytesperpixel folded to 2).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
2728 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2730 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation for same-resolution references: waits for the
 * reference row to be decoded (frame threading), falls back to
 * emulated_edge_mc when the 8-tap filter would read outside the reference
 * frame, then runs the subpel MC function selected by the mv's fractional
 * bits.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces and the mv-to-x/y adjustment lines are omitted. The 160 constant
 * matches the edge_emu_buffer line stride used in the emulated path.
 *
 * mc     - [has_frac_x][has_frac_y] subpel function table
 * y, x   - block position in the reference, in pixels
 * bw, bh - block width/height; w, h - reference frame dimensions
 */
2733 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2734 uint8_t *dst, ptrdiff_t dst_stride,
2735 const uint8_t *ref, ptrdiff_t ref_stride,
2736 ThreadFrame *ref_frame,
2737 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2738 int bw, int bh, int w, int h, int bytesperpixel)
2740 int mx = mv->x, my = mv->y, th;
2744 ref += y * ref_stride + x * bytesperpixel;
2747 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2748 // we use +7 because the last 7 pixels of each sbrow can be changed in
2749 // the longest loopfilter of the next sbrow
2750 th = (y + bh + 4 * !!my + 7) >> 6;
2751 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2752 // The arm/aarch64 _hv filters read one more row than what actually is
2753 // needed, so switch to emulated edge one pixel sooner vertically
2754 // (!!my * 5) than horizontally (!!mx * 4).
2755 if (x < !!mx * 3 || y < !!my * 3 ||
2756 x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
2757 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2758 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2760 bw + !!mx * 7, bh + !!my * 7,
2761 x - !!mx * 3, y - !!my * 3, w, h);
2762 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
// mx/my are passed doubled here — presumably because the unscaled luma MC
// functions take 1/16-pel positions while mv is 1/8-pel; confirm against
// the vp9dsp mc function contract.
2765 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma motion compensation for same-resolution references: same
 * structure as mc_luma_unscaled, but handles both chroma planes (separate
 * source strides — they can come from different buffers) and scales the
 * luma mv by the subsampling factors ss_h/ss_v.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces, the mv-to-x/y adjustment lines and the else-line pairing the two
 * emulated-edge branches are omitted.
 */
2768 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2769 uint8_t *dst_u, uint8_t *dst_v,
2770 ptrdiff_t dst_stride,
2771 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2772 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2773 ThreadFrame *ref_frame,
2774 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2775 int bw, int bh, int w, int h, int bytesperpixel)
// mv is in luma units; double it per axis when that axis is not subsampled.
2777 int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2781 ref_u += y * src_stride_u + x * bytesperpixel;
2782 ref_v += y * src_stride_v + x * bytesperpixel;
2785 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2786 // we use +7 because the last 7 pixels of each sbrow can be changed in
2787 // the longest loopfilter of the next sbrow
// Progress is tracked in luma rows, hence the subsampling-aware shift.
2788 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2789 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2790 // The arm/aarch64 _hv filters read one more row than what actually is
2791 // needed, so switch to emulated edge one pixel sooner vertically
2792 // (!!my * 5) than horizontally (!!mx * 4).
2793 if (x < !!mx * 3 || y < !!my * 3 ||
2794 x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
// Emulated-edge path, once per plane; 160 is the edge_emu_buffer stride.
2795 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2796 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2798 bw + !!mx * 7, bh + !!my * 7,
2799 x - !!mx * 3, y - !!my * 3, w, h);
2800 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2801 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2803 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2804 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2806 bw + !!mx * 7, bh + !!my * 7,
2807 x - !!mx * 3, y - !!my * 3, w, h);
2808 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2809 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
// Fast path (its else-line is omitted): filter directly from the frames.
2811 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2812 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
// Template instantiation for UNSCALED inter prediction: mc_luma_dir /
// mc_chroma_dir adapt vp9_mc_template.c's generic call sites to the
// unscaled MC helpers above (the px/py/pw/ph prediction-area arguments are
// accepted but unused here; "i" selects the reference for the scaled
// variants only). NOTE(review): line numbers are embedded in each line.
2816 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2817 px, py, pw, ph, bw, bh, w, h, i) \
2818 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2819 mv, bw, bh, w, h, bytesperpixel)
2820 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2821 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2822 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2823 row, col, mv, bw, bh, w, h, bytesperpixel)
// Generate inter_pred_8bpp() and inter_pred_16bpp() from the template.
2825 #define FN(x) x##_8bpp
2826 #define BYTES_PER_PIXEL 1
2827 #include "vp9_mc_template.c"
2829 #undef BYTES_PER_PIXEL
2830 #define FN(x) x##_16bpp
2831 #define BYTES_PER_PIXEL 2
2832 #include "vp9_mc_template.c"
2834 #undef mc_chroma_dir
2836 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation against a reference of a DIFFERENT resolution:
 * clips the mv to the prediction area, scales position+mv through the
 * 14-bit fixed-point "scale" factors, and runs the scaled-MC function with
 * per-axis 4-bit-fraction "step" increments. Falls back to the unscaled
 * path when the reference matches the current frame size.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces and several declarations (mv, mx/my, th) are omitted.
 *
 * px, py, pw, ph - position/size of the whole prediction area, used for
 *                  the mv clipping bounds
 * scale          - per-axis scaling factors (Q14)
 * step           - per-axis source increments (Q4)
 */
2839 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2840 vp9_mc_func (*mc)[2],
2841 uint8_t *dst, ptrdiff_t dst_stride,
2842 const uint8_t *ref, ptrdiff_t ref_stride,
2843 ThreadFrame *ref_frame,
2844 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2845 int px, int py, int pw, int ph,
2846 int bw, int bh, int w, int h, int bytesperpixel,
2847 const uint16_t *scale, const uint8_t *step)
2849 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2850 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2851 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2852 y, x, in_mv, bw, bh, w, h, bytesperpixel);
// Q14 fixed-point scale; 64-bit intermediate avoids overflow.
2854 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2856 int refbw_m1, refbh_m1;
// Clip the mv so the scaled access stays within the prediction area
// (bounds in 1/8-pel units).
2860 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2861 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2862 // BUG libvpx seems to scale the two components separately. This introduces
2863 // rounding errors but we have to reproduce them to be exactly compatible
2864 // with the output from libvpx...
2865 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2866 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2870 ref += y * ref_stride + x * bytesperpixel;
// Last reference row/col touched by the scaled filter run.
2873 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2874 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2875 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2876 // we use +7 because the last 7 pixels of each sbrow can be changed in
2877 // the longest loopfilter of the next sbrow
2878 th = (y + refbh_m1 + 4 + 7) >> 6;
2879 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2880 // The arm/aarch64 _hv filters read one more row than what actually is
2881 // needed, so switch to emulated edge one pixel sooner vertically
2882 // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2883 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
2884 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2885 ref - 3 * ref_stride - 3 * bytesperpixel,
2887 refbw_m1 + 8, refbh_m1 + 8,
2888 x - 3, y - 3, w, h);
// 288 is the (wider) edge_emu_buffer stride used on the scaled path.
2889 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2892 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma motion compensation against a reference of a DIFFERENT
 * resolution; structure mirrors mc_luma_scaled, with per-axis handling of
 * chroma subsampling and a libvpx-compatibility quirk (webm issue 820) in
 * the subsampled mv scaling.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces, the ss_h/ss_v selector lines for the issue-820 branches, and the
 * mv/mx/my/th declarations are omitted.
 */
2896 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2897 vp9_mc_func (*mc)[2],
2898 uint8_t *dst_u, uint8_t *dst_v,
2899 ptrdiff_t dst_stride,
2900 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2901 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2902 ThreadFrame *ref_frame,
2903 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2904 int px, int py, int pw, int ph,
2905 int bw, int bh, int w, int h, int bytesperpixel,
2906 const uint16_t *scale, const uint8_t *step)
2908 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2909 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2910 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2911 ref_v, src_stride_v, ref_frame,
2912 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2915 int refbw_m1, refbh_m1;
// Per-axis: the first branch (issue 820 compatibility) applies on the
// subsampled axis — its ss_h/ss_v selector line is omitted here.
2920 // BUG https://code.google.com/p/webm/issues/detail?id=820
2921 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2922 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2924 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2925 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2928 // BUG https://code.google.com/p/webm/issues/detail?id=820
2929 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2930 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2932 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2933 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2938 ref_u += y * src_stride_u + x * bytesperpixel;
2939 ref_v += y * src_stride_v + x * bytesperpixel;
// Last reference row/col touched by the scaled filter run.
2942 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2943 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2944 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2945 // we use +7 because the last 7 pixels of each sbrow can be changed in
2946 // the longest loopfilter of the next sbrow
2947 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2948 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2949 // The arm/aarch64 _hv filters read one more row than what actually is
2950 // needed, so switch to emulated edge one pixel sooner vertically
2951 // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2952 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
// Emulated-edge fallback, one pass per plane; 288 is the buffer stride.
2953 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2954 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2956 refbw_m1 + 8, refbh_m1 + 8,
2957 x - 3, y - 3, w, h);
2958 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2959 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2961 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2962 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2964 refbw_m1 + 8, refbh_m1 + 8,
2965 x - 3, y - 3, w, h);
2966 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2967 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
// Fast path (its else-line is omitted): filter directly from the frames.
2969 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2970 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
// Template instantiation for SCALED inter prediction: unlike the unscaled
// macros, these forward the full prediction-area geometry (px/py/pw/ph)
// and the per-reference scale/step tables, indexed by reference slot "i".
// NOTE(review): line numbers are embedded in each line.
2975 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2976 px, py, pw, ph, bw, bh, w, h, i) \
2977 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2978 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2979 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2980 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2981 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2982 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2983 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2984 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
// Generate inter_pred_scaled_8bpp() and inter_pred_scaled_16bpp().
2986 #define FN(x) x##_scaled_8bpp
2987 #define BYTES_PER_PIXEL 1
2988 #include "vp9_mc_template.c"
2990 #undef BYTES_PER_PIXEL
2991 #define FN(x) x##_scaled_16bpp
2992 #define BYTES_PER_PIXEL 2
2993 #include "vp9_mc_template.c"
2995 #undef mc_chroma_dir
2997 #undef BYTES_PER_PIXEL
/*
 * Reconstruct the current inter block: run the (scaled or unscaled,
 * template-generated) inter prediction for this bpp, then add the
 * inverse-transformed residuals per transform unit — luma first, then
 * both chroma planes. Only DCT_DCT is used for inter residuals.
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces, else-lines and the skip/ptr declarations are omitted.
 */
3000 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3002 VP9Context *s = ctx->priv_data;
3004 int row = s->row, col = s->col;
// A nonzero mvscale[ref][0] marks a reference with a different resolution,
// requiring the scaled prediction path.
3006 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3007 if (bytesperpixel == 1) {
3008 inter_pred_scaled_8bpp(ctx);
3010 inter_pred_scaled_16bpp(ctx);
3013 if (bytesperpixel == 1) {
3014 inter_pred_8bpp(ctx);
3016 inter_pred_16bpp(ctx);
3020 /* mostly copied intra_recon() */
3022 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3023 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3024 int end_x = FFMIN(2 * (s->cols - col), w4);
3025 int end_y = FFMIN(2 * (s->rows - row), h4);
3026 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3027 int uvstep1d = 1 << b->uvtx, p;
3028 uint8_t *dst = s->dst[0];
// Luma residual add (eob spans 2 bytes for >8x8 transforms).
3031 for (n = 0, y = 0; y < end_y; y += step1d) {
3033 for (x = 0; x < end_x; x += step1d,
3034 ptr += 4 * step1d * bytesperpixel, n += step) {
3035 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3038 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3039 s->block + 16 * n * bytesperpixel, eob);
3041 dst += 4 * s->y_stride * step1d;
// Chroma residual add, both planes.
3047 step = 1 << (b->uvtx * 2);
3048 for (p = 0; p < 2; p++) {
3049 dst = s->dst[p + 1];
3050 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3052 for (x = 0; x < end_x; x += uvstep1d,
3053 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3054 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3057 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3058 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3060 dst += 4 * uvstep1d * s->uv_stride;
/* 8 bpp instantiation of inter_recon (bytesperpixel folded to 1).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
3066 static void inter_recon_8bpp(AVCodecContext *ctx)
3068 inter_recon(ctx, 1);
/* High-bitdepth instantiation of inter_recon (bytesperpixel folded to 2).
 * NOTE(review): excerpt omits the brace lines; line numbers are embedded. */
3071 static void inter_recon_16bpp(AVCodecContext *ctx)
3073 inter_recon(ctx, 2);
/*
 * Accumulate loopfilter edge masks for one block into the per-superblock
 * VP9Filter.mask structure (see the struct at the top of the file):
 * mask[col/row edges][row within sb64][filter width: 16/8/4/inner4],
 * one bit per 8px column. Called separately for luma (ss_h = ss_v = 0)
 * and chroma (subsampling flags set).
 *
 * NOTE(review): lossy extraction — lines carry original file line numbers;
 * braces, else-lines and several adjustment statements (e.g. the
 * TX_4X4+subsampling rounding near the top, step1d assignment) are
 * omitted. Comments stick to visible lines.
 *
 * row_and_7/col_and_7 - block position within the 64x64 superblock, in
 *                       8px units
 * w, h                - block size in 8px units
 * col_end/row_end     - used for odd-edge clamping at the frame border
 * skip_inter          - skip-with-no-coefficients inter block: only the
 *                       block's outer edges are filtered
 */
3076 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3077 int row_and_7, int col_and_7,
3078 int w, int h, int col_end, int row_end,
3079 enum TxfmMode tx, int skip_inter)
// Bit patterns selecting which 8px columns/rows lie on 32px boundaries
// (where the wider 8px filter applies), per subsampling mode.
3081 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3082 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3084 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3085 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3086 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3087 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3089 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3090 // edges. This means that for UV, we work on two subsampled blocks at
3091 // a time, and we only use the topleft block's mode information to set
3092 // things like block strength. Thus, for any block size smaller than
3093 // 16x16, ignore the odd portion of the block.
3094 if (tx == TX_4X4 && (ss_v | ss_h)) {
// 4x4 transforms with coded coefficients: every 4px edge is filtered;
// m_col marks this block's columns, split into 8px-wide (32px boundary)
// and 4px-wide filter sets.
3109 if (tx == TX_4X4 && !skip_inter) {
3110 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3111 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3112 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3114 for (y = row_and_7; y < h + row_and_7; y++) {
3115 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3117 mask[0][y][1] |= m_row_8;
3118 mask[0][y][2] |= m_row_4;
3119 // for odd lines, if the odd col is not being filtered,
3120 // skip odd row also:
3127 // if a/c are even row/col and b/d are odd, and d is skipped,
3128 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3129 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3130 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3132 mask[1][y][col_mask_id] |= m_col;
3135 mask[0][y][3] |= m_col;
3137 if (ss_h && (col_end & 1))
3138 mask[1][y][3] |= (t << (w - 1)) - t;
3140 mask[1][y][3] |= m_col;
// Larger transforms (and the skip_inter case): only transform-unit edges
// are filtered, at a width determined by the transform size.
3144 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3147 int mask_id = (tx == TX_8X8);
3148 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3149 int l2 = tx + ss_h - 1, step1d;
3150 int m_row = m_col & masks[l2];
3152 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3153 // 8wd loopfilter to prevent going off the visible edge.
3154 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3155 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3156 int m_row_8 = m_row - m_row_16;
3158 for (y = row_and_7; y < h + row_and_7; y++) {
3159 mask[0][y][0] |= m_row_16;
3160 mask[0][y][1] |= m_row_8;
3163 for (y = row_and_7; y < h + row_and_7; y++)
3164 mask[0][y][mask_id] |= m_row;
// Row (horizontal) edges, same odd-edge forcing for subsampled vertical.
3169 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3170 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3171 mask[1][y][0] |= m_col;
3172 if (y - row_and_7 == h - 1)
3173 mask[1][y][1] |= m_col;
3175 for (y = row_and_7; y < h + row_and_7; y += step1d)
3176 mask[1][y][mask_id] |= m_col;
// skip_inter blocks with tx != 4x4: only the block's own top/left edges.
3178 } else if (tx != TX_4X4) {
3181 mask_id = (tx == TX_8X8) || (h == ss_v);
3182 mask[1][row_and_7][mask_id] |= m_col;
3183 mask_id = (tx == TX_8X8) || (w == ss_h);
3184 for (y = row_and_7; y < h + row_and_7; y++)
3185 mask[0][y][mask_id] |= t;
// skip_inter 4x4 blocks: outer edges only, width by 32px-boundary test.
3187 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3189 for (y = row_and_7; y < h + row_and_7; y++) {
3190 mask[0][y][2] |= t4;
3191 mask[0][y][1] |= t8;
3193 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
// Decode one coding block: set MV clamping range, parse residual
// coefficients, reconstruct pixels (intra or inter, 8/16 bpp), handle
// edge emulation for blocks overhanging the frame, and record loopfilter
// levels + edge masks for the later filtering pass.
// NOTE(review): this excerpt is missing interleaved source lines, so some
// branches/braces below appear truncated — do not treat it as compilable.
3198 static void decode_b(AVCodecContext *ctx, int row, int col,
3199 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3200 enum BlockLevel bl, enum BlockPartition bp)
3202 VP9Context *s = ctx->priv_data;
3204 enum BlockSize bs = bl * 3 + bp;
3205 int bytesperpixel = s->bytesperpixel;
3206 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3208 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
// Clamp the MV search range to 128px (in 1/8-pel: *64) beyond frame edges.
3214 s->min_mv.x = -(128 + col * 64);
3215 s->min_mv.y = -(128 + row * 64);
3216 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3217 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// Chroma tx size is one step smaller when subsampling halves the block.
3223 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3224 (s->ss_v && h4 * 2 == (1 << b->tx)));
3229 if (bytesperpixel == 1) {
3230 has_coeffs = decode_coeffs_8bpp(ctx);
3232 has_coeffs = decode_coeffs_16bpp(ctx);
// No coefficients on a small inter block: mark it skipped in the contexts.
3234 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3236 memset(&s->above_skip_ctx[col], 1, w4);
3237 memset(&s->left_skip_ctx[s->row7], 1, h4);
// Helpers to zero the above/left nnz contexts in 1/2/4/8/16-byte chunks
// (y plane at double width; uv either subsampled or full width).
3242 #define SPLAT_ZERO_CTX(v, n) \
3244 case 1: v = 0; break; \
3245 case 2: AV_ZERO16(&v); break; \
3246 case 4: AV_ZERO32(&v); break; \
3247 case 8: AV_ZERO64(&v); break; \
3248 case 16: AV_ZERO128(&v); break; \
3250 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3252 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3253 if (s->ss_##dir2) { \
3254 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3255 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3257 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3258 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3263 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3264 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3265 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3266 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3269 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3270 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3271 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3272 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// Advance coefficient/eob cursors past this block.
3278 s->block += w4 * h4 * 64 * bytesperpixel;
3279 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3280 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3281 s->eob += 4 * w4 * h4;
3282 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3283 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3289 // emulated overhangs if the stride of the target buffer can't hold. This
3290 // makes it possible to support emu-edge and so on even if we have large block
// emu[plane] set => reconstruct into the temp buffer, copy back afterwards.
3292 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3293 (row + h4) > s->rows;
3294 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3295 (row + h4) > s->rows;
3297 s->dst[0] = s->tmp_y;
3300 s->dst[0] = f->data[0] + yoff;
3301 s->y_stride = f->linesize[0];
3304 s->dst[1] = s->tmp_uv[0];
3305 s->dst[2] = s->tmp_uv[1];
3308 s->dst[1] = f->data[1] + uvoff;
3309 s->dst[2] = f->data[2] + uvoff;
3310 s->uv_stride = f->linesize[1];
3314 intra_recon_16bpp(ctx, yoff, uvoff);
3316 intra_recon_8bpp(ctx, yoff, uvoff);
3320 inter_recon_16bpp(ctx);
3322 inter_recon_8bpp(ctx);
// Copy emulated luma back into the frame, in power-of-two-wide strips via
// the (unfiltered, unscaled) mc copy functions.
3326 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3328 for (n = 0; o < w; n++) {
3333 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3334 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// Same copy-back for the two chroma planes.
3340 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3341 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3343 for (n = s->ss_h; o < w; n++) {
3348 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3349 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3350 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3351 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3357 // pick filter level and find edges to apply filter to
3358 if (s->s.h.filter.level &&
3359 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3360 [b->mode[3] != ZEROMV]) > 0) {
3361 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3362 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3364 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3365 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3366 if (s->ss_h || s->ss_v)
3367 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3368 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3369 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3370 b->uvtx, skip_inter);
// Lazily fill the per-level limit LUTs used by the loopfilter kernels.
3372 if (!s->filter_lut.lim_lut[lvl]) {
3373 int sharp = s->s.h.filter.sharpness;
3377 limit >>= (sharp + 3) >> 2;
3378 limit = FFMIN(limit, 9 - sharp);
3380 limit = FFMAX(limit, 1);
3382 s->filter_lut.lim_lut[lvl] = limit;
3383 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// Advance coefficient/eob cursors (non-skip path).
3389 s->block += w4 * h4 * 64 * bytesperpixel;
3390 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3391 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3392 s->eob += 4 * w4 * h4;
3393 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3394 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
// Recursively parse the partition tree for one superblock subtree (from
// 64x64 downwards), decoding each leaf with decode_b(). The partition
// probability set comes from the keyframe defaults or the adapted
// per-frame probabilities; near the right/bottom frame edge some
// partition choices are implied, so only single rac branches are read.
// NOTE(review): excerpt is missing interleaved lines (switch header,
// some braces), so branches below appear truncated.
3398 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3399 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3401 VP9Context *s = ctx->priv_data;
// Partition context from the above/left contexts at this block level.
3402 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3403 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3404 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3405 s->prob.p.partition[bl][c];
3406 enum BlockPartition bp;
3407 ptrdiff_t hbs = 4 >> bl;
3408 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3409 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3410 int bytesperpixel = s->bytesperpixel;
3413 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3414 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3415 } else if (col + hbs < s->cols) { // FIXME why not <=?
3416 if (row + hbs < s->rows) { // FIXME why not <=?
// Fully inside the frame: read the complete partition tree.
3417 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3419 case PARTITION_NONE:
3420 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Horizontal split: top half, then bottom half hbs*8 pixels down.
3423 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3424 yoff += hbs * 8 * y_stride;
3425 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3426 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// Vertical split: left half, then right half hbs*8 pixels across.
3429 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3430 yoff += hbs * 8 * bytesperpixel;
3431 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3432 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3434 case PARTITION_SPLIT:
// Recurse into the four quadrants at the next block level.
3435 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3436 decode_sb(ctx, row, col + hbs, lflvl,
3437 yoff + 8 * hbs * bytesperpixel,
3438 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3439 yoff += hbs * 8 * y_stride;
3440 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3441 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3442 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3443 yoff + 8 * hbs * bytesperpixel,
3444 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only split vs horizontal are possible, one branch read.
3449 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3450 bp = PARTITION_SPLIT;
3451 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3452 decode_sb(ctx, row, col + hbs, lflvl,
3453 yoff + 8 * hbs * bytesperpixel,
3454 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3457 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Right edge: only split vs vertical are possible.
3459 } else if (row + hbs < s->rows) { // FIXME why not <=?
3460 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3461 bp = PARTITION_SPLIT;
3462 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3463 yoff += hbs * 8 * y_stride;
3464 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3465 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3468 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Bottom-right corner: split is implied, nothing to read.
3471 bp = PARTITION_SPLIT;
3472 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// Count the choice for backward probability adaptation.
3474 s->counts.partition[bl][c][bp]++;
// Second-pass twin of decode_sb(): replays partition decisions stored in
// the per-block array (s->b / b->bl / b->bp) instead of re-reading the
// bitstream — presumably used by the 2-pass frame-threading mode; confirm
// against the pass handling in vp9_decode_frame().
// NOTE(review): excerpt is missing interleaved lines (some braces/else),
// so branches below appear truncated.
3477 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3478 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3480 VP9Context *s = ctx->priv_data;
3482 ptrdiff_t hbs = 4 >> bl;
3483 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3484 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3485 int bytesperpixel = s->bytesperpixel;
// Smallest level: the stored block must be an 8x8 leaf.
3488 av_assert2(b->bl == BL_8X8);
3489 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3490 } else if (s->b->bl == bl) {
// Leaf at this level; H/V partitions decode a second half if in-frame.
3491 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3492 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3493 yoff += hbs * 8 * y_stride;
3494 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3495 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3496 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3497 yoff += hbs * 8 * bytesperpixel;
3498 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3499 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// Otherwise recurse into the quadrants that lie inside the frame.
3502 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3503 if (col + hbs < s->cols) { // FIXME why not <=?
3504 if (row + hbs < s->rows) {
3505 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3506 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3507 yoff += hbs * 8 * y_stride;
3508 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3509 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3510 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3511 yoff + 8 * hbs * bytesperpixel,
3512 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3514 yoff += hbs * 8 * bytesperpixel;
3515 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3516 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3518 } else if (row + hbs < s->rows) {
3519 yoff += hbs * 8 * y_stride;
3520 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3521 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Apply the loopfilter to vertical edges (between columns) of one plane
// of a 64x64 superblock. mask[y][i] holds one bit per 8px column, where
// i selects the filter width: 0=16-wide, 1=8-wide, 2=4-wide, 3=inner-4px
// edges (see the VP9Filter.mask layout in the header).
// NOTE(review): excerpt is missing interleaved lines, so some else
// branches/braces appear truncated.
3526 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3527 uint8_t *lvl, uint8_t (*mask)[4],
3528 uint8_t *dst, ptrdiff_t ls)
3530 int y, x, bytesperpixel = s->bytesperpixel;
3532 // filter edges between columns (e.g. block1 | block2)
// Process two 8px rows (one subsampled pair for chroma) per iteration.
3533 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3534 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3535 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3536 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3537 unsigned hm = hm1 | hm2 | hm13 | hm23;
// Walk column bits left to right until no masked edge remains.
3539 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
// L = packed filter level for this position, H = high (strength) nibble;
// E/I come from the precomputed mblim/lim LUTs.
3542 int L = *l, H = L >> 4;
3543 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3545 if (hmask1[0] & x) {
3546 if (hmask2[0] & x) {
// Both rows use the 16-wide filter with the same level: do both at once.
3547 av_assert2(l[8 << ss_v] == L);
3548 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3550 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3552 } else if (hm2 & x) {
// Different levels per row: pack both into E/I and use the mix2 kernel.
3555 E |= s->filter_lut.mblim_lut[L] << 8;
3556 I |= s->filter_lut.lim_lut[L] << 8;
3557 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3559 [0](ptr, ls, E, I, H);
3561 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3562 [0](ptr, ls, E, I, H);
3564 } else if (hm2 & x) {
// Only the second row has an edge here.
3565 int L = l[8 << ss_v], H = L >> 4;
3566 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3568 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3569 [0](ptr + 8 * ls, ls, E, I, H);
// Inner 4px edges (mask index 3), offset half a block to the right.
3577 int L = *l, H = L >> 4;
3578 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3583 E |= s->filter_lut.mblim_lut[L] << 8;
3584 I |= s->filter_lut.lim_lut[L] << 8;
3585 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3587 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3589 } else if (hm23 & x) {
3590 int L = l[8 << ss_v], H = L >> 4;
3591 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3593 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
// Apply the loopfilter to horizontal edges (between rows) of one plane of
// a 64x64 superblock; mirror of filter_plane_cols() with the row/col mask
// roles swapped (vmask[i]: 0=16-wide, 1=8-wide, 2=4-wide, 3=inner edges).
// NOTE(review): excerpt is missing interleaved lines, so some else
// branches/braces appear truncated.
3601 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3602 uint8_t *lvl, uint8_t (*mask)[4],
3603 uint8_t *dst, ptrdiff_t ls)
3605 int y, x, bytesperpixel = s->bytesperpixel;
3608 // filter edges between rows (e.g. ------)
3610 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3611 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3612 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// Each step covers two adjacent 8px columns (bits x and x << (1+ss_h)).
3614 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3617 int L = *l, H = L >> 4;
3618 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3621 if (vmask[0] & (x << (1 + ss_h))) {
// Both columns want the 16-wide filter at the same level: one wide call.
3622 av_assert2(l[1 + ss_h] == L);
3623 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3625 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3627 } else if (vm & (x << (1 + ss_h))) {
// Neighbouring column filtered too: pack both levels, use mix2 kernel.
3630 E |= s->filter_lut.mblim_lut[L] << 8;
3631 I |= s->filter_lut.lim_lut[L] << 8;
3632 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3633 [!!(vmask[1] & (x << (1 + ss_h)))]
3634 [1](ptr, ls, E, I, H);
3636 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3637 [1](ptr, ls, E, I, H);
3639 } else if (vm & (x << (1 + ss_h))) {
// Only the right-hand column has a masked edge.
3640 int L = l[1 + ss_h], H = L >> 4;
3641 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3643 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3644 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// Inner 4px edges (vmask[3]), half a block further down.
3649 int L = *l, H = L >> 4;
3650 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3652 if (vm3 & (x << (1 + ss_h))) {
3655 E |= s->filter_lut.mblim_lut[L] << 8;
3656 I |= s->filter_lut.lim_lut[L] << 8;
3657 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3659 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3661 } else if (vm3 & (x << (1 + ss_h))) {
3662 int L = l[1 + ss_h], H = L >> 4;
3663 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3665 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
// Run the loopfilter over one 64x64 superblock: columns then rows for
// luma, then the same for both chroma planes using the subsampled masks.
3678 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3679 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3681 VP9Context *s = ctx->priv_data;
3682 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3683 uint8_t *dst = f->data[0] + yoff;
3684 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
// mask[1] holds the chroma masks when either direction is subsampled,
// otherwise chroma shares the luma masks (mask[0]).
3685 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3688 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3689 // if you think of them as acting on a 8x8 block max, we can interleave
3690 // each v/h within the single x loop, but that only works if we work on
3691 // 8 pixel blocks, and we won't always do that (we want at least 16px
3692 // to use SSE2 optimizations, perhaps 32 for AVX2)
3694 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3695 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3697 for (p = 0; p < 2; p++) {
3698 dst = f->data[1 + p] + uvoff;
3699 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3700 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/**
 * Compute the [start, end) range of tile number 'idx' when n superblocks
 * are divided into 2^log2_n tiles.
 *
 * The boundaries are first computed in superblock units, clamped to n,
 * and then converted to 8-pixel block units (<< 3) as used by the
 * row/col loops in the tile decode code.
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb0 = (idx * n) >> log2_n;
    int sb1 = ((idx + 1) * n) >> log2_n;

    if (sb0 > n)
        sb0 = n;
    if (sb1 > n)
        sb1 = n;
    *start = sb0 << 3;
    *end   = sb1 << 3;
}
// Adapt one binary probability *p towards the empirical distribution
// observed this frame (ct0/ct1 = branch counts), moving by at most
// update_factor/256 of the distance, scaled down for small sample counts.
3712 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3713 int max_count, int update_factor)
3715 unsigned ct = ct0 + ct1, p2, p1;
// NOTE(review): the ct == 0 early-out and the "p1 = *p" load are not
// visible in this excerpt (elided lines) — confirm against the full file;
// without them the division below and the use of p1 would be invalid.
3720 update_factor = FASTDIV(update_factor * FFMIN(ct, max_count), max_count);
// p2 = rounded empirical probability of branch 0, clipped to [1, 255].
3722 p2 = ((((int64_t) ct0) << 8) + (ct >> 1)) / ct;
3723 p2 = av_clip(p2, 1, 255);
3725 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3726 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
// Backward-adapt all probabilities of the current frame context from the
// symbol counts gathered while decoding this frame (VP9 "adapt coef/mode
// probs"). Coefficient probs use a reduced update factor after key/intra
// frames; everything else uses max_count=20, update_factor=128.
// NOTE(review): excerpt is missing interleaved lines (some braces and a
// few statements), so some bodies below appear truncated.
3729 static void adapt_probs(VP9Context *s)
3732 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3733 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
// Coefficient probabilities: [tx size][plane type][ref][band][coef ctx].
3736 for (i = 0; i < 4; i++)
3737 for (j = 0; j < 2; j++)
3738 for (k = 0; k < 2; k++)
3739 for (l = 0; l < 6; l++)
3740 for (m = 0; m < 6; m++) {
3741 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3742 unsigned *e = s->counts.eob[i][j][k][l][m];
3743 unsigned *c = s->counts.coef[i][j][k][l][m];
3745 if (l == 0 && m >= 3) // dc only has 3 pt
3748 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3749 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3750 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// Intra-only frames adapt nothing past the coefficients; just carry the
// current frame's skip/tx probabilities over.
3753 if (s->s.h.keyframe || s->s.h.intraonly) {
3754 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3755 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3756 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3757 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// Skip flag.
3762 for (i = 0; i < 3; i++)
3763 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// Intra/inter flag.
3766 for (i = 0; i < 4; i++)
3767 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// Compound prediction mode flag.
3770 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3771 for (i = 0; i < 5; i++)
3772 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// Compound reference selection.
3776 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3777 for (i = 0; i < 5; i++)
3778 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3779 s->counts.comp_ref[i][1], 20, 128);
// Single reference selection (two binary decisions).
3782 if (s->s.h.comppredmode != PRED_COMPREF) {
3783 for (i = 0; i < 5; i++) {
3784 uint8_t *pp = p->single_ref[i];
3785 unsigned (*c)[2] = s->counts.single_ref[i];
3787 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3788 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3792 // block partitioning
3793 for (i = 0; i < 4; i++)
3794 for (j = 0; j < 4; j++) {
3795 uint8_t *pp = p->partition[i][j];
3796 unsigned *c = s->counts.partition[i][j];
3798 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3799 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3800 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// Transform size trees (only when the frame signals a switchable mode).
3804 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3805 for (i = 0; i < 2; i++) {
3806 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3808 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3809 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3810 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3811 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3812 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3813 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3817 // interpolation filter
3818 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3819 for (i = 0; i < 4; i++) {
3820 uint8_t *pp = p->filter[i];
3821 unsigned *c = s->counts.filter[i];
3823 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3824 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// Inter prediction modes (ZEROMV/NEARESTMV/NEARMV/NEWMV tree).
3829 for (i = 0; i < 7; i++) {
3830 uint8_t *pp = p->mv_mode[i];
3831 unsigned *c = s->counts.mv_mode[i];
3833 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3834 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3835 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// MV joint (which of x/y components are non-zero).
3840 uint8_t *pp = p->mv_joint;
3841 unsigned *c = s->counts.mv_joint;
3843 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3844 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3845 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// Per-component MV probabilities (i = 0/1 for the two components).
3849 for (i = 0; i < 2; i++) {
3851 unsigned *c, (*c2)[2], sum;
3853 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3854 s->counts.mv_comp[i].sign[1], 20, 128);
// Magnitude class tree: each node is adapted against the running tail sum.
3856 pp = p->mv_comp[i].classes;
3857 c = s->counts.mv_comp[i].classes;
3858 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3859 adapt_prob(&pp[0], c[0], sum, 20, 128);
3861 adapt_prob(&pp[1], c[1], sum, 20, 128);
3863 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3864 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3866 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3867 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3869 adapt_prob(&pp[6], c[6], sum, 20, 128);
3870 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3871 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3872 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3874 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3875 s->counts.mv_comp[i].class0[1], 20, 128);
// Integer magnitude bits.
3876 pp = p->mv_comp[i].bits;
3877 c2 = s->counts.mv_comp[i].bits;
3878 for (j = 0; j < 10; j++)
3879 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// Fractional (1/4-pel) part, class0 and general.
3881 for (j = 0; j < 2; j++) {
3882 pp = p->mv_comp[i].class0_fp[j];
3883 c = s->counts.mv_comp[i].class0_fp[j];
3884 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3885 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3886 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3888 pp = p->mv_comp[i].fp;
3889 c = s->counts.mv_comp[i].fp;
3890 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3891 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3892 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// High-precision (1/8-pel) bit, only when the frame enables it.
3894 if (s->s.h.highprecisionmvs) {
3895 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3896 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3897 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3898 s->counts.mv_comp[i].hp[1], 20, 128);
// Luma intra mode tree (per block-size group).
3903 for (i = 0; i < 4; i++) {
3904 uint8_t *pp = p->y_mode[i];
3905 unsigned *c = s->counts.y_mode[i], sum, s2;
3907 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3908 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3909 sum -= c[TM_VP8_PRED];
3910 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3911 sum -= c[VERT_PRED];
3912 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3913 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3915 adapt_prob(&pp[3], s2, sum, 20, 128);
3917 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3918 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3919 sum -= c[DIAG_DOWN_LEFT_PRED];
3920 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3921 sum -= c[VERT_LEFT_PRED];
3922 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3923 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Chroma intra mode tree (per luma mode), same tree walk as above.
3927 for (i = 0; i < 10; i++) {
3928 uint8_t *pp = p->uv_mode[i];
3929 unsigned *c = s->counts.uv_mode[i], sum, s2;
3931 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3932 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3933 sum -= c[TM_VP8_PRED];
3934 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3935 sum -= c[VERT_PRED];
3936 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3937 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3939 adapt_prob(&pp[3], s2, sum, 20, 128);
3941 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3942 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3943 sum -= c[DIAG_DOWN_LEFT_PRED];
3944 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3945 sum -= c[VERT_LEFT_PRED];
3946 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3947 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Release the per-stream scratch allocations (intra-pred line buffer,
// per-block array, coefficient block buffers). av_freep() NULLs the
// pointers, so this is safe to call repeatedly.
3951 static void free_buffers(VP9Context *s)
3953 av_freep(&s->intra_pred_data[0]);
3954 av_freep(&s->b_base);
3955 av_freep(&s->block_base);
// Codec close: unref and free the three internal frames (cur/segmap/mvpair)
// and all 8 reference slots in both the public refs and next_refs arrays.
3958 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3960 VP9Context *s = ctx->priv_data;
3963 for (i = 0; i < 3; i++) {
3964 if (s->s.frames[i].tf.f->buf[0])
3965 vp9_unref_frame(ctx, &s->s.frames[i]);
3966 av_frame_free(&s->s.frames[i].tf.f);
3968 for (i = 0; i < 8; i++) {
3969 if (s->s.refs[i].f->buf[0])
3970 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3971 av_frame_free(&s->s.refs[i].f);
3972 if (s->next_refs[i].f->buf[0])
3973 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3974 av_frame_free(&s->next_refs[i].f);
// Top-level per-packet decode: parse the frame header, handle
// show-existing-frame packets, rotate the internal frame buffers, decode
// all tiles (possibly in two passes for frame threading), run the
// loopfilter per superblock row, and rotate the reference slots.
// NOTE(review): this excerpt is missing interleaved lines (error paths,
// some braces/else), so several branches below appear truncated.
3984 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3985 int *got_frame, AVPacket *pkt)
3987 const uint8_t *data = pkt->data;
3988 int size = pkt->size;
3989 VP9Context *s = ctx->priv_data;
3990 int res, tile_row, tile_col, i, ref, row, col;
// Keep the previous segmentation map when this frame doesn't update it.
3991 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3992 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3993 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3997 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0 => "show existing frame": output reference 'ref' directly.
3999 } else if (res == 0) {
4000 if (!s->s.refs[ref].f->buf[0]) {
4001 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4002 return AVERROR_INVALIDDATA;
4004 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
4006 ((AVFrame *)frame)->pts = pkt->pts;
4008 FF_DISABLE_DEPRECATION_WARNINGS
4009 ((AVFrame *)frame)->pkt_pts = pkt->pts;
4010 FF_ENABLE_DEPRECATION_WARNINGS
4012 ((AVFrame *)frame)->pkt_dts = pkt->dts;
4013 for (i = 0; i < 8; i++) {
4014 if (s->next_refs[i].f->buf[0])
4015 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4016 if (s->s.refs[i].f->buf[0] &&
4017 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
// Rotate the internal frames: previous CUR becomes SEGMAP/MVPAIR refs,
// then allocate a fresh CUR frame for this packet.
4026 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4027 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4028 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4029 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4030 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4033 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4034 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4035 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4036 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4038 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4039 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4040 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4042 f = s->s.frames[CUR_FRAME].tf.f;
4043 f->key_frame = s->s.h.keyframe;
4044 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4045 ls_y = f->linesize[0];
4046 ls_uv =f->linesize[1];
// Drop the segmap ref if the frame size changed (maps no longer align).
4048 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4049 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4050 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4051 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
// Build next_refs: slots flagged in refreshrefmask point at the new
// frame, the others keep their previous reference.
4055 for (i = 0; i < 8; i++) {
4056 if (s->next_refs[i].f->buf[0])
4057 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4058 if (s->s.h.refreshrefmask & (1 << i)) {
4059 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4060 } else if (s->s.refs[i].f->buf[0]) {
4061 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
// Hardware-accelerated path: hand the whole packet to the hwaccel.
4068 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4071 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4074 res = ctx->hwaccel->end_frame(ctx);
4080 // main tile decode loop
4081 bytesperpixel = s->bytesperpixel;
// Reset the above-row contexts for the new frame.
4082 memset(s->above_partition_ctx, 0, s->cols);
4083 memset(s->above_skip_ctx, 0, s->cols);
4084 if (s->s.h.keyframe || s->s.h.intraonly) {
4085 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4087 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4089 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4090 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4091 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4092 memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass decoding when frame threading needs the adapted context early.
4093 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4094 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4095 if ((res = update_block_buffers(ctx)) < 0) {
4096 av_log(ctx, AV_LOG_ERROR,
4097 "Failed to allocate block buffers\n");
// Parallel mode: commit the frame's probabilities up front so other
// threads can proceed, then signal setup completion.
4100 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4103 for (i = 0; i < 4; i++) {
4104 for (j = 0; j < 2; j++)
4105 for (k = 0; k < 2; k++)
4106 for (l = 0; l < 6; l++)
4107 for (m = 0; m < 6; m++)
4108 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4109 s->prob.coef[i][j][k][l][m], 3);
4110 if (s->s.h.txfmmode == i)
4113 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4114 ff_thread_finish_setup(ctx);
4115 } else if (!s->s.h.refreshctx) {
4116 ff_thread_finish_setup(ctx);
// Rewind the coefficient/eob cursors for this pass.
4122 s->block = s->block_base;
4123 s->uvblock[0] = s->uvblock_base[0];
4124 s->uvblock[1] = s->uvblock_base[1];
4125 s->eob = s->eob_base;
4126 s->uveob[0] = s->uveob_base[0];
4127 s->uveob[1] = s->uveob_base[1];
// Set up one range decoder per tile column, validating tile sizes.
4129 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4130 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4131 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4133 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4136 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4137 tile_row == s->s.h.tiling.tile_rows - 1) {
4140 tile_size = AV_RB32(data);
4144 if (tile_size > size) {
4145 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4146 return AVERROR_INVALIDDATA;
4148 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4149 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4150 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4151 return AVERROR_INVALIDDATA;
// Decode superblock rows; tiles within a row share the row loop so the
// loopfilter and progress reporting can run once per sb row.
4158 for (row = s->tile_row_start; row < s->tile_row_end;
4159 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4160 struct VP9Filter *lflvl_ptr = s->lflvl;
4161 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4163 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4164 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4165 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
// Reset the left-edge contexts at each tile column start.
4168 memset(s->left_partition_ctx, 0, 8);
4169 memset(s->left_skip_ctx, 0, 8);
4170 if (s->s.h.keyframe || s->s.h.intraonly) {
4171 memset(s->left_mode_ctx, DC_PRED, 16);
4173 memset(s->left_mode_ctx, NEARESTMV, 8);
4175 memset(s->left_y_nnz_ctx, 0, 16);
4176 memset(s->left_uv_nnz_ctx, 0, 32);
4177 memset(s->left_segpred_ctx, 0, 8);
// Swap this tile's range decoder in/out around the column loop.
4179 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4182 for (col = s->tile_col_start;
4183 col < s->tile_col_end;
4184 col += 8, yoff2 += 64 * bytesperpixel,
4185 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4186 // FIXME integrate with lf code (i.e. zero after each
4187 // use, similar to invtxfm coefficients, or similar)
4189 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
4193 decode_sb_mem(ctx, row, col, lflvl_ptr,
4194 yoff2, uvoff2, BL_64X64);
4196 decode_sb(ctx, row, col, lflvl_ptr,
4197 yoff2, uvoff2, BL_64X64);
4201 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4209 // backup pre-loopfilter reconstruction data for intra
4210 // prediction of next row of sb64s
4211 if (row + 8 < s->rows) {
4212 memcpy(s->intra_pred_data[0],
4213 f->data[0] + yoff + 63 * ls_y,
4214 8 * s->cols * bytesperpixel);
4215 memcpy(s->intra_pred_data[1],
4216 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4217 8 * s->cols * bytesperpixel >> s->ss_h);
4218 memcpy(s->intra_pred_data[2],
4219 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4220 8 * s->cols * bytesperpixel >> s->ss_h);
4223 // loopfilter one row
4224 if (s->s.h.filter.level) {
4227 lflvl_ptr = s->lflvl;
4228 for (col = 0; col < s->cols;
4229 col += 8, yoff2 += 64 * bytesperpixel,
4230 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4231 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4235 // FIXME maybe we can make this more finegrained by running the
4236 // loopfilter per-block instead of after each sbrow
4237 // In fact that would also make intra pred left preparation easier?
4238 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
// After pass 1 of a 2-pass decode: adapt probabilities and release
// waiting threads before replaying pass 2.
4242 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4244 ff_thread_finish_setup(ctx);
4246 } while (s->pass++ == 1);
4247 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
// Commit next_refs into the public reference slots.
4251 for (i = 0; i < 8; i++) {
4252 if (s->s.refs[i].f->buf[0])
4253 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4254 if (s->next_refs[i].f->buf[0] &&
4255 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
// Output the frame unless the bitstream marks it invisible.
4259 if (!s->s.h.invisible) {
4260 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
4268 static void vp9_decode_flush(AVCodecContext *ctx)
4270 VP9Context *s = ctx->priv_data;
4273 for (i = 0; i < 3; i++)
4274 vp9_unref_frame(ctx, &s->s.frames[i]);
4275 for (i = 0; i < 8; i++)
4276 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4279 static int init_frames(AVCodecContext *ctx)
4281 VP9Context *s = ctx->priv_data;
4284 for (i = 0; i < 3; i++) {
4285 s->s.frames[i].tf.f = av_frame_alloc();
4286 if (!s->s.frames[i].tf.f) {
4287 vp9_decode_free(ctx);
4288 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4289 return AVERROR(ENOMEM);
4292 for (i = 0; i < 8; i++) {
4293 s->s.refs[i].f = av_frame_alloc();
4294 s->next_refs[i].f = av_frame_alloc();
4295 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4296 vp9_decode_free(ctx);
4297 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4298 return AVERROR(ENOMEM);
4305 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4307 VP9Context *s = ctx->priv_data;
4309 ctx->internal->allocate_progress = 1;
4311 s->s.h.filter.sharpness = -1;
4313 return init_frames(ctx);
4317 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4319 return init_frames(avctx);
4322 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4325 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4327 for (i = 0; i < 3; i++) {
4328 if (s->s.frames[i].tf.f->buf[0])
4329 vp9_unref_frame(dst, &s->s.frames[i]);
4330 if (ssrc->s.frames[i].tf.f->buf[0]) {
4331 if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
4335 for (i = 0; i < 8; i++) {
4336 if (s->s.refs[i].f->buf[0])
4337 ff_thread_release_buffer(dst, &s->s.refs[i]);
4338 if (ssrc->next_refs[i].f->buf[0]) {
4339 if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
4344 s->s.h.invisible = ssrc->s.h.invisible;
4345 s->s.h.keyframe = ssrc->s.h.keyframe;
4346 s->s.h.intraonly = ssrc->s.h.intraonly;
4347 s->ss_v = ssrc->ss_v;
4348 s->ss_h = ssrc->ss_h;
4349 s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4350 s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4351 s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4352 s->bytesperpixel = ssrc->bytesperpixel;
4353 s->gf_fmt = ssrc->gf_fmt;
4357 s->bpp_index = ssrc->bpp_index;
4358 s->pix_fmt = ssrc->pix_fmt;
4359 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4360 memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4361 memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4362 sizeof(s->s.h.segmentation.feat));
4368 AVCodec ff_vp9_decoder = {
4370 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4371 .type = AVMEDIA_TYPE_VIDEO,
4372 .id = AV_CODEC_ID_VP9,
4373 .priv_data_size = sizeof(VP9Context),
4374 .init = vp9_decode_init,
4375 .close = vp9_decode_free,
4376 .decode = vp9_decode_frame,
4377 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4378 .flush = vp9_decode_flush,
4379 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4380 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4381 .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),