2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
37 #define VP9_SYNCCODE 0x498342
    // Loopfilter application mask (presumably a member of struct VP9Filter
    // in the full source -- the struct's opening lines are not visible here):
    // one bitmask per plane (0=y, 1=uv), per edge direction (0=col, 1=row),
    // per row of 8x8 blocks within a superblock, per filter size
    // (0=16px, 1=8px, 2=4px, 3=inner-4px). Each bit selects a column.
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block decoding state: segment id, intra/inter decision, reference
// indices, prediction modes, motion vectors and transform sizes.
// NOTE(review): this extract is missing some lines of the original struct
// (at least the closing '} VP9Block;'); code kept verbatim, comments only.
typedef struct VP9Block {
    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
    enum FilterMode filter;            // interpolation filter for this block
    VP56mv mv[4 /* b_idx */][2 /* ref */];
    enum TxfmMode tx, uvtx;            // luma and chroma transform sizes
    enum BlockPartition bp;            // partition type at this level
// Main per-decoder context.
// NOTE(review): this extract is missing many members of the original struct
// (DSP contexts, bitstream readers, header state, nested struct wrappers and
// the closing brace); surviving members are kept verbatim, comments only added.
typedef struct VP9Context {
    int row, row7, col, col7;           // current block position, in 8x8 units and mod-8 remainder
    ptrdiff_t y_stride, uv_stride;      // strides of the current frame's planes

    uint8_t last_bpp, bpp_index, bytesperpixel;
    uint8_t last_keyframe;
    // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
    // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
    // and are therefore per-stream. pix_fmt represents the value in the header
    // of the currently processed frame.
    enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
    unsigned sb_cols, sb_rows, rows, cols;
    ThreadFrame next_refs[8];           // reference slots for the next frame

    uint8_t mblim_lut[64];              // loopfilter limit LUT (reset when sharpness changes)

    unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;

    // NOTE(review): this 'coef' presumably belongs to the (not fully visible)
    // per-frame-context 'prob_ctx[4]' structs: 3 node probabilities per
    // coefficient band/position -- confirm against full source.
    uint8_t coef[4][2][2][6][6][3];
    // NOTE(review): this second 'coef' presumably belongs to the per-frame
    // 'prob' struct; 11 = 3 coded probs plus derived pareto tail probs --
    // confirm against full source.
    uint8_t coef[4][2][2][6][6][11];

    // Symbol counts, presumably members of a nested 'counts' struct in the
    // full source; used for backward probability adaptation after decoding.
    unsigned y_mode[4][10];
    unsigned uv_mode[10][10];
    unsigned filter[4][3];
    unsigned mv_mode[7][4];
    unsigned intra[4][2];
    unsigned single_ref[5][2][2];
    unsigned comp_ref[5][2];
    unsigned tx32p[2][4];
    unsigned tx16p[2][3];
    unsigned mv_joint[4];
    unsigned classes[11];
    unsigned bits[10][2];
    unsigned class0_fp[2][4];
    unsigned class0_hp[2];
    unsigned partition[4][4][4];
    unsigned coef[4][2][2][6][6][3];
    unsigned eob[4][2][2][6][6][2];

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // 'above' context rows: pointers into one shared allocation made in
    // update_size(), one entry per column of the frame
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    uint8_t *intra_pred_data[3];        // edge pixels saved for intra prediction
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];

    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;  // mv clamping range for the current block
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    uint16_t mvscale[3][2];             // 14-bit fixed-point ref->cur scaling factors
    uint8_t mvstep[3][2];
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
169 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
172 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
179 ff_thread_release_buffer(ctx, &f->tf);
180 av_buffer_unref(&f->extradata);
181 av_buffer_unref(&f->hwaccel_priv_buf);
182 f->segmentation_map = NULL;
183 f->hwaccel_picture_private = NULL;
// Allocate the frame buffer for f plus one extradata buffer holding both the
// segmentation map (one byte per 8x8 block) and the per-block mv/ref pairs;
// also allocates hwaccel private data when a hwaccel is active. On failure
// everything already acquired is released via vp9_unref_frame().
// NOTE(review): this extract is missing several lines of the original
// (the 'int ret, sz;' declaration, 'return ret;', the 'fail:' label,
// 'goto fail;' statements, 'return 0;' and various braces); code kept
// verbatim, comments only added.
static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
    VP9Context *s = ctx->priv_data;

    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
    sz = 64 * s->sb_cols * s->sb_rows;   // one entry per 8x8 block of the frame
    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
    // segmentation map and mv array share the single extradata allocation:
    // map first, mv pairs directly after it
    f->segmentation_map = f->extradata->data;
    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);

    const AVHWAccel *hwaccel = ctx->hwaccel;
    av_assert0(!f->hwaccel_picture_private);
    if (hwaccel->frame_priv_data_size) {
        f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
        if (!f->hwaccel_priv_buf)
        f->hwaccel_picture_private = f->hwaccel_priv_buf->data;

    // failure path: undo any partial allocation
    vp9_unref_frame(ctx, f);
    return AVERROR(ENOMEM);
// Make dst an additional reference to src: ref the underlying frame buffer
// and extradata, share the segmentation-map pointer and 2-pass flag, and ref
// the hwaccel private buffer when present. On failure dst is fully unrefed.
// NOTE(review): this extract is missing the 'int res;' declaration, the
// goto/'fail:' structure, 'return 0;' and some braces; code kept verbatim,
// comments only added.
static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
    // shares the buffer just ref'd above
    dst->segmentation_map = src->segmentation_map;
    dst->uses_2pass = src->uses_2pass;

    if (src->hwaccel_picture_private) {
        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
        if (!dst->hwaccel_priv_buf)
        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;

    // failure path: drop whatever was referenced so far
    vp9_unref_frame(ctx, dst);
    return AVERROR(ENOMEM);
// (Re)configure decoder dimensions and pixel format: renegotiate the output
// format (offering hwaccel formats first) when size/format changed, then
// (re)allocate the per-column 'above' context arrays and loopfilter state in
// a single allocation, and reinit the DSP contexts when bit depth changed.
// NOTE(review): this extract is missing many lines of the original (error
// returns, '#endif' lines, 'break;' statements, the get_format error
// handling, 'if (!p)' and closing braces); code kept verbatim, comments only.
static int update_size(AVCodecContext *ctx, int w, int h)
#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
    VP9Context *s = ctx->priv_data;
    int bytesperpixel = s->bytesperpixel, res, cols, rows;

    av_assert0(w > 0 && h > 0);

    // only renegotiate the output format when size or format changed
    if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
        if ((res = ff_set_dimensions(ctx, w, h)) < 0)

        // build the get_format candidate list: hwaccel formats first,
        // software format as the fallback
        switch (s->pix_fmt) {
        case AV_PIX_FMT_YUV420P:
#if CONFIG_VP9_DXVA2_HWACCEL
            *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
#if CONFIG_VP9_D3D11VA_HWACCEL
            *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
#if CONFIG_VP9_VAAPI_HWACCEL
            *fmtp++ = AV_PIX_FMT_VAAPI;
        case AV_PIX_FMT_YUV420P10:
        case AV_PIX_FMT_YUV420P12:
#if CONFIG_VP9_VAAPI_HWACCEL
            *fmtp++ = AV_PIX_FMT_VAAPI;
        *fmtp++ = s->pix_fmt;
        *fmtp = AV_PIX_FMT_NONE;

        res = ff_thread_get_format(ctx, pix_fmts);
        s->gf_fmt = s->pix_fmt;

    // nothing else to do if the context arrays are already the right size
    if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)

    s->last_fmt = s->pix_fmt;
    s->sb_cols = (w + 63) >> 6;   // 64x64 superblocks
    s->sb_rows = (h + 63) >> 6;
    s->cols = (w + 7) >> 3;       // 8x8 blocks
    s->rows = (h + 7) >> 3;

#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
    av_freep(&s->intra_pred_data[0]);
    // FIXME we slightly over-allocate here for subsampled chroma, but a little
    // bit of padding shouldn't affect performance...
    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
        return AVERROR(ENOMEM);
    // carve the single allocation into the individual 'above' arrays
    assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
    assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
    assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
    assign(s->above_y_nnz_ctx, uint8_t *, 16);
    assign(s->above_mode_ctx, uint8_t *, 16);
    assign(s->above_mv_ctx, VP56mv(*)[2], 16);
    assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
    assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
    assign(s->above_partition_ctx, uint8_t *, 8);
    assign(s->above_skip_ctx, uint8_t *, 8);
    assign(s->above_txfm_ctx, uint8_t *, 8);
    assign(s->above_segpred_ctx, uint8_t *, 8);
    assign(s->above_intra_ctx, uint8_t *, 8);
    assign(s->above_comp_ctx, uint8_t *, 8);
    assign(s->above_ref_ctx, uint8_t *, 8);
    assign(s->above_filter_ctx, uint8_t *, 8);
    assign(s->lflvl, struct VP9Filter *, 1);

    // these will be re-allocated a little later
    av_freep(&s->b_base);
    av_freep(&s->block_base);

    // reinitialize the DSP functions if the bit depth changed
    if (s->s.h.bpp != s->last_bpp) {
        ff_vp9dsp_init(&s->dsp, s->s.h.bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
        ff_videodsp_init(&s->vdsp, s->s.h.bpp);
        s->last_bpp = s->s.h.bpp;
// Allocate the coefficient/eob scratch buffers. In 2-pass decoding each
// superblock keeps its own set (sized by sb_cols*sb_rows); otherwise one
// reusable set suffices. Layout inside block_base: luma coefs, two chroma
// coef planes, luma eobs, two chroma eob arrays.
// NOTE(review): this extract is missing the function braces, the early
// 'return 0;', the 'else' line between the two branches and the trailing
// 'return 0;'; code kept verbatim, comments only added.
static int update_block_buffers(AVCodecContext *ctx)
    VP9Context *s = ctx->priv_data;
    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;

    // nothing to do when already allocated in the right (2-pass or not) mode
    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)

    av_free(s->block_base);
    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);  // chroma samples per 64x64, after subsampling
    chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
    if (s->s.frames[CUR_FRAME].uses_2pass) {
        int sbs = s->sb_cols * s->sb_rows;

        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                    16 * 16 + 2 * chroma_eobs) * sbs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
        // single-pass mode ('else' branch in the full source): one block's worth
        s->b_base = av_malloc(sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                   16 * 16 + 2 * chroma_eobs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
    s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
388 // for some reason the sign bit is at the end, not the start, of a bit sequence
389 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
391 int v = get_bits(gb, n);
392 return get_bits1(gb) ? -v : v;
395 static av_always_inline int inv_recenter_nonneg(int v, int m)
397 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
// differential forward probability updates
// Decode a VLC-coded differential update for probability p (range [1,255])
// and return the new probability; see the long explanatory comment below.
// NOTE(review): this extract is missing the tail of inv_map_table, the
// 'int d;' declaration, the 'if (d >= 65)' / 'd += 64;' lines around the
// doubling step, and the closing braces; code kept verbatim, comments only.
static int update_prob(VP56RangeCoder *c, int p)
    static const int inv_map_table[255] = {
        7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
        189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
        40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
        161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
        177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
        207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
        222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
        237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,

    /* This code is trying to do a differential probability update. For a
     * current probability A in the range [1, 255], the difference to a new
     * probability of any value can be expressed differentially as 1-A,255-A
     * where some part of this (absolute range) exists both in positive as
     * well as the negative part, whereas another part only exists in one
     * half. We're trying to code this shared part differentially, i.e.
     * times two where the value of the lowest bit specifies the sign, and
     * the single part is then coded on top of this. This absolute difference
     * then again has a value of [0,254], but a bigger value in this range
     * indicates that we're further away from the original value A, so we
     * can code this as a VLC code, since higher values are increasingly
     * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
     * updates vs. the 'fine, exact' updates further down the range, which
     * adds one extra dimension to this differential update model. */

    // VLC: three escape bits select progressively wider distance ranges
    if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 0;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 16;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 5) + 32;
        // widest range ('else' branch in the full source)
        d = vp8_rac_get_uint(c, 7);
        d = (d << 1) - 65 + vp8_rac_get(c);
        av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));

    // apply the (de-mapped) distance on the correct side of the midpoint
    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
                      255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse bit depth, colorspace, color range and chroma subsampling from the
// frame header and derive s->pix_fmt / s->ss_h / s->ss_v accordingly.
// Profiles 0/1 are 8-bit; profiles 2/3 signal 10 or 12 bit. RGB is only
// valid in odd profiles.
// NOTE(review): this extract is missing several braces, 'else' lines and the
// final 'return 0;'; code kept verbatim, comments only added.
static int read_colorspace_details(AVCodecContext *ctx)
    static const enum AVColorSpace colorspaces[8] = {
        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
    VP9Context *s = ctx->priv_data;
    int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12

    s->s.h.bpp = 8 + bits * 2;
    s->bytesperpixel = (7 + s->s.h.bpp) >> 3;
    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
        static const enum AVPixelFormat pix_fmt_rgb[3] = {
            AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
        s->ss_h = s->ss_v = 0;
        ctx->color_range = AVCOL_RANGE_JPEG;
        s->pix_fmt = pix_fmt_rgb[bits];
        if (ctx->profile & 1) {
            if (get_bits1(&s->gb)) {
                av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
                return AVERROR_INVALIDDATA;
            // even profiles cannot signal RGB ('else' branch in the full source)
            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
            return AVERROR_INVALIDDATA;
        // YUV path ('else' branch in the full source)
        static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
            { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
              { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
            { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
              { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
            { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
              { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
        ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
        if (ctx->profile & 1) {
            // odd profiles carry explicit subsampling bits
            s->ss_h = get_bits1(&s->gb);
            s->ss_v = get_bits1(&s->gb);
            s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
            if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
                av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
                return AVERROR_INVALIDDATA;
            } else if (get_bits1(&s->gb)) {
                av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
                return AVERROR_INVALIDDATA;
            // even profiles ('else'): fixed 4:2:0 subsampling
            s->ss_h = s->ss_v = 1;
            s->pix_fmt = pix_fmt_for_ss[bits][1][1];
/*
 * Parse a VP9 frame header: the uncompressed part (frame type, dimensions,
 * reference setup, loopfilter/quantization/segmentation/tiling info) followed
 * by the arithmetic-coded compressed header (forward probability updates).
 * On success returns the total header size in bytes; on a show-existing-frame
 * header *ref is set to the reference slot to display.
 * NOTE(review): this extract is missing numerous lines of the original
 * function (returns, 'else' lines, 'break;' statements, closing braces and
 * some assignments); surviving code kept verbatim, comments only added.
 */
static int decode_frame_header(AVCodecContext *ctx,
                               const uint8_t *data, int size, int *ref)
    VP9Context *s = ctx->priv_data;
    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
    const uint8_t *data2;

    /* general header */
    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
        return AVERROR_INVALIDDATA;
    ctx->profile = get_bits1(&s->gb);
    ctx->profile |= get_bits1(&s->gb) << 1;
    if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
    if (ctx->profile > 3) {
        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
        return AVERROR_INVALIDDATA;
    s->s.h.profile = ctx->profile;
    if (get_bits1(&s->gb)) {
        // show_existing_frame: display a previously decoded reference
        *ref = get_bits(&s->gb, 3);
    s->last_keyframe = s->s.h.keyframe;
    s->s.h.keyframe = !get_bits1(&s->gb);
    last_invisible = s->s.h.invisible;
    s->s.h.invisible = !get_bits1(&s->gb);
    s->s.h.errorres = get_bits1(&s->gb);
    s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
    if (s->s.h.keyframe) {
        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
            return AVERROR_INVALIDDATA;
        if ((res = read_colorspace_details(ctx)) < 0)
        // for profile 1, here follows the subsampling bits
        s->s.h.refreshrefmask = 0xff;   // keyframes refresh all reference slots
        w = get_bits(&s->gb, 16) + 1;
        h = get_bits(&s->gb, 16) + 1;
        if (get_bits1(&s->gb)) // display size
            skip_bits(&s->gb, 32);
        // non-keyframe path ('else' branch in the full source)
        s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
        s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
        if (s->s.h.intraonly) {
            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                return AVERROR_INVALIDDATA;
            if (ctx->profile >= 1) {
                if ((res = read_colorspace_details(ctx)) < 0)
                // profile 0 ('else'): hardcoded 8-bit 4:2:0 BT.470
                s->ss_h = s->ss_v = 1;
                s->bytesperpixel = 1;
                s->pix_fmt = AV_PIX_FMT_YUV420P;
                ctx->colorspace = AVCOL_SPC_BT470BG;
                ctx->color_range = AVCOL_RANGE_JPEG;
            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
            w = get_bits(&s->gb, 16) + 1;
            h = get_bits(&s->gb, 16) + 1;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            // regular inter frame ('else' of intraonly in the full source)
            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
            s->s.h.refidx[0] = get_bits(&s->gb, 3);
            s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
            s->s.h.refidx[1] = get_bits(&s->gb, 3);
            s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
            s->s.h.refidx[2] = get_bits(&s->gb, 3);
            s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
            if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
                !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
                !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                return AVERROR_INVALIDDATA;
            // frame size: either copied from a reference or coded explicitly
            if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[0]].f->width;
                h = s->s.refs[s->s.h.refidx[0]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[1]].f->width;
                h = s->s.refs[s->s.h.refidx[1]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->s.refs[s->s.h.refidx[2]].f->width;
                h = s->s.refs[s->s.h.refidx[2]].f->height;
                // explicit size ('else' branch in the full source)
                w = get_bits(&s->gb, 16) + 1;
                h = get_bits(&s->gb, 16) + 1;
            // Note that in this code, "CUR_FRAME" is actually before we
            // have formally allocated a frame, and thus actually represents
            // the previous frame
            s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
                                         s->s.frames[CUR_FRAME].tf.f->height == h;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            s->s.h.highprecisionmvs = get_bits1(&s->gb);
            s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
            // compound prediction requires at least one ref with different sign bias
            s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
                                    s->s.h.signbias[0] != s->s.h.signbias[2];
            if (s->s.h.allowcompinter) {
                if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
                    // refs 0/1 same side -> ref 2 is the fixed compound ref
                    s->s.h.fixcompref = 2;
                    s->s.h.varcompref[0] = 0;
                    s->s.h.varcompref[1] = 1;
                } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
                    s->s.h.fixcompref = 1;
                    s->s.h.varcompref[0] = 0;
                    s->s.h.varcompref[1] = 2;
                    // refs 1/2 same side ('else' branch in the full source)
                    s->s.h.fixcompref = 0;
                    s->s.h.varcompref[0] = 1;
                    s->s.h.varcompref[1] = 2;
    s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
    s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
    s->s.h.framectxid = c = get_bits(&s->gb, 2);
    if (s->s.h.keyframe || s->s.h.intraonly)
        s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes

    /* loopfilter header data */
    if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
        // reset loopfilter defaults
        s->s.h.lf_delta.ref[0] = 1;
        s->s.h.lf_delta.ref[1] = 0;
        s->s.h.lf_delta.ref[2] = -1;
        s->s.h.lf_delta.ref[3] = -1;
        s->s.h.lf_delta.mode[0] = 0;
        s->s.h.lf_delta.mode[1] = 0;
        memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
    s->s.h.filter.level = get_bits(&s->gb, 6);
    sharp = get_bits(&s->gb, 3);
    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
    // the old cache values since they are still valid
    if (s->s.h.filter.sharpness != sharp)
        memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
    s->s.h.filter.sharpness = sharp;
    if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
        if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
            for (i = 0; i < 4; i++)
                if (get_bits1(&s->gb))
                    s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
            for (i = 0; i < 2; i++)
                if (get_bits1(&s->gb))
                    s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);

    /* quantization header data */
    s->s.h.yac_qi = get_bits(&s->gb, 8);
    s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
                      s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
        ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS; // guarded by 'if (lossless)' in the full source

    /* segmentation header info */
    if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
        if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
            for (i = 0; i < 7; i++)
                s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
                    get_bits(&s->gb, 8) : 255;
            if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
                for (i = 0; i < 3; i++)
                    s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
                        get_bits(&s->gb, 8) : 255;

        if (get_bits1(&s->gb)) {
            // per-segment feature data update
            s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
            for (i = 0; i < 8; i++) {
                if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                    s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);

    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
    for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
        int qyac, qydc, quvac, quvdc, lflvl, sh;

        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
            if (s->s.h.segmentation.absolute_vals)
                qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
                qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8); // 'else' branch
            qyac = s->s.h.yac_qi; // 'else' branch (no per-segment q)
        qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
        quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
        quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
        qyac = av_clip_uintp2(qyac, 8);

        s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
        s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
        s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
        s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];

        sh = s->s.h.filter.level >= 32;   // deltas are scaled x2 at high filter levels
        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
            if (s->s.h.segmentation.absolute_vals)
                lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
                lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6); // 'else' branch
            lflvl = s->s.h.filter.level; // 'else' branch (no per-segment lf)
        if (s->s.h.lf_delta.enabled) {
            s->s.h.segmentation.feat[i].lflvl[0][0] =
            s->s.h.segmentation.feat[i].lflvl[0][1] =
                av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
            for (j = 1; j < 4; j++) {
                s->s.h.segmentation.feat[i].lflvl[j][0] =
                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
                                             s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
                s->s.h.segmentation.feat[i].lflvl[j][1] =
                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
                                             s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
            memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
                   sizeof(s->s.h.segmentation.feat[i].lflvl)); // 'else' branch: flat level

    /* tiling info */
    if ((res = update_size(ctx, w, h)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
    for (s->s.h.tiling.log2_tile_cols = 0;
         s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
         s->s.h.tiling.log2_tile_cols++) ;
    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
    max = FFMAX(0, max - 1);
    // read increment bits up to the maximum allowed column count
    while (max > s->s.h.tiling.log2_tile_cols) {
        if (get_bits1(&s->gb))
            s->s.h.tiling.log2_tile_cols++;
    s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
    s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
    if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
        s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
        // one range coder per tile column
        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
                                 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
            return AVERROR(ENOMEM);

    /* check reference frames */
    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        for (i = 0; i < 3; i++) {
            AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
            int refw = ref->width, refh = ref->height;

            if (ref->format != ctx->pix_fmt) {
                av_log(ctx, AV_LOG_ERROR,
                       "Ref pixfmt (%s) did not match current frame (%s)",
                       av_get_pix_fmt_name(ref->format),
                       av_get_pix_fmt_name(ctx->pix_fmt));
                return AVERROR_INVALIDDATA;
            } else if (refw == w && refh == h) {
                s->mvscale[i][0] = s->mvscale[i][1] = 0;  // same size: no scaling
                // scaled prediction: ratio must stay within spec limits
                if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
                    av_log(ctx, AV_LOG_ERROR,
                           "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
                    return AVERROR_INVALIDDATA;
                // 14-bit fixed-point scale factors and per-16px steps
                s->mvscale[i][0] = (refw << 14) / w;
                s->mvscale[i][1] = (refh << 14) / h;
                s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
                s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;

    // reset frame contexts to defaults when required by frame type/resetctx
    if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
        s->prob_ctx[3].p = vp9_default_probs;
        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
    } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
        s->prob_ctx[c].p = vp9_default_probs;
        memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));

    // next 16 bits is size of the rest of the header (arith-coded)
    s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
    s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;

    data2 = align_get_bits(&s->gb);
    if (size2 > size - (data2 - data)) {
        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
        return AVERROR_INVALIDDATA;
    res = ff_vp56_init_range_decoder(&s->c, data2, size2);
    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
        return AVERROR_INVALIDDATA;

    // reset symbol counts before decoding this frame
    if (s->s.h.keyframe || s->s.h.intraonly) {
        memset(s->counts.coef, 0, sizeof(s->counts.coef));
        memset(s->counts.eob, 0, sizeof(s->counts.eob));
        memset(&s->counts, 0, sizeof(s->counts)); // 'else' branch: reset all counts

    // FIXME is it faster to not copy here, but do it down in the fw updates
    // as explicit copies if the fw update is missing (and skip the copy upon
    // a keyframe)
    s->prob.p = s->prob_ctx[c].p;

    /* txfm updates */
    if (s->s.h.lossless) {
        s->s.h.txfmmode = TX_4X4;
        s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2); // 'else' branch
        if (s->s.h.txfmmode == 3)
            s->s.h.txfmmode += vp8_rac_get(&s->c);

        if (s->s.h.txfmmode == TX_SWITCHABLE) {
            for (i = 0; i < 2; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx16p[i][j] =
                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 3; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx32p[i][j] =
                            update_prob(&s->c, s->prob.p.tx32p[i][j]);

    /* coef updates, per transform size up to the active txfmmode */
    for (i = 0; i < 4; i++) {
        uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
        if (vp8_rac_get(&s->c)) {
            // explicit update of the coefficient probabilities
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m >= 3 && l == 0) // dc only has 3 pt
                            for (n = 0; n < 3; n++) {
                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                    p[n] = update_prob(&s->c, r[n]);
            // no update ('else'): copy probs from the frame context unchanged
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m > 3 && l == 0) // dc only has 3 pt
        if (s->s.h.txfmmode == i)

    /* mode updates */
    for (i = 0; i < 3; i++)
        if (vp56_rac_get_prob_branchy(&s->c, 252))
            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        for (i = 0; i < 7; i++)
            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_mode[i][j] =
                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);

        if (s->s.h.filtermode == FILTER_SWITCHABLE)
            for (i = 0; i < 4; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.filter[i][j] =
                            update_prob(&s->c, s->prob.p.filter[i][j]);

        for (i = 0; i < 4; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);

        if (s->s.h.allowcompinter) {
            s->s.h.comppredmode = vp8_rac_get(&s->c);
            if (s->s.h.comppredmode)
                s->s.h.comppredmode += vp8_rac_get(&s->c);
            if (s->s.h.comppredmode == PRED_SWITCHABLE)
                for (i = 0; i < 5; i++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        update_prob(&s->c, s->prob.p.comp[i]);
            s->s.h.comppredmode = PRED_SINGLEREF; // 'else' branch: no compound

        if (s->s.h.comppredmode != PRED_COMPREF) {
            for (i = 0; i < 5; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][0] =
                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][1] =
                        update_prob(&s->c, s->prob.p.single_ref[i][1]);

        if (s->s.h.comppredmode != PRED_SINGLEREF) {
            for (i = 0; i < 5; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.comp_ref[i] =
                        update_prob(&s->c, s->prob.p.comp_ref[i]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 9; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.y_mode[i][j] =
                        update_prob(&s->c, s->prob.p.y_mode[i][j]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.partition[3 - i][j][k] =
                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);

        // mv fields don't use the update_prob subexp model for some reason
        for (i = 0; i < 3; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].classes[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].bits[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.mv_comp[i].class0_fp[j][k] =
                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].fp[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        if (s->s.h.highprecisionmvs) {
            for (i = 0; i < 2; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].class0_hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

    // total header size: uncompressed part plus compressed part
    return (data2 - data) + size2;
1056 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
/* Clamp both components of *src into the legal MV range for the current
 * block (s->min_mv .. s->max_mv, set up elsewhere per block position) and
 * store the result in *dst. dst may alias src (see the final clamp in
 * find_ref_mvs, which passes pmv for both). */
1059 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1060 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/* Build the motion-vector prediction (*pmv) for reference slot `ref` of the
 * current block, following the VP9 candidate-search order: already-decoded
 * sub8x8 sub-block MVs, the spatial above/left context MVs, MVs of nearby
 * previously-coded blocks in this tile, then the co-located MV from the
 * previous frame, and finally the same neighbourhood again allowing a
 * different reference frame (sign-flipped when the reference sign biases
 * differ).
 *
 * z:   which of the two candidate slots is wanted (0 = nearest, 1 = near);
 * idx: candidate index used by the RETURN_MV dedup logic (see av_assert2);
 * sb:  sub-block index within an 8x8 block, or -1 / special values for
 *      whole-block lookups (see callers in fill_mv()). */
1063 static void find_ref_mvs(VP9Context *s,
1064 VP56mv *pmv, int ref, int z, int idx, int sb)
/* Per-block-size list of up to 8 (col,row) neighbour offsets, in the order
 * they are to be scanned for candidate MVs. */
1066 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1067 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1068 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1069 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1070 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1071 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1072 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1073 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1074 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1075 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1076 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1077 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1078 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1079 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1080 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1081 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1082 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1083 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1084 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1085 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1086 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1087 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1088 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1089 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1090 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1091 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1092 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1095 int row = s->row, col = s->col, row7 = s->row7;
1096 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
/* Sentinel packed-MV value that cannot occur as a real candidate. */
1097 #define INVALID_MV 0x80008000U
/* mem tracks the first accepted candidate (packed as a 32-bit word) so a
 * second, distinct candidate can be detected; mem_sub8x8 does the same for
 * the sub8x8 fast path below. */
1098 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
/* Accept an already-decoded sub-block MV verbatim (no clamping at this
 * stage): return it for the first slot, or remember it and return only a
 * second *different* MV for slot idx==1. */
1101 #define RETURN_DIRECT_MV(mv) \
1103 uint32_t m = AV_RN32A(&mv); \
1107 } else if (mem == INVALID_MV) { \
1109 } else if (m != mem) { \
/* For sub8x8 partitions, prefer MVs of sub-blocks already decoded within
 * this same block (sub-block order: 0,1,2,3). */
1116 if (sb == 2 || sb == 1) {
1117 RETURN_DIRECT_MV(b->mv[0][z]);
1118 } else if (sb == 3) {
1119 RETURN_DIRECT_MV(b->mv[2][z]);
1120 RETURN_DIRECT_MV(b->mv[1][z]);
1121 RETURN_DIRECT_MV(b->mv[0][z]);
/* Like RETURN_DIRECT_MV, but candidates are clamped before being compared/
 * returned; the mem/mem_sub8x8 bookkeeping mirrors libvpx behaviour
 * (including the acknowledged quirk flagged inside the macro). */
1124 #define RETURN_MV(mv) \
1129 av_assert2(idx == 1); \
1130 av_assert2(mem != INVALID_MV); \
1131 if (mem_sub8x8 == INVALID_MV) { \
1132 clamp_mv(&tmp, &mv, s); \
1133 m = AV_RN32A(&tmp); \
1138 mem_sub8x8 = AV_RN32A(&mv); \
1139 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1140 clamp_mv(&tmp, &mv, s); \
1141 m = AV_RN32A(&tmp); \
1145 /* BUG I'm pretty sure this isn't the intention */ \
1151 uint32_t m = AV_RN32A(&mv); \
1153 clamp_mv(pmv, &mv, s); \
1155 } else if (mem == INVALID_MV) { \
1157 } else if (m != mem) { \
1158 clamp_mv(pmv, &mv, s); \
/* Above neighbour: consult the stored above-row MV context for this column
 * (mv array of the current frame, one row up). */
1165 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1166 if (mv->ref[0] == ref) {
1167 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1168 } else if (mv->ref[1] == ref) {
1169 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
/* Left neighbour, only if it lies within the current tile column. */
1172 if (col > s->tile_col_start) {
1173 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1174 if (mv->ref[0] == ref) {
1175 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1176 } else if (mv->ref[1] == ref) {
1177 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1185 // previously coded MVs in this neighbourhood, using same reference frame
1186 for (; i < 8; i++) {
1187 int c = p[i][0] + col, r = p[i][1] + row;
/* Skip positions outside the tile/frame. */
1189 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1190 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1192 if (mv->ref[0] == ref) {
1193 RETURN_MV(mv->mv[0]);
1194 } else if (mv->ref[1] == ref) {
1195 RETURN_MV(mv->mv[1]);
1200 // MV at this position in previous frame, using same reference frame
1201 if (s->s.h.use_last_frame_mvs) {
1202 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
/* With frame-threading, wait until the reference thread has decoded this
 * superblock row before reading its MVs. */
1204 if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1205 ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1206 if (mv->ref[0] == ref) {
1207 RETURN_MV(mv->mv[0]);
1208 } else if (mv->ref[1] == ref) {
1209 RETURN_MV(mv->mv[1]);
/* Candidate from a different reference frame: negate the MV when the two
 * references have opposite sign bias (scale != 0), then feed it through
 * RETURN_MV. */
1213 #define RETURN_SCALE_MV(mv, scale) \
1216 VP56mv mv_temp = { -mv.x, -mv.y }; \
1217 RETURN_MV(mv_temp); \
1223 // previously coded MVs in this neighbourhood, using different reference frame
1224 for (i = 0; i < 8; i++) {
1225 int c = p[i][0] + col, r = p[i][1] + row;
1227 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1228 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1230 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1231 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1233 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1234 // BUG - libvpx has this condition regardless of whether
1235 // we used the first ref MV and pre-scaling
1236 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1237 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1242 // MV at this position in previous frame, using different reference frame
1243 if (s->s.h.use_last_frame_mvs) {
1244 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1246 // no need to await_progress, because we already did that above
1247 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1248 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1250 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1251 // BUG - libvpx has this condition regardless of whether
1252 // we used the first ref MV and pre-scaling
1253 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1254 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
/* No (further) candidate found: make sure whatever is in *pmv is clamped
 * to the legal range before returning. */
1259 clamp_mv(pmv, pmv, s);
1262 #undef RETURN_SCALE_MV
/* Decode one motion-vector component (idx: 0 = vertical/y, 1 = horizontal/x)
 * from the range coder: sign, magnitude class, then class-dependent
 * magnitude/fractional/high-precision bits. Updates the per-component
 * adaptation counters (s->counts.mv_comp[idx]) as it goes.
 * hp: nonzero when high-precision (1/8-pel) MVs are in effect for this MV.
 * Returns the signed component value (never 0; sign applied at the end). */
1265 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1267 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1268 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1269 s->prob.p.mv_comp[idx].classes);
1271 s->counts.mv_comp[idx].sign[sign]++;
1272 s->counts.mv_comp[idx].classes[c]++;
/* Classes >= 1: read c raw magnitude bits ... */
1276 for (n = 0, m = 0; m < c; m++) {
1277 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1279 s->counts.mv_comp[idx].bits[m][bit]++;
/* ... then the 2-bit fractional (1/4-pel) part ... */
1282 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1284 s->counts.mv_comp[idx].fp[bit]++;
/* ... and, when hp is set, the high-precision (1/8-pel) bit. */
1286 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1287 s->counts.mv_comp[idx].hp[bit]++;
1291 // bug in libvpx - we count for bw entropy purposes even if the
1293 s->counts.mv_comp[idx].hp[1]++;
/* Class 0: single class0 bit, class0-specific fractional tree, optional
 * class0 high-precision bit. */
1297 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1298 s->counts.mv_comp[idx].class0[n]++;
1299 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1300 s->prob.p.mv_comp[idx].class0_fp[n]);
1301 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1302 n = (n << 3) | (bit << 1);
1304 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1305 s->counts.mv_comp[idx].class0_hp[bit]++;
1309 // bug in libvpx - we count for bw entropy purposes even if the
1311 s->counts.mv_comp[idx].class0_hp[1]++;
/* n is the magnitude minus one; apply the sign read first. */
1315 return sign ? -(n + 1) : (n + 1);
/* Fill mv[0] (and mv[1] for compound prediction) for the current (sub-)block:
 * ZEROMV short-circuits to zero, otherwise the predictor is fetched via
 * find_ref_mvs() and, for NEWMV, a residual decoded with read_mv_component()
 * is added per component as indicated by the decoded MV-joint symbol.
 * sb is the sub-block index (-1 for a whole >8x8 block, see callers in
 * decode_mode()). */
1318 static void fill_mv(VP9Context *s,
1319 VP56mv *mv, int mode, int sb)
1323 if (mode == ZEROMV) {
1328 // FIXME cache this value and reuse for other subblocks
/* First reference: fetch the predictor for mv[0]. */
1329 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1330 mode == NEWMV ? -1 : sb);
1331 // FIXME maybe move this code into find_ref_mvs()
/* hp is enabled only when the header allows high-precision MVs AND the
 * predictor is small (<64 in both components); otherwise the elided branch
 * presumably rounds the predictor — original code not fully visible here. */
1332 if ((mode == NEWMV || sb == -1) &&
1333 !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1347 if (mode == NEWMV) {
/* Decode the MV-joint symbol, then the per-component residuals it selects
 * (y when j >= MV_JOINT_V; x per the elided condition). */
1348 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1349 s->prob.p.mv_joint);
1351 s->counts.mv_joint[j]++;
1352 if (j >= MV_JOINT_V)
1353 mv[0].y += read_mv_component(s, 0, hp);
1355 mv[0].x += read_mv_component(s, 1, hp);
1359 // FIXME cache this value and reuse for other subblocks
/* Second reference (compound prediction): same sequence for mv[1]. */
1360 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1361 mode == NEWMV ? -1 : sb);
1362 if ((mode == NEWMV || sb == -1) &&
1363 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1377 if (mode == NEWMV) {
1378 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1379 s->prob.p.mv_joint);
1381 s->counts.mv_joint[j]++;
1382 if (j >= MV_JOINT_V)
1383 mv[1].y += read_mv_component(s, 0, hp);
1385 mv[1].x += read_mv_component(s, 1, hp);
/* Fill a w x h byte rectangle (row pitch = stride) with the value v, using
 * progressively wider aligned stores chosen by w (the selection logic between
 * these cases is elided in this view). Used e.g. to splat b->seg_id into the
 * frame's segmentation map. */
1391 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1392 ptrdiff_t stride, int v)
/* Replicate v into all bytes of a 16/32/64-bit word for the wide stores. */
1402 int v16 = v * 0x0101;
1410 uint32_t v32 = v * 0x01010101;
1419 uint64_t v64 = v * 0x0101010101010101ULL;
/* Fallback path without 64-bit stores: two 32-bit stores per 8 bytes. */
1425 uint32_t v32 = v * 0x01010101;
1428 AV_WN32A(ptr + 4, v32);
/* Decode all mode information for the current block: segment id, skip flag,
 * intra/inter flag, transform size, intra prediction modes or inter
 * references/filter/MV modes (+ motion vectors via fill_mv()), then splat
 * the decoded values into the above/left context arrays and the per-frame
 * MV/segmentation storage used by later blocks and frames. */
1437 static void decode_mode(AVCodecContext *ctx)
/* Partition context nibbles per block size, for the left resp. above
 * context arrays. */
1439 static const uint8_t left_ctx[N_BS_SIZES] = {
1440 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1442 static const uint8_t above_ctx[N_BS_SIZES] = {
1443 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
/* Largest transform size permitted for each block size. */
1445 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1446 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1447 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1449 VP9Context *s = ctx->priv_data;
1451 int row = s->row, col = s->col, row7 = s->row7;
1452 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
/* w4/h4: block size in 8x8 units, clipped to the frame edge. */
1453 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1454 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1455 int have_a = row > 0, have_l = col > s->tile_col_start;
1456 int vref, filter_id;
/* --- segment id ------------------------------------------------------- */
1458 if (!s->s.h.segmentation.enabled) {
1460 } else if (s->s.h.keyframe || s->s.h.intraonly) {
1461 b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1462 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
/* Temporal prediction: reuse the previous frame's segmentation map when the
 * seg-pred context bit says so (or when the map is not updated at all). */
1463 } else if (!s->s.h.segmentation.update_map ||
1464 (s->s.h.segmentation.temporal &&
1465 vp56_rac_get_prob_branchy(&s->c,
1466 s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1467 s->left_segpred_ctx[row7]]))) {
1468 if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1470 uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
/* Frame-threading: wait for the reference segmap to cover this row. */
1472 if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1473 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
/* Predicted seg id = minimum over the co-located area of the ref map. */
1474 for (y = 0; y < h4; y++) {
1475 int idx_base = (y + row) * 8 * s->sb_cols + col;
1476 for (x = 0; x < w4; x++)
1477 pred = FFMIN(pred, refsegmap[idx_base + x]);
1479 av_assert1(pred < 8);
1485 memset(&s->above_segpred_ctx[col], 1, w4);
1486 memset(&s->left_segpred_ctx[row7], 1, h4);
/* Explicitly coded seg id. */
1488 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1489 s->s.h.segmentation.prob);
1491 memset(&s->above_segpred_ctx[col], 0, w4);
1492 memset(&s->left_segpred_ctx[row7], 0, h4);
/* Persist the decoded seg id into the current frame's segmentation map. */
1494 if (s->s.h.segmentation.enabled &&
1495 (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1496 setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1497 bw4, bh4, 8 * s->sb_cols, b->seg_id);
/* --- skip flag -------------------------------------------------------- */
1500 b->skip = s->s.h.segmentation.enabled &&
1501 s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1503 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1504 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1505 s->counts.skip[c][b->skip]++;
/* --- intra/inter flag ------------------------------------------------- */
1508 if (s->s.h.keyframe || s->s.h.intraonly) {
1510 } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1511 b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1515 if (have_a && have_l) {
1516 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1519 c = have_a ? 2 * s->above_intra_ctx[col] :
1520 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1522 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1523 s->counts.intra[c][bit]++;
/* --- transform size --------------------------------------------------- */
1527 if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1531 c = (s->above_skip_ctx[col] ? max_tx :
1532 s->above_txfm_ctx[col]) +
1533 (s->left_skip_ctx[row7] ? max_tx :
1534 s->left_txfm_ctx[row7]) > max_tx;
1536 c = s->above_skip_ctx[col] ? 1 :
1537 (s->above_txfm_ctx[col] * 2 > max_tx);
1539 } else if (have_l) {
1540 c = s->left_skip_ctx[row7] ? 1 :
1541 (s->left_txfm_ctx[row7] * 2 > max_tx);
/* Unary-coded tx size, capped by max_tx (branch per cap, elided here). */
1547 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1549 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1551 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1553 s->counts.tx32p[c][b->tx]++;
1556 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1558 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1559 s->counts.tx16p[c][b->tx]++;
1562 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1563 s->counts.tx8p[c][b->tx]++;
1570 b->tx = FFMIN(max_tx, s->s.h.txfmmode);
/* --- intra modes (keyframe/intra-only: default KF probability tables) -- */
1573 if (s->s.h.keyframe || s->s.h.intraonly) {
1574 uint8_t *a = &s->above_mode_ctx[col * 2];
1575 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
/* sub8x8: up to 4 separate luma modes, each conditioned on the modes of
 * the above/left 4x4 neighbours (a[]/l[] are updated as we go). */
1578 if (b->bs > BS_8x8) {
1579 // FIXME the memory storage intermediates here aren't really
1580 // necessary, they're just there to make the code slightly
1582 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1583 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1584 if (b->bs != BS_8x4) {
1585 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1586 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1587 l[0] = a[1] = b->mode[1];
1589 l[0] = a[1] = b->mode[1] = b->mode[0];
1591 if (b->bs != BS_4x8) {
1592 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1593 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1594 if (b->bs != BS_8x4) {
1595 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1596 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1597 l[1] = a[1] = b->mode[3];
1599 l[1] = a[1] = b->mode[3] = b->mode[2];
1602 b->mode[2] = b->mode[0];
1603 l[1] = a[1] = b->mode[3] = b->mode[1];
/* >= 8x8: one luma mode for the whole block. */
1606 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1607 vp9_default_kf_ymode_probs[*a][*l]);
1608 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1609 // FIXME this can probably be optimized
1610 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1611 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
/* Chroma mode is conditioned on the (last) luma mode. */
1613 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1614 vp9_default_kf_uvmode_probs[b->mode[3]]);
/* --- intra modes in an inter frame (adaptive y_mode probabilities) ----- */
1615 } else if (b->intra) {
1617 if (b->bs > BS_8x8) {
1618 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1619 s->prob.p.y_mode[0]);
1620 s->counts.y_mode[0][b->mode[0]]++;
1621 if (b->bs != BS_8x4) {
1622 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 s->prob.p.y_mode[0]);
1624 s->counts.y_mode[0][b->mode[1]]++;
1626 b->mode[1] = b->mode[0];
1628 if (b->bs != BS_4x8) {
1629 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1630 s->prob.p.y_mode[0]);
1631 s->counts.y_mode[0][b->mode[2]]++;
1632 if (b->bs != BS_8x4) {
1633 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1634 s->prob.p.y_mode[0]);
1635 s->counts.y_mode[0][b->mode[3]]++;
1637 b->mode[3] = b->mode[2];
1640 b->mode[2] = b->mode[0];
1641 b->mode[3] = b->mode[1];
/* >= 8x8: probability set chosen by block-size group. */
1644 static const uint8_t size_group[10] = {
1645 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1647 int sz = size_group[b->bs];
1649 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1650 s->prob.p.y_mode[sz]);
1651 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1652 s->counts.y_mode[sz][b->mode[3]]++;
1654 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1655 s->prob.p.uv_mode[b->mode[3]]);
1656 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
/* --- inter block ------------------------------------------------------- */
/* Maps (above_mode, left_mode) context values to an inter-mode context. */
1658 static const uint8_t inter_mode_ctx_lut[14][14] = {
1659 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1660 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1661 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1662 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1663 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1664 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1665 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1666 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1667 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1668 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1669 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1670 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1671 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1672 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
/* Segment feature may pin the reference frame for this block. */
1675 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1676 av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1678 b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1680 // read comp_pred flag
1681 if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1682 b->comp = s->s.h.comppredmode == PRED_COMPREF;
/* PRED_SWITCHABLE: derive the comp-pred context from the above/left
 * comp/intra/ref contexts. */
1686 // FIXME add intra as ref=0xff (or -1) to make these easier?
1689 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1691 } else if (s->above_comp_ctx[col]) {
1692 c = 2 + (s->left_intra_ctx[row7] ||
1693 s->left_ref_ctx[row7] == s->s.h.fixcompref);
1694 } else if (s->left_comp_ctx[row7]) {
1695 c = 2 + (s->above_intra_ctx[col] ||
1696 s->above_ref_ctx[col] == s->s.h.fixcompref);
1698 c = (!s->above_intra_ctx[col] &&
1699 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1700 (!s->left_intra_ctx[row7] &&
/* NOTE(review): `row & 7` here where sibling branches use the cached
 * `row7` — same value (row7 = s->row7), but inconsistent; confirm
 * against upstream before "fixing". */
1701 s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1704 c = s->above_comp_ctx[col] ? 3 :
1705 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1707 } else if (have_l) {
1708 c = s->left_comp_ctx[row7] ? 3 :
1709 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1713 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1714 s->counts.comp[c][b->comp]++;
1717 // read actual references
1718 // FIXME probably cache a few variables here to prevent repetitive
1719 // memory accesses below
1720 if (b->comp) /* two references */ {
/* Compound: one reference is fixed (fixcompref), the other (variable) is
 * selected by a single context-coded bit from varcompref[]. */
1721 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1723 b->ref[fix_idx] = s->s.h.fixcompref;
1724 // FIXME can this codeblob be replaced by some sort of LUT?
1727 if (s->above_intra_ctx[col]) {
1728 if (s->left_intra_ctx[row7]) {
1731 c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1733 } else if (s->left_intra_ctx[row7]) {
1734 c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1736 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1738 if (refl == refa && refa == s->s.h.varcompref[1]) {
1740 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1741 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1742 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1745 c = (refa == refl) ? 3 : 1;
1747 } else if (!s->left_comp_ctx[row7]) {
1748 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1751 c = (refl == s->s.h.varcompref[1] &&
1752 refa != s->s.h.varcompref[1]) ? 2 : 4;
1754 } else if (!s->above_comp_ctx[col]) {
1755 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1758 c = (refa == s->s.h.varcompref[1] &&
1759 refl != s->s.h.varcompref[1]) ? 2 : 4;
1762 c = (refl == refa) ? 4 : 2;
1766 if (s->above_intra_ctx[col]) {
1768 } else if (s->above_comp_ctx[col]) {
1769 c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1771 c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1774 } else if (have_l) {
1775 if (s->left_intra_ctx[row7]) {
1777 } else if (s->left_comp_ctx[row7]) {
1778 c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1780 c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1785 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1786 b->ref[var_idx] = s->s.h.varcompref[bit];
1787 s->counts.comp_ref[c][bit]++;
1788 } else /* single reference */ {
/* Single ref: first bit selects LAST vs {GOLDEN,ALTREF} ... */
1791 if (have_a && !s->above_intra_ctx[col]) {
1792 if (have_l && !s->left_intra_ctx[row7]) {
1793 if (s->left_comp_ctx[row7]) {
1794 if (s->above_comp_ctx[col]) {
1795 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1796 !s->above_ref_ctx[col]);
1798 c = (3 * !s->above_ref_ctx[col]) +
1799 (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1801 } else if (s->above_comp_ctx[col]) {
1802 c = (3 * !s->left_ref_ctx[row7]) +
1803 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1805 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1807 } else if (s->above_intra_ctx[col]) {
1809 } else if (s->above_comp_ctx[col]) {
1810 c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1812 c = 4 * (!s->above_ref_ctx[col]);
1814 } else if (have_l && !s->left_intra_ctx[row7]) {
1815 if (s->left_intra_ctx[row7]) {
1817 } else if (s->left_comp_ctx[row7]) {
1818 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1820 c = 4 * (!s->left_ref_ctx[row7]);
1825 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1826 s->counts.single_ref[c][0][bit]++;
/* ... second bit picks between the two remaining references. */
1830 // FIXME can this codeblob be replaced by some sort of LUT?
1833 if (s->left_intra_ctx[row7]) {
1834 if (s->above_intra_ctx[col]) {
1836 } else if (s->above_comp_ctx[col]) {
1837 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1838 s->above_ref_ctx[col] == 1);
1839 } else if (!s->above_ref_ctx[col]) {
1842 c = 4 * (s->above_ref_ctx[col] == 1);
1844 } else if (s->above_intra_ctx[col]) {
1845 if (s->left_intra_ctx[row7]) {
1847 } else if (s->left_comp_ctx[row7]) {
1848 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1849 s->left_ref_ctx[row7] == 1);
1850 } else if (!s->left_ref_ctx[row7]) {
1853 c = 4 * (s->left_ref_ctx[row7] == 1);
1855 } else if (s->above_comp_ctx[col]) {
1856 if (s->left_comp_ctx[row7]) {
1857 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1858 c = 3 * (s->s.h.fixcompref == 1 ||
1859 s->left_ref_ctx[row7] == 1);
1863 } else if (!s->left_ref_ctx[row7]) {
1864 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1865 s->above_ref_ctx[col] == 1);
1867 c = 3 * (s->left_ref_ctx[row7] == 1) +
1868 (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1870 } else if (s->left_comp_ctx[row7]) {
1871 if (!s->above_ref_ctx[col]) {
1872 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1873 s->left_ref_ctx[row7] == 1);
1875 c = 3 * (s->above_ref_ctx[col] == 1) +
1876 (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1878 } else if (!s->above_ref_ctx[col]) {
1879 if (!s->left_ref_ctx[row7]) {
1882 c = 4 * (s->left_ref_ctx[row7] == 1);
1884 } else if (!s->left_ref_ctx[row7]) {
1885 c = 4 * (s->above_ref_ctx[col] == 1);
1887 c = 2 * (s->left_ref_ctx[row7] == 1) +
1888 2 * (s->above_ref_ctx[col] == 1);
1891 if (s->above_intra_ctx[col] ||
1892 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1894 } else if (s->above_comp_ctx[col]) {
1895 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1897 c = 4 * (s->above_ref_ctx[col] == 1);
1900 } else if (have_l) {
1901 if (s->left_intra_ctx[row7] ||
1902 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1904 } else if (s->left_comp_ctx[row7]) {
1905 c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1907 c = 4 * (s->left_ref_ctx[row7] == 1);
1912 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1913 s->counts.single_ref[c][1][bit]++;
1914 b->ref[0] = 1 + bit;
/* --- inter mode + motion vectors -------------------------------------- */
1919 if (b->bs <= BS_8x8) {
1920 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1921 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
/* Mode-context offsets into the above/left mode arrays per block size. */
1923 static const uint8_t off[10] = {
1924 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1927 // FIXME this needs to use the LUT tables from find_ref_mvs
1928 // because not all are -1,0/0,-1
1929 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1930 [s->left_mode_ctx[row7 + off[b->bs]]];
1932 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1933 s->prob.p.mv_mode[c]);
1934 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
/* Inter modes start at enum value 10; counts[] is 0-based. */
1935 s->counts.mv_mode[c][b->mode[0] - 10]++;
/* Interpolation filter: explicit only in FILTER_SWITCHABLE mode. */
1939 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1942 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1943 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1944 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1945 s->left_filter_ctx[row7] : 3;
1947 c = s->above_filter_ctx[col];
1949 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1950 c = s->left_filter_ctx[row7];
1955 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1956 s->prob.p.filter[c]);
1957 s->counts.filter[c][filter_id]++;
1958 b->filter = vp9_filter_lut[filter_id];
1960 b->filter = s->s.h.filtermode;
/* sub8x8: decode a mode + MVs per sub-block (with BS_8x4/BS_4x8 sharing). */
1963 if (b->bs > BS_8x8) {
1964 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1966 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1967 s->prob.p.mv_mode[c]);
1968 s->counts.mv_mode[c][b->mode[0] - 10]++;
1969 fill_mv(s, b->mv[0], b->mode[0], 0);
1971 if (b->bs != BS_8x4) {
1972 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1973 s->prob.p.mv_mode[c]);
1974 s->counts.mv_mode[c][b->mode[1] - 10]++;
1975 fill_mv(s, b->mv[1], b->mode[1], 1);
1977 b->mode[1] = b->mode[0];
1978 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1979 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1982 if (b->bs != BS_4x8) {
1983 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1984 s->prob.p.mv_mode[c]);
1985 s->counts.mv_mode[c][b->mode[2] - 10]++;
1986 fill_mv(s, b->mv[2], b->mode[2], 2);
1988 if (b->bs != BS_8x4) {
1989 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1990 s->prob.p.mv_mode[c]);
1991 s->counts.mv_mode[c][b->mode[3] - 10]++;
1992 fill_mv(s, b->mv[3], b->mode[3], 3);
1994 b->mode[3] = b->mode[2];
1995 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1996 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1999 b->mode[2] = b->mode[0];
2000 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2001 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2002 b->mode[3] = b->mode[1];
2003 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2004 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
/* >= 8x8: one MV (pair), replicated to all four sub-slots. */
2007 fill_mv(s, b->mv[0], b->mode[0], -1);
2008 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2009 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2010 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2011 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2012 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2013 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
/* --- update above/left context arrays --------------------------------- */
/* Reference value stored in the ref context (variable ref for compound). */
2016 vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
/* SPLAT_CTX writes n copies of val into a context array with one aligned
 * store where possible (64-bit capable variant first, 32-bit fallback
 * second). */
2020 #define SPLAT_CTX(var, val, n) \
2022 case 1: var = val; break; \
2023 case 2: AV_WN16A(&var, val * 0x0101); break; \
2024 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2025 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2027 uint64_t v64 = val * 0x0101010101010101ULL; \
2028 AV_WN64A( &var, v64); \
2029 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2034 #define SPLAT_CTX(var, val, n) \
2036 case 1: var = val; break; \
2037 case 2: AV_WN16A(&var, val * 0x0101); break; \
2038 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2040 uint32_t v32 = val * 0x01010101; \
2041 AV_WN32A( &var, v32); \
2042 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2046 uint32_t v32 = val * 0x01010101; \
2047 AV_WN32A( &var, v32); \
2048 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2049 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2050 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2056 switch (bwh_tab[1][b->bs][0]) {
2057 #define SET_CTXS(dir, off, n) \
2059 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2060 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2061 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2062 if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2063 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2064 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2065 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2067 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2068 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2069 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2074 case 1: SET_CTXS(above, col, 1); break;
2075 case 2: SET_CTXS(above, col, 2); break;
2076 case 4: SET_CTXS(above, col, 4); break;
2077 case 8: SET_CTXS(above, col, 8); break;
2079 switch (bwh_tab[1][b->bs][1]) {
2080 case 1: SET_CTXS(left, row7, 1); break;
2081 case 2: SET_CTXS(left, row7, 2); break;
2082 case 4: SET_CTXS(left, row7, 4); break;
2083 case 8: SET_CTXS(left, row7, 8); break;
/* Store the decoded MVs into the above/left MV context arrays used by
 * find_ref_mvs() for subsequent blocks. */
2088 if (!s->s.h.keyframe && !s->s.h.intraonly) {
2089 if (b->bs > BS_8x8) {
2090 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2092 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2093 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2094 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2095 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2096 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2097 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2098 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2099 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2101 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2103 for (n = 0; n < w4 * 2; n++) {
2104 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2105 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2107 for (n = 0; n < h4 * 2; n++) {
2108 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2109 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
/* Persist per-position references + MVs into the frame-level mv array
 * (consumed by later blocks and by the next frame's temporal MV lookup). */
2115 for (y = 0; y < h4; y++) {
2116 int x, o = (row + y) * s->sb_cols * 8 + col;
2117 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2120 for (x = 0; x < w4; x++) {
2124 } else if (b->comp) {
2125 for (x = 0; x < w4; x++) {
2126 mv[x].ref[0] = b->ref[0];
2127 mv[x].ref[1] = b->ref[1];
2128 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2129 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2132 for (x = 0; x < w4; x++) {
2133 mv[x].ref[0] = b->ref[0];
2135 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2141 // FIXME merge cnt/eob arguments?
2142 static av_always_inline int
2143 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2144 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2145 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2146 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2147 const int16_t *band_counts, const int16_t *qmul)
2149 int i = 0, band = 0, band_left = band_counts[band];
2150 uint8_t *tp = p[0][nnz];
2151 uint8_t cache[1024];
2156 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2157 eob[band][nnz][val]++;
2162 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2163 cnt[band][nnz][0]++;
2165 band_left = band_counts[++band];
2167 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2169 if (++i == n_coeffs)
2170 break; //invalid input; blocks should end with EOB
2175 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2176 cnt[band][nnz][1]++;
2180 // fill in p[3-10] (model fill) - only once per frame for each pos
2182 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2184 cnt[band][nnz][2]++;
2185 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2186 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2187 cache[rc] = val = 2;
2189 val = 3 + vp56_rac_get_prob(c, tp[5]);
2192 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2194 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2195 val = 5 + vp56_rac_get_prob(c, 159);
2197 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2198 val += vp56_rac_get_prob(c, 145);
2202 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2203 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2204 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2205 val += (vp56_rac_get_prob(c, 148) << 1);
2206 val += vp56_rac_get_prob(c, 140);
2208 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2209 val += (vp56_rac_get_prob(c, 155) << 2);
2210 val += (vp56_rac_get_prob(c, 140) << 1);
2211 val += vp56_rac_get_prob(c, 135);
2213 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2214 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2215 val += (vp56_rac_get_prob(c, 157) << 3);
2216 val += (vp56_rac_get_prob(c, 141) << 2);
2217 val += (vp56_rac_get_prob(c, 134) << 1);
2218 val += vp56_rac_get_prob(c, 130);
2221 if (!is8bitsperpixel) {
2223 val += vp56_rac_get_prob(c, 255) << 17;
2224 val += vp56_rac_get_prob(c, 255) << 16;
2226 val += (vp56_rac_get_prob(c, 255) << 15);
2227 val += (vp56_rac_get_prob(c, 255) << 14);
2229 val += (vp56_rac_get_prob(c, 254) << 13);
2230 val += (vp56_rac_get_prob(c, 254) << 12);
2231 val += (vp56_rac_get_prob(c, 254) << 11);
2232 val += (vp56_rac_get_prob(c, 252) << 10);
2233 val += (vp56_rac_get_prob(c, 249) << 9);
2234 val += (vp56_rac_get_prob(c, 243) << 8);
2235 val += (vp56_rac_get_prob(c, 230) << 7);
2236 val += (vp56_rac_get_prob(c, 196) << 6);
2237 val += (vp56_rac_get_prob(c, 177) << 5);
2238 val += (vp56_rac_get_prob(c, 153) << 4);
2239 val += (vp56_rac_get_prob(c, 140) << 3);
2240 val += (vp56_rac_get_prob(c, 133) << 2);
2241 val += (vp56_rac_get_prob(c, 130) << 1);
2242 val += vp56_rac_get_prob(c, 129);
2246 #define STORE_COEF(c, i, v) do { \
2247 if (is8bitsperpixel) { \
2250 AV_WN32A(&c[i * 2], v); \
2254 band_left = band_counts[++band];
2256 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2258 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2259 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2261 } while (++i < n_coeffs);
2266 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2267 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2268 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2269 const int16_t (*nb)[2], const int16_t *band_counts,
2270 const int16_t *qmul)
2272 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2273 nnz, scan, nb, band_counts, qmul);
2276 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2277 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2278 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2279 const int16_t (*nb)[2], const int16_t *band_counts,
2280 const int16_t *qmul)
2282 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2283 nnz, scan, nb, band_counts, qmul);
2286 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2287 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2288 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2289 const int16_t (*nb)[2], const int16_t *band_counts,
2290 const int16_t *qmul)
2292 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->s.h.bpp, cnt, eob, p,
2293 nnz, scan, nb, band_counts, qmul);
2296 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2297 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2298 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2299 const int16_t (*nb)[2], const int16_t *band_counts,
2300 const int16_t *qmul)
2302 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->s.h.bpp, cnt, eob, p,
2303 nnz, scan, nb, band_counts, qmul);
2306 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2308 VP9Context *s = ctx->priv_data;
2310 int row = s->row, col = s->col;
2311 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2312 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2313 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2314 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2315 int end_x = FFMIN(2 * (s->cols - col), w4);
2316 int end_y = FFMIN(2 * (s->rows - row), h4);
2317 int n, pl, x, y, res;
2318 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2319 int tx = 4 * s->s.h.lossless + b->tx;
2320 const int16_t * const *yscans = vp9_scans[tx];
2321 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2322 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2323 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2324 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2325 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2326 static const int16_t band_counts[4][8] = {
2327 { 1, 2, 3, 4, 3, 16 - 13 },
2328 { 1, 2, 3, 4, 11, 64 - 21 },
2329 { 1, 2, 3, 4, 11, 256 - 21 },
2330 { 1, 2, 3, 4, 11, 1024 - 21 },
2332 const int16_t *y_band_counts = band_counts[b->tx];
2333 const int16_t *uv_band_counts = band_counts[b->uvtx];
2334 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2335 int total_coeff = 0;
2337 #define MERGE(la, end, step, rd) \
2338 for (n = 0; n < end; n += step) \
2339 la[n] = !!rd(&la[n])
2340 #define MERGE_CTX(step, rd) \
2342 MERGE(l, end_y, step, rd); \
2343 MERGE(a, end_x, step, rd); \
2346 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2347 for (n = 0, y = 0; y < end_y; y += step) { \
2348 for (x = 0; x < end_x; x += step, n += step * step) { \
2349 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2350 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2351 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2352 c, e, p, a[x] + l[y], yscans[txtp], \
2353 ynbs[txtp], y_band_counts, qmul[0]); \
2354 a[x] = l[y] = !!res; \
2355 total_coeff |= !!res; \
2357 AV_WN16A(&s->eob[n], res); \
2364 #define SPLAT(la, end, step, cond) \
2366 for (n = 1; n < end; n += step) \
2367 la[n] = la[n - 1]; \
2368 } else if (step == 4) { \
2370 for (n = 0; n < end; n += step) \
2371 AV_WN32A(&la[n], la[n] * 0x01010101); \
2373 for (n = 0; n < end; n += step) \
2374 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2376 } else /* step == 8 */ { \
2378 if (HAVE_FAST_64BIT) { \
2379 for (n = 0; n < end; n += step) \
2380 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2382 for (n = 0; n < end; n += step) { \
2383 uint32_t v32 = la[n] * 0x01010101; \
2384 AV_WN32A(&la[n], v32); \
2385 AV_WN32A(&la[n + 4], v32); \
2389 for (n = 0; n < end; n += step) \
2390 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2393 #define SPLAT_CTX(step) \
2395 SPLAT(a, end_x, step, end_x == w4); \
2396 SPLAT(l, end_y, step, end_y == h4); \
2402 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2405 MERGE_CTX(2, AV_RN16A);
2406 DECODE_Y_COEF_LOOP(2, 0,);
2410 MERGE_CTX(4, AV_RN32A);
2411 DECODE_Y_COEF_LOOP(4, 0,);
2415 MERGE_CTX(8, AV_RN64A);
2416 DECODE_Y_COEF_LOOP(8, 0, 32);
2421 #define DECODE_UV_COEF_LOOP(step, v) \
2422 for (n = 0, y = 0; y < end_y; y += step) { \
2423 for (x = 0; x < end_x; x += step, n += step * step) { \
2424 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2425 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2426 16 * step * step, c, e, p, a[x] + l[y], \
2427 uvscan, uvnb, uv_band_counts, qmul[1]); \
2428 a[x] = l[y] = !!res; \
2429 total_coeff |= !!res; \
2431 AV_WN16A(&s->uveob[pl][n], res); \
2433 s->uveob[pl][n] = res; \
2438 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2439 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2440 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2445 for (pl = 0; pl < 2; pl++) {
2446 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2447 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2450 DECODE_UV_COEF_LOOP(1,);
2453 MERGE_CTX(2, AV_RN16A);
2454 DECODE_UV_COEF_LOOP(2,);
2458 MERGE_CTX(4, AV_RN32A);
2459 DECODE_UV_COEF_LOOP(4,);
2463 MERGE_CTX(8, AV_RN64A);
2464 DECODE_UV_COEF_LOOP(8, 32);
2473 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2475 return decode_coeffs(ctx, 1);
2478 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2480 return decode_coeffs(ctx, 0);
2483 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2484 uint8_t *dst_edge, ptrdiff_t stride_edge,
2485 uint8_t *dst_inner, ptrdiff_t stride_inner,
2486 uint8_t *l, int col, int x, int w,
2487 int row, int y, enum TxfmMode tx,
2488 int p, int ss_h, int ss_v, int bytesperpixel)
2490 int have_top = row > 0 || y > 0;
2491 int have_left = col > s->tile_col_start || x > 0;
2492 int have_right = x < w - 1;
2493 int bpp = s->s.h.bpp;
2494 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2495 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2496 { DC_127_PRED, VERT_PRED } },
2497 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2498 { HOR_PRED, HOR_PRED } },
2499 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2500 { LEFT_DC_PRED, DC_PRED } },
2501 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2502 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2503 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2504 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2505 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2506 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2507 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2508 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2509 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2510 { DC_127_PRED, VERT_LEFT_PRED } },
2511 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2512 { HOR_UP_PRED, HOR_UP_PRED } },
2513 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2514 { HOR_PRED, TM_VP8_PRED } },
2516 static const struct {
2517 uint8_t needs_left:1;
2518 uint8_t needs_top:1;
2519 uint8_t needs_topleft:1;
2520 uint8_t needs_topright:1;
2521 uint8_t invert_left:1;
2522 } edges[N_INTRA_PRED_MODES] = {
2523 [VERT_PRED] = { .needs_top = 1 },
2524 [HOR_PRED] = { .needs_left = 1 },
2525 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2526 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2527 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2528 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2529 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2530 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2531 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2532 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2533 [LEFT_DC_PRED] = { .needs_left = 1 },
2534 [TOP_DC_PRED] = { .needs_top = 1 },
2535 [DC_128_PRED] = { 0 },
2536 [DC_127_PRED] = { 0 },
2537 [DC_129_PRED] = { 0 }
2540 av_assert2(mode >= 0 && mode < 10);
2541 mode = mode_conv[mode][have_left][have_top];
2542 if (edges[mode].needs_top) {
2543 uint8_t *top, *topleft;
2544 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2545 int n_px_need_tr = 0;
2547 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2550 // if top of sb64-row, use s->intra_pred_data[] instead of
2551 // dst[-stride] for intra prediction (it contains pre- instead of
2552 // post-loopfilter data)
2554 top = !(row & 7) && !y ?
2555 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2556 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2558 topleft = !(row & 7) && !y ?
2559 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2560 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2561 &dst_inner[-stride_inner];
2565 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2566 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2567 n_px_need + n_px_need_tr <= n_px_have) {
2571 if (n_px_need <= n_px_have) {
2572 memcpy(*a, top, n_px_need * bytesperpixel);
2574 #define memset_bpp(c, i1, v, i2, num) do { \
2575 if (bytesperpixel == 1) { \
2576 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2578 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2579 for (n = 0; n < (num); n++) { \
2580 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2584 memcpy(*a, top, n_px_have * bytesperpixel);
2585 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2588 #define memset_val(c, val, num) do { \
2589 if (bytesperpixel == 1) { \
2590 memset((c), (val), (num)); \
2593 for (n = 0; n < (num); n++) { \
2594 AV_WN16A(&(c)[n * 2], (val)); \
2598 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2600 if (edges[mode].needs_topleft) {
2601 if (have_left && have_top) {
2602 #define assign_bpp(c, i1, v, i2) do { \
2603 if (bytesperpixel == 1) { \
2604 (c)[(i1)] = (v)[(i2)]; \
2606 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2609 assign_bpp(*a, -1, topleft, -1);
2611 #define assign_val(c, i, v) do { \
2612 if (bytesperpixel == 1) { \
2615 AV_WN16A(&(c)[(i) * 2], (v)); \
2618 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2621 if (tx == TX_4X4 && edges[mode].needs_topright) {
2622 if (have_top && have_right &&
2623 n_px_need + n_px_need_tr <= n_px_have) {
2624 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2626 memset_bpp(*a, 4, *a, 3, 4);
2631 if (edges[mode].needs_left) {
2633 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2634 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2635 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2637 if (edges[mode].invert_left) {
2638 if (n_px_need <= n_px_have) {
2639 for (i = 0; i < n_px_need; i++)
2640 assign_bpp(l, i, &dst[i * stride], -1);
2642 for (i = 0; i < n_px_have; i++)
2643 assign_bpp(l, i, &dst[i * stride], -1);
2644 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2647 if (n_px_need <= n_px_have) {
2648 for (i = 0; i < n_px_need; i++)
2649 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2651 for (i = 0; i < n_px_have; i++)
2652 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2653 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2657 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2664 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2665 ptrdiff_t uv_off, int bytesperpixel)
2667 VP9Context *s = ctx->priv_data;
2669 int row = s->row, col = s->col;
2670 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2671 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2672 int end_x = FFMIN(2 * (s->cols - col), w4);
2673 int end_y = FFMIN(2 * (s->rows - row), h4);
2674 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2675 int uvstep1d = 1 << b->uvtx, p;
2676 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2677 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2678 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2680 for (n = 0, y = 0; y < end_y; y += step1d) {
2681 uint8_t *ptr = dst, *ptr_r = dst_r;
2682 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2683 ptr_r += 4 * step1d * bytesperpixel, n += step) {
2684 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2686 uint8_t *a = &a_buf[32];
2687 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2688 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2690 mode = check_intra_mode(s, mode, &a, ptr_r,
2691 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2692 ptr, s->y_stride, l,
2693 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2694 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2696 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2697 s->block + 16 * n * bytesperpixel, eob);
2699 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2700 dst += 4 * step1d * s->y_stride;
2707 step = 1 << (b->uvtx * 2);
2708 for (p = 0; p < 2; p++) {
2709 dst = s->dst[1 + p];
2710 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2711 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2712 uint8_t *ptr = dst, *ptr_r = dst_r;
2713 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2714 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2715 int mode = b->uvmode;
2716 uint8_t *a = &a_buf[32];
2717 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2719 mode = check_intra_mode(s, mode, &a, ptr_r,
2720 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2721 ptr, s->uv_stride, l, col, x, w4, row, y,
2722 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2723 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2725 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2726 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2728 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2729 dst += 4 * uvstep1d * s->uv_stride;
2734 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2736 intra_recon(ctx, y_off, uv_off, 1);
2739 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2741 intra_recon(ctx, y_off, uv_off, 2);
2744 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2745 uint8_t *dst, ptrdiff_t dst_stride,
2746 const uint8_t *ref, ptrdiff_t ref_stride,
2747 ThreadFrame *ref_frame,
2748 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2749 int bw, int bh, int w, int h, int bytesperpixel)
2751 int mx = mv->x, my = mv->y, th;
2755 ref += y * ref_stride + x * bytesperpixel;
2758 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2759 // we use +7 because the last 7 pixels of each sbrow can be changed in
2760 // the longest loopfilter of the next sbrow
2761 th = (y + bh + 4 * !!my + 7) >> 6;
2762 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2763 // The arm/aarch64 _hv filters read one more row than what actually is
2764 // needed, so switch to emulated edge one pixel sooner vertically
2765 // (!!my * 5) than horizontally (!!mx * 4).
2766 if (x < !!mx * 3 || y < !!my * 3 ||
2767 x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
2768 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2769 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2771 bw + !!mx * 7, bh + !!my * 7,
2772 x - !!mx * 3, y - !!my * 3, w, h);
2773 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2776 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2779 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2780 uint8_t *dst_u, uint8_t *dst_v,
2781 ptrdiff_t dst_stride,
2782 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2783 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2784 ThreadFrame *ref_frame,
2785 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2786 int bw, int bh, int w, int h, int bytesperpixel)
2788 int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2792 ref_u += y * src_stride_u + x * bytesperpixel;
2793 ref_v += y * src_stride_v + x * bytesperpixel;
2796 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2797 // we use +7 because the last 7 pixels of each sbrow can be changed in
2798 // the longest loopfilter of the next sbrow
2799 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2800 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2801 // The arm/aarch64 _hv filters read one more row than what actually is
2802 // needed, so switch to emulated edge one pixel sooner vertically
2803 // (!!my * 5) than horizontally (!!mx * 4).
2804 if (x < !!mx * 3 || y < !!my * 3 ||
2805 x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
2806 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2807 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2809 bw + !!mx * 7, bh + !!my * 7,
2810 x - !!mx * 3, y - !!my * 3, w, h);
2811 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2812 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2814 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2815 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2817 bw + !!mx * 7, bh + !!my * 7,
2818 x - !!mx * 3, y - !!my * 3, w, h);
2819 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2820 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2822 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2823 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2827 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2828 px, py, pw, ph, bw, bh, w, h, i) \
2829 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2830 mv, bw, bh, w, h, bytesperpixel)
2831 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2832 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2833 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2834 row, col, mv, bw, bh, w, h, bytesperpixel)
2836 #define FN(x) x##_8bpp
2837 #define BYTES_PER_PIXEL 1
2838 #include "vp9_mc_template.c"
2840 #undef BYTES_PER_PIXEL
2841 #define FN(x) x##_16bpp
2842 #define BYTES_PER_PIXEL 2
2843 #include "vp9_mc_template.c"
2845 #undef mc_chroma_dir
2847 #undef BYTES_PER_PIXEL
2850 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2851 vp9_mc_func (*mc)[2],
2852 uint8_t *dst, ptrdiff_t dst_stride,
2853 const uint8_t *ref, ptrdiff_t ref_stride,
2854 ThreadFrame *ref_frame,
2855 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2856 int px, int py, int pw, int ph,
2857 int bw, int bh, int w, int h, int bytesperpixel,
2858 const uint16_t *scale, const uint8_t *step)
2860 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2861 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2862 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2863 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2865 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2867 int refbw_m1, refbh_m1;
2871 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2872 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2873 // BUG libvpx seems to scale the two components separately. This introduces
2874 // rounding errors but we have to reproduce them to be exactly compatible
2875 // with the output from libvpx...
2876 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2877 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2881 ref += y * ref_stride + x * bytesperpixel;
2884 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2885 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2886 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2887 // we use +7 because the last 7 pixels of each sbrow can be changed in
2888 // the longest loopfilter of the next sbrow
2889 th = (y + refbh_m1 + 4 + 7) >> 6;
2890 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2891 // The arm/aarch64 _hv filters read one more row than what actually is
2892 // needed, so switch to emulated edge one pixel sooner vertically
2893 // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2894 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
2895 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2896 ref - 3 * ref_stride - 3 * bytesperpixel,
2898 refbw_m1 + 8, refbh_m1 + 8,
2899 x - 3, y - 3, w, h);
2900 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2903 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2907 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2908 vp9_mc_func (*mc)[2],
2909 uint8_t *dst_u, uint8_t *dst_v,
2910 ptrdiff_t dst_stride,
2911 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2912 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2913 ThreadFrame *ref_frame,
2914 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2915 int px, int py, int pw, int ph,
2916 int bw, int bh, int w, int h, int bytesperpixel,
2917 const uint16_t *scale, const uint8_t *step)
2919 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2920 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2921 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2922 ref_v, src_stride_v, ref_frame,
2923 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2926 int refbw_m1, refbh_m1;
2931 // BUG https://code.google.com/p/webm/issues/detail?id=820
2932 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2933 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2935 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2936 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2939 // BUG https://code.google.com/p/webm/issues/detail?id=820
2940 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2941 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2943 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2944 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2949 ref_u += y * src_stride_u + x * bytesperpixel;
2950 ref_v += y * src_stride_v + x * bytesperpixel;
2953 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2954 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2955 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2956 // we use +7 because the last 7 pixels of each sbrow can be changed in
2957 // the longest loopfilter of the next sbrow
2958 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2959 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2960 // The arm/aarch64 _hv filters read one more row than what actually is
2961 // needed, so switch to emulated edge one pixel sooner vertically
2962 // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2963 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
2964 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2965 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2967 refbw_m1 + 8, refbh_m1 + 8,
2968 x - 3, y - 3, w, h);
2969 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2970 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2972 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2973 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2975 refbw_m1 + 8, refbh_m1 + 8,
2976 x - 3, y - 3, w, h);
2977 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2978 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2980 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2981 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2986 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2987 px, py, pw, ph, bw, bh, w, h, i) \
2988 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2989 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2990 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2991 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2992 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2993 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2994 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2995 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2997 #define FN(x) x##_scaled_8bpp
2998 #define BYTES_PER_PIXEL 1
2999 #include "vp9_mc_template.c"
3001 #undef BYTES_PER_PIXEL
3002 #define FN(x) x##_scaled_16bpp
3003 #define BYTES_PER_PIXEL 2
3004 #include "vp9_mc_template.c"
3006 #undef mc_chroma_dir
3008 #undef BYTES_PER_PIXEL
3011 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3013 VP9Context *s = ctx->priv_data;
3015 int row = s->row, col = s->col;
3017 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3018 if (bytesperpixel == 1) {
3019 inter_pred_scaled_8bpp(ctx);
3021 inter_pred_scaled_16bpp(ctx);
3024 if (bytesperpixel == 1) {
3025 inter_pred_8bpp(ctx);
3027 inter_pred_16bpp(ctx);
3031 /* mostly copied intra_recon() */
3033 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3034 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3035 int end_x = FFMIN(2 * (s->cols - col), w4);
3036 int end_y = FFMIN(2 * (s->rows - row), h4);
3037 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3038 int uvstep1d = 1 << b->uvtx, p;
3039 uint8_t *dst = s->dst[0];
3042 for (n = 0, y = 0; y < end_y; y += step1d) {
3044 for (x = 0; x < end_x; x += step1d,
3045 ptr += 4 * step1d * bytesperpixel, n += step) {
3046 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3049 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3050 s->block + 16 * n * bytesperpixel, eob);
3052 dst += 4 * s->y_stride * step1d;
3058 step = 1 << (b->uvtx * 2);
3059 for (p = 0; p < 2; p++) {
3060 dst = s->dst[p + 1];
3061 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3063 for (x = 0; x < end_x; x += uvstep1d,
3064 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3065 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3068 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3069 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3071 dst += 4 * uvstep1d * s->uv_stride;
3077 static void inter_recon_8bpp(AVCodecContext *ctx)
3079 inter_recon(ctx, 1);
3082 static void inter_recon_16bpp(AVCodecContext *ctx)
3084 inter_recon(ctx, 2);
3087 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3088 int row_and_7, int col_and_7,
3089 int w, int h, int col_end, int row_end,
3090 enum TxfmMode tx, int skip_inter)
3092 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3093 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3095 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3096 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3097 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3098 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3100 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3101 // edges. This means that for UV, we work on two subsampled blocks at
3102 // a time, and we only use the topleft block's mode information to set
3103 // things like block strength. Thus, for any block size smaller than
3104 // 16x16, ignore the odd portion of the block.
3105 if (tx == TX_4X4 && (ss_v | ss_h)) {
3120 if (tx == TX_4X4 && !skip_inter) {
3121 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3122 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3123 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3125 for (y = row_and_7; y < h + row_and_7; y++) {
3126 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3128 mask[0][y][1] |= m_row_8;
3129 mask[0][y][2] |= m_row_4;
3130 // for odd lines, if the odd col is not being filtered,
3131 // skip odd row also:
3138 // if a/c are even row/col and b/d are odd, and d is skipped,
3139 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3140 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3141 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3143 mask[1][y][col_mask_id] |= m_col;
3146 mask[0][y][3] |= m_col;
3148 if (ss_h && (col_end & 1))
3149 mask[1][y][3] |= (t << (w - 1)) - t;
3151 mask[1][y][3] |= m_col;
3155 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3158 int mask_id = (tx == TX_8X8);
3159 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3160 int l2 = tx + ss_h - 1, step1d;
3161 int m_row = m_col & masks[l2];
3163 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3164 // 8wd loopfilter to prevent going off the visible edge.
3165 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3166 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3167 int m_row_8 = m_row - m_row_16;
3169 for (y = row_and_7; y < h + row_and_7; y++) {
3170 mask[0][y][0] |= m_row_16;
3171 mask[0][y][1] |= m_row_8;
3174 for (y = row_and_7; y < h + row_and_7; y++)
3175 mask[0][y][mask_id] |= m_row;
3180 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3181 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3182 mask[1][y][0] |= m_col;
3183 if (y - row_and_7 == h - 1)
3184 mask[1][y][1] |= m_col;
3186 for (y = row_and_7; y < h + row_and_7; y += step1d)
3187 mask[1][y][mask_id] |= m_col;
3189 } else if (tx != TX_4X4) {
3192 mask_id = (tx == TX_8X8) || (h == ss_v);
3193 mask[1][row_and_7][mask_id] |= m_col;
3194 mask_id = (tx == TX_8X8) || (w == ss_h);
3195 for (y = row_and_7; y < h + row_and_7; y++)
3196 mask[0][y][mask_id] |= t;
3198 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3200 for (y = row_and_7; y < h + row_and_7; y++) {
3201 mask[0][y][2] |= t4;
3202 mask[0][y][1] |= t8;
3204 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/*
 * Decode one leaf block of the partition tree at (row, col) (in 8x8-block
 * units): reconstruct it into the current frame, advance the per-block
 * residual/EOB pointers, and record loopfilter levels and edge masks.
 * NOTE(review): this listing is elided (the embedded original line numbers
 * are non-contiguous); comments describe only the statements visible here.
 */
3209 static void decode_b(AVCodecContext *ctx, int row, int col,
3210 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3211 enum BlockLevel bl, enum BlockPartition bp)
3213 VP9Context *s = ctx->priv_data;
3215 enum BlockSize bs = bl * 3 + bp;
3216 int bytesperpixel = s->bytesperpixel;
// w4/h4: block width/height in 4x4 units, from the size lookup table.
3217 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3219 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
// Clamp the motion-vector search range to 128 subpel (1/8-pel) units
// beyond the visible frame edges, relative to this block's position.
3225 s->min_mv.x = -(128 + col * 64);
3226 s->min_mv.y = -(128 + row * 64);
3227 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3228 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// Chroma tx size is one step smaller than luma when subsampling makes the
// block only half as wide/tall as the luma transform in that dimension.
3234 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3235 (s->ss_v && h4 * 2 == (1 << b->tx)));
// Coefficient decoding is dispatched on bytes-per-pixel (8 vs. >8 bit).
3240 if (bytesperpixel == 1) {
3241 has_coeffs = decode_coeffs_8bpp(ctx);
3243 has_coeffs = decode_coeffs_16bpp(ctx);
// Small inter block with no coded coefficients: mark it skipped in the
// above/left skip contexts so neighbours predict accordingly.
3245 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3247 memset(&s->above_skip_ctx[col], 1, w4);
3248 memset(&s->left_skip_ctx[s->row7], 1, h4);
// SPLAT_ZERO_CTX zeroes n bytes of context with the widest matching
// AV_ZERO* store; SPLAT_ZERO_YUV applies it to the luma nnz context and
// to both chroma planes (halved extent when that axis is subsampled).
3253 #define SPLAT_ZERO_CTX(v, n) \
3255 case 1: v = 0; break; \
3256 case 2: AV_ZERO16(&v); break; \
3257 case 4: AV_ZERO32(&v); break; \
3258 case 8: AV_ZERO64(&v); break; \
3259 case 16: AV_ZERO128(&v); break; \
3261 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3263 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3264 if (s->ss_##dir2) { \
3265 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3266 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3268 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3269 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
// Zero the above (per-column) and left (per-row) nnz contexts for the
// extent of this block, switching on its 4x4-unit width/height.
3274 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3275 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3276 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3277 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3280 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3281 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3282 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3283 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// Advance the coefficient/EOB cursors past this block; chroma extents are
// reduced by the subsampling shift. Presumably this is the early-exit
// (skip) path — TODO confirm against the elided surrounding control flow.
3289 s->block += w4 * h4 * 64 * bytesperpixel;
3290 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3291 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3292 s->eob += 4 * w4 * h4;
3293 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3294 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3300 // emulated overhangs if the stride of the target buffer can't hold. This
3301 // makes it possible to support emu-edge and so on even if we have large block
// emu[plane] is set when the block would write past the frame stride or
// below the last row; reconstruction then goes to temp buffers instead.
3303 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3304 (row + h4) > s->rows;
3305 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3306 (row + h4) > s->rows;
3308 s->dst[0] = s->tmp_y;
3311 s->dst[0] = f->data[0] + yoff;
3312 s->y_stride = f->linesize[0];
3315 s->dst[1] = s->tmp_uv[0];
3316 s->dst[2] = s->tmp_uv[1];
3319 s->dst[1] = f->data[1] + uvoff;
3320 s->dst[2] = f->data[2] + uvoff;
3321 s->uv_stride = f->linesize[1];
// Reconstruction dispatch: intra vs. inter, 8 vs. >8 bits per sample.
3324 if (s->s.h.bpp > 8) {
3325 intra_recon_16bpp(ctx, yoff, uvoff);
3327 intra_recon_8bpp(ctx, yoff, uvoff);
3330 if (s->s.h.bpp > 8) {
3331 inter_recon_16bpp(ctx);
3333 inter_recon_8bpp(ctx);
// Copy the visible part of the emulated luma block (stride 128 temp
// buffer) back into the frame, in power-of-two-wide chunks via dsp.mc.
3337 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3339 for (n = 0; o < w; n++) {
3344 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3345 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// Same copy-back for both chroma planes, with subsampled extents.
3351 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3352 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3354 for (n = s->ss_h; o < w; n++) {
3359 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3360 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3361 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3362 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3368 // pick filter level and find edges to apply filter to
// Level comes from the segment's lflvl table, indexed by intra/ref frame
// and by whether the block's last mode is non-zero motion.
3369 if (s->s.h.filter.level &&
3370 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3371 [b->mode[3] != ZEROMV]) > 0) {
3372 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3373 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
// Record the filter level for this block's 8x8 cells, then compute the
// luma (mask[0]) and, if subsampled, chroma (mask[1]) edge masks.
3375 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3376 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3377 if (s->ss_h || s->ss_v)
3378 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3379 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3380 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3381 b->uvtx, skip_inter);
// Lazily fill the limit LUTs for this level: limit is sharpened per the
// header sharpness, clamped to [1, 9 - sharp]; mblim is derived from it.
3383 if (!s->filter_lut.lim_lut[lvl]) {
3384 int sharp = s->s.h.filter.sharpness;
3388 limit >>= (sharp + 3) >> 2;
3389 limit = FFMIN(limit, 9 - sharp);
3391 limit = FFMAX(limit, 1);
3393 s->filter_lut.lim_lut[lvl] = limit;
3394 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// Advance residual/EOB cursors past this block (normal exit path).
3400 s->block += w4 * h4 * 64 * bytesperpixel;
3401 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3402 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3403 s->eob += 4 * w4 * h4;
3404 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3405 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
/*
 * Recursively decode a superblock subtree rooted at (row, col) at level bl,
 * reading the partition decision from the range coder. Handles the frame
 * edge cases where only H, V or SPLIT partitions are representable.
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3409 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3410 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3412 VP9Context *s = ctx->priv_data;
// Partition probability context: bit 0 from the above neighbour, bit 1
// from the left neighbour, each taken at this partition level.
3413 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3414 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// Keyframes/intra-only frames use the fixed default partition probs.
3415 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3416 s->prob.p.partition[bl][c];
3417 enum BlockPartition bp;
// Half block size in 8x8 units at this level (4, 2, 1 for 64/32/16).
3418 ptrdiff_t hbs = 4 >> bl;
3419 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3420 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3421 int bytesperpixel = s->bytesperpixel;
3424 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3425 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Whole block fits horizontally inside the frame.
3426 } else if (col + hbs < s->cols) { // FIXME why not <=?
3427 if (row + hbs < s->rows) { // FIXME why not <=?
// Fully inside the frame: all four partition types are possible.
3428 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3430 case PARTITION_NONE:
3431 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// PARTITION_H (presumably — case label elided): top then bottom half.
3434 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3435 yoff += hbs * 8 * y_stride;
3436 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3437 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// PARTITION_V (presumably — case label elided): left then right half.
3440 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3441 yoff += hbs * 8 * bytesperpixel;
3442 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3443 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3445 case PARTITION_SPLIT:
// Recurse into the four quadrants at the next level.
3446 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3447 decode_sb(ctx, row, col + hbs, lflvl,
3448 yoff + 8 * hbs * bytesperpixel,
3449 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3450 yoff += hbs * 8 * y_stride;
3451 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3452 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3453 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3454 yoff + 8 * hbs * bytesperpixel,
3455 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only SPLIT (left pair of quadrants) or H are possible;
// a single prob bit chooses between them.
3460 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3461 bp = PARTITION_SPLIT;
3462 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3463 decode_sb(ctx, row, col + hbs, lflvl,
3464 yoff + 8 * hbs * bytesperpixel,
3465 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3468 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Right edge: only SPLIT (top pair of quadrants) or V are possible.
3470 } else if (row + hbs < s->rows) { // FIXME why not <=?
3471 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3472 bp = PARTITION_SPLIT;
3473 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3474 yoff += hbs * 8 * y_stride;
3475 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3476 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3479 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Bottom-right corner: forced SPLIT of the top-left quadrant only.
3482 bp = PARTITION_SPLIT;
3483 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// Count the decision for backward probability adaptation.
3485 s->counts.partition[bl][c][bp]++;
/*
 * Second-pass variant of decode_sb: re-walks the partition tree using the
 * per-block level/partition (b->bl, b->bp) stored during the first pass,
 * without touching the range coder.
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3488 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3489 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3491 VP9Context *s = ctx->priv_data;
3493 ptrdiff_t hbs = 4 >> bl;
3494 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3495 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3496 int bytesperpixel = s->bytesperpixel;
// Smallest level reached: must be an 8x8 leaf.
3499 av_assert2(b->bl == BL_8X8);
3500 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
// The stored block was decided at this level: replay its partition.
3501 } else if (s->b->bl == bl) {
3502 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3503 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3504 yoff += hbs * 8 * y_stride;
3505 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3506 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3507 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3508 yoff += hbs * 8 * bytesperpixel;
3509 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3510 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// Otherwise it was split: recurse, visiting only in-frame quadrants.
3513 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3514 if (col + hbs < s->cols) { // FIXME why not <=?
3515 if (row + hbs < s->rows) {
3516 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3517 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3518 yoff += hbs * 8 * y_stride;
3519 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3520 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3521 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3522 yoff + 8 * hbs * bytesperpixel,
3523 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only the right quadrant of the top half remains.
3525 yoff += hbs * 8 * bytesperpixel;
3526 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3527 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
// Right edge: only the bottom-left quadrant remains.
3529 } else if (row + hbs < s->rows) {
3530 yoff += hbs * 8 * y_stride;
3531 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3532 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/*
 * Apply the loopfilter to vertical edges (edges between horizontally
 * adjacent blocks) of one plane of a 64x64 superblock, driven by the
 * per-row edge masks built in mask_edges. mask[y][0..2] select 16/8/4-px
 * wide filters on 8-px-aligned edges; mask[y][3] selects 4-px filters on
 * inner (4-px-offset) edges. Where the same filter applies to two
 * vertically stacked 8-px edges, the paired/16-wide variants are used.
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3537 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3538 uint8_t *lvl, uint8_t (*mask)[4],
3539 uint8_t *dst, ptrdiff_t ls)
3541 int y, x, bytesperpixel = s->bytesperpixel;
3543 // filter edges between columns (e.g. block1 | block2)
// Two mask rows are consumed per iteration (one per 8-px luma row pair;
// step doubles when the plane is vertically subsampled).
3544 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3545 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3546 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3547 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3548 unsigned hm = hm1 | hm2 | hm13 | hm23;
// Walk column bits left to right until no higher bit remains set.
3550 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
// L: filter level for this cell; H = hblim (level >> 4), E/I from LUTs.
3553 int L = *l, H = L >> 4;
3554 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3556 if (hmask1[0] & x) {
3557 if (hmask2[0] & x) {
// Both stacked rows want the 16-wide filter and must share a level.
3558 av_assert2(l[8 << ss_v] == L);
3559 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3561 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3563 } else if (hm2 & x) {
// Mixed widths on the two stacked edges: pack both levels' limits
// into the upper byte and use the mix2 kernel.
3566 E |= s->filter_lut.mblim_lut[L] << 8;
3567 I |= s->filter_lut.lim_lut[L] << 8;
3568 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3570 [0](ptr, ls, E, I, H);
3572 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3573 [0](ptr, ls, E, I, H);
3575 } else if (hm2 & x) {
// Only the lower of the two stacked edges is filtered.
3576 int L = l[8 << ss_v], H = L >> 4;
3577 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3579 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3580 [0](ptr + 8 * ls, ls, E, I, H);
// Inner 4-px edges (mask index 3), offset 4 px into the 8-px cell.
3588 int L = *l, H = L >> 4;
3589 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3594 E |= s->filter_lut.mblim_lut[L] << 8;
3595 I |= s->filter_lut.lim_lut[L] << 8;
3596 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3598 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3600 } else if (hm23 & x) {
3601 int L = l[8 << ss_v], H = L >> 4;
3602 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3604 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
/*
 * Apply the loopfilter to horizontal edges (edges between vertically
 * adjacent blocks) of one plane of a 64x64 superblock. Mirrors
 * filter_plane_cols but pairs horizontally adjacent 8-px edges instead of
 * vertically stacked ones, so neighbouring levels live at l[1 + ss_h] and
 * the paired filter writes at a +8*bytesperpixel x offset.
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3612 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3613 uint8_t *lvl, uint8_t (*mask)[4],
3614 uint8_t *dst, ptrdiff_t ls)
3616 int y, x, bytesperpixel = s->bytesperpixel;
3619 // filter edges between rows (e.g. ------)
3621 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3622 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3623 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// Two adjacent column bits are consumed per iteration (x and
// x << (1 + ss_h)), hence the doubled step and pointer advances.
3625 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3628 int L = *l, H = L >> 4;
3629 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3632 if (vmask[0] & (x << (1 + ss_h))) {
// Both adjacent edges take the 16-wide filter; levels must match.
3633 av_assert2(l[1 + ss_h] == L);
3634 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3636 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3638 } else if (vm & (x << (1 + ss_h))) {
// Mixed widths: pack the second edge's limits into the upper byte.
3641 E |= s->filter_lut.mblim_lut[L] << 8;
3642 I |= s->filter_lut.lim_lut[L] << 8;
3643 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3644 [!!(vmask[1] & (x << (1 + ss_h)))]
3645 [1](ptr, ls, E, I, H);
3647 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3648 [1](ptr, ls, E, I, H);
3650 } else if (vm & (x << (1 + ss_h))) {
// Only the right-hand edge of the pair is filtered.
3651 int L = l[1 + ss_h], H = L >> 4;
3652 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3654 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3655 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// Inner 4-px edges (mask index 3), offset 4 rows into the 8-px cell.
3660 int L = *l, H = L >> 4;
3661 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3663 if (vm3 & (x << (1 + ss_h))) {
3666 E |= s->filter_lut.mblim_lut[L] << 8;
3667 I |= s->filter_lut.lim_lut[L] << 8;
3668 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3670 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3672 } else if (vm3 & (x << (1 + ss_h))) {
3673 int L = l[1 + ss_h], H = L >> 4;
3674 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3676 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
/*
 * Run the loopfilter over one 64x64 superblock: columns then rows for the
 * luma plane, then for both chroma planes using the shared UV mask set
 * (mask[1] when either axis is subsampled, otherwise the luma masks).
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3689 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3690 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3692 VP9Context *s = ctx->priv_data;
3693 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3694 uint8_t *dst = f->data[0] + yoff;
3695 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
// Chroma reuses the luma masks for 4:4:4, the subsampled set otherwise.
3696 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3699 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3700 // if you think of them as acting on a 8x8 block max, we can interleave
3701 // each v/h within the single x loop, but that only works if we work on
3702 // 8 pixel blocks, and we won't always do that (we want at least 16px
3703 // to use SSE2 optimizations, perhaps 32 for AVX2)
// Column edges must be filtered before row edges (spec-mandated order).
3705 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3706 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3708 for (p = 0; p < 2; p++) {
3709 dst = f->data[1 + p] + uvoff;
3710 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3711 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/*
 * Compute the [start, end) extent of tile number idx, in 8x8-block units.
 *
 * The n superblocks are divided into 2^log2_n tiles; the boundary of tile
 * idx in superblock units is (idx * n) >> log2_n, clamped to n, and each
 * superblock spans 8 blocks (hence the << 3).
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = ( idx      * n) >> log2_n;
    int last_sb  = ((idx + 1) * n) >> log2_n;

    *start = (first_sb < n ? first_sb : n) << 3;
    *end   = (last_sb  < n ? last_sb  : n) << 3;
}
/*
 * Backward-adapt one binary probability *p toward the observed counts
 * (ct0 = "bit 0" events, ct1 = "bit 1" events): *p moves from its old
 * value toward the empirical estimate p2, weighted by update_factor
 * scaled down for small sample counts (saturated at max_count).
 * NOTE(review): elided listing — the load of p1 (presumably p1 = *p) and
 * an early return for ct == 0 are not visible here; confirm in the full
 * source before relying on this description.
 */
3723 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3724 int max_count, int update_factor)
3726 unsigned ct = ct0 + ct1, p2, p1;
// Scale the blend weight by how many events were seen, capped at max_count.
3731 update_factor = FASTDIV(update_factor * FFMIN(ct, max_count), max_count);
// p2 = round(256 * ct0 / ct): empirical probability of the 0 branch,
// kept away from the illegal endpoints by clipping to [1, 255].
3733 p2 = ((((int64_t) ct0) << 8) + (ct >> 1)) / ct;
3734 p2 = av_clip(p2, 1, 255);
3736 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3737 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/*
 * End-of-frame backward probability adaptation: fold this frame's symbol
 * counts (s->counts) into the frame context s->prob_ctx[framectxid],
 * covering coefficients, skip, intra/inter, reference, partition, tx
 * size, interp filter, MV modes and MV components, and the intra mode
 * trees. Keyframes/intra-only frames adapt only coefficients and copy
 * the remaining probabilities unchanged.
 * NOTE(review): elided listing; only the visible statements are annotated.
 */
3740 static void adapt_probs(VP9Context *s)
3743 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
// Coefficients adapt faster (112/256) after a context reset.
3744 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
// Coefficient probabilities: [tx size][plane type][inter][band][ctx].
3747 for (i = 0; i < 4; i++)
3748 for (j = 0; j < 2; j++)
3749 for (k = 0; k < 2; k++)
3750 for (l = 0; l < 6; l++)
3751 for (m = 0; m < 6; m++) {
3752 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3753 unsigned *e = s->counts.eob[i][j][k][l][m];
3754 unsigned *c = s->counts.coef[i][j][k][l][m];
3756 if (l == 0 && m >= 3) // dc only has 3 pt
3759 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3760 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3761 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// Intra frames: keep the inter-related probabilities as decoded.
3764 if (s->s.h.keyframe || s->s.h.intraonly) {
3765 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3766 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3767 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3768 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// skip flag
3773 for (i = 0; i < 3; i++)
3774 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// intra/inter flag
3777 for (i = 0; i < 4; i++)
3778 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// compound prediction mode selection (only when switchable)
3781 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3782 for (i = 0; i < 5; i++)
3783 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// compound reference selection
3787 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3788 for (i = 0; i < 5; i++)
3789 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3790 s->counts.comp_ref[i][1], 20, 128);
// single reference selection (two-bit tree)
3793 if (s->s.h.comppredmode != PRED_COMPREF) {
3794 for (i = 0; i < 5; i++) {
3795 uint8_t *pp = p->single_ref[i];
3796 unsigned (*c)[2] = s->counts.single_ref[i];
3798 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3799 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3803 // block partitioning
3804 for (i = 0; i < 4; i++)
3805 for (j = 0; j < 4; j++) {
3806 uint8_t *pp = p->partition[i][j];
3807 unsigned *c = s->counts.partition[i][j];
3809 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3810 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3811 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// tx size selection (only when switchable)
3815 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3816 for (i = 0; i < 2; i++) {
3817 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3819 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3820 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3821 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3822 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3823 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3824 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3828 // interpolation filter
3829 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3830 for (i = 0; i < 4; i++) {
3831 uint8_t *pp = p->filter[i];
3832 unsigned *c = s->counts.filter[i];
3834 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3835 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// inter mode tree (ZEROMV/NEARESTMV/NEARMV/NEWMV ordering per the tree)
3840 for (i = 0; i < 7; i++) {
3841 uint8_t *pp = p->mv_mode[i];
3842 unsigned *c = s->counts.mv_mode[i];
3844 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3845 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3846 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// MV joint (which of x/y components are non-zero)
3851 uint8_t *pp = p->mv_joint;
3852 unsigned *c = s->counts.mv_joint;
3854 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3855 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3856 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// per-component (x/y) MV probabilities
3860 for (i = 0; i < 2; i++) {
3862 unsigned *c, (*c2)[2], sum;
3864 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3865 s->counts.mv_comp[i].sign[1], 20, 128);
// Magnitude class tree: sum is progressively reduced as the tree
// descends (elided lines presumably subtract c[k] between calls).
3867 pp = p->mv_comp[i].classes;
3868 c = s->counts.mv_comp[i].classes;
3869 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3870 adapt_prob(&pp[0], c[0], sum, 20, 128);
3872 adapt_prob(&pp[1], c[1], sum, 20, 128);
3874 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3875 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3877 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3878 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3880 adapt_prob(&pp[6], c[6], sum, 20, 128);
3881 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3882 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3883 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3885 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3886 s->counts.mv_comp[i].class0[1], 20, 128);
// integer offset bits
3887 pp = p->mv_comp[i].bits;
3888 c2 = s->counts.mv_comp[i].bits;
3889 for (j = 0; j < 10; j++)
3890 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// fractional (1/4-pel) part, class0 and general
3892 for (j = 0; j < 2; j++) {
3893 pp = p->mv_comp[i].class0_fp[j];
3894 c = s->counts.mv_comp[i].class0_fp[j];
3895 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3896 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3897 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3899 pp = p->mv_comp[i].fp;
3900 c = s->counts.mv_comp[i].fp;
3901 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3902 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3903 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// high-precision (1/8-pel) bit, only if enabled in the header
3905 if (s->s.h.highprecisionmvs) {
3906 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3907 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3908 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3909 s->counts.mv_comp[i].hp[1], 20, 128);
// luma intra mode tree, walked in tree order with the running sum
// reduced at each level (some subtraction lines elided).
3914 for (i = 0; i < 4; i++) {
3915 uint8_t *pp = p->y_mode[i];
3916 unsigned *c = s->counts.y_mode[i], sum, s2;
3918 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3919 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3920 sum -= c[TM_VP8_PRED];
3921 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3922 sum -= c[VERT_PRED];
3923 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3924 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3926 adapt_prob(&pp[3], s2, sum, 20, 128);
3928 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3929 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3930 sum -= c[DIAG_DOWN_LEFT_PRED];
3931 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3932 sum -= c[VERT_LEFT_PRED];
3933 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3934 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// chroma intra mode tree: identical walk, keyed by the luma mode.
3938 for (i = 0; i < 10; i++) {
3939 uint8_t *pp = p->uv_mode[i];
3940 unsigned *c = s->counts.uv_mode[i], sum, s2;
3942 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3943 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3944 sum -= c[TM_VP8_PRED];
3945 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3946 sum -= c[VERT_PRED];
3947 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3948 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3950 adapt_prob(&pp[3], s2, sum, 20, 128);
3952 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3953 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3954 sum -= c[DIAG_DOWN_LEFT_PRED];
3955 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3956 sum -= c[VERT_LEFT_PRED];
3957 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3958 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/*
 * Release the decoder's internal per-stream allocations (intra prediction
 * line buffer, block struct array, residual block base). av_freep also
 * NULLs the pointers, so this is safe to call repeatedly.
 */
3962 static void free_buffers(VP9Context *s)
3964 av_freep(&s->intra_pred_data[0]);
3965 av_freep(&s->b_base);
3966 av_freep(&s->block_base);
/*
 * Codec close callback: release the three internal frames and all 8
 * reference slots (both the current refs and the staged next_refs).
 * NOTE(review): the function tail (presumably free_buffers() and the
 * return) is elided from this listing.
 */
3969 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3971 VP9Context *s = ctx->priv_data;
3974 for (i = 0; i < 3; i++) {
3975 if (s->s.frames[i].tf.f->buf[0])
3976 vp9_unref_frame(ctx, &s->s.frames[i]);
3977 av_frame_free(&s->s.frames[i].tf.f);
3979 for (i = 0; i < 8; i++) {
3980 if (s->s.refs[i].f->buf[0])
3981 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3982 av_frame_free(&s->s.refs[i].f);
3983 if (s->next_refs[i].f->buf[0])
3984 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3985 av_frame_free(&s->next_refs[i].f);
/*
 * Main decode entry point for one packet: parse the frame header, handle
 * the show-existing-frame shortcut, set up frames/references, run the
 * (possibly two-pass) tile decode loop with per-superblock-row
 * loopfiltering and frame-thread progress reporting, then rotate the
 * reference slots and output the frame if visible.
 * NOTE(review): this listing is elided (non-contiguous original lines);
 * comments describe only the statements that are visible here.
 */
3995 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3996 int *got_frame, AVPacket *pkt)
3998 const uint8_t *data = pkt->data;
3999 int size = pkt->size;
4000 VP9Context *s = ctx->priv_data;
4001 int res, tile_row, tile_col, i, ref, row, col;
// The segmentation-map reference frame can be kept if this frame does not
// update the segmentation map.
4002 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
4003 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
4004 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
4008 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0: "show existing frame" — re-emit reference `ref` untouched.
4010 } else if (res == 0) {
4011 if (!s->s.refs[ref].f->buf[0]) {
4012 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4013 return AVERROR_INVALIDDATA;
4015 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
4017 ((AVFrame *)frame)->pts = pkt->pts;
4019 FF_DISABLE_DEPRECATION_WARNINGS
4020 ((AVFrame *)frame)->pkt_pts = pkt->pts;
4021 FF_ENABLE_DEPRECATION_WARNINGS
4023 ((AVFrame *)frame)->pkt_dts = pkt->dts;
// Even when showing an existing frame, refresh next_refs from the
// current reference set so the slot state stays consistent.
4024 for (i = 0; i < 8; i++) {
4025 if (s->next_refs[i].f->buf[0])
4026 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4027 if (s->s.refs[i].f->buf[0] &&
4028 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
// Rotate internal frames: CUR_FRAME becomes the segmap/mvpair reference
// for the next frame (unless intra/keyframe/error-resilient), then a
// fresh CUR_FRAME buffer is allocated.
4037 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4038 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4039 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4040 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4041 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4044 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4045 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR])
4046 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4047 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4049 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4050 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME])
4051 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4053 f = s->s.frames[CUR_FRAME].tf.f;
4054 f->key_frame = s->s.h.keyframe;
4055 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4056 ls_y = f->linesize[0];
4057 ls_uv =f->linesize[1];
// Drop the segmap reference if the frame size changed (its map would
// no longer line up with the new geometry).
4059 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4060 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4061 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4062 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
// Stage next_refs: slots flagged in refreshrefmask point at the new
// frame, the rest keep their current reference.
4066 for (i = 0; i < 8; i++) {
4067 if (s->next_refs[i].f->buf[0])
4068 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4069 if (s->s.h.refreshrefmask & (1 << i)) {
4070 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4071 } else if (s->s.refs[i].f->buf[0]) {
4072 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
// Hardware acceleration path: hand the whole packet to the hwaccel.
4079 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4082 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4085 res = ctx->hwaccel->end_frame(ctx);
4091 // main tile decode loop
4092 bytesperpixel = s->bytesperpixel;
// Reset the above-row prediction contexts for the whole frame width.
4093 memset(s->above_partition_ctx, 0, s->cols);
4094 memset(s->above_skip_ctx, 0, s->cols);
4095 if (s->s.h.keyframe || s->s.h.intraonly) {
4096 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4098 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4100 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4101 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4102 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4103 memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass decoding is used under frame threading when the frame both
// refreshes the context and is not in parallel mode.
4104 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4105 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4106 if ((res = update_block_buffers(ctx)) < 0) {
4107 av_log(ctx, AV_LOG_ERROR,
4108 "Failed to allocate block buffers\n");
// Parallel mode with context refresh: commit the forward-updated
// probabilities now and unblock consumer threads early.
4111 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4114 for (i = 0; i < 4; i++) {
4115 for (j = 0; j < 2; j++)
4116 for (k = 0; k < 2; k++)
4117 for (l = 0; l < 6; l++)
4118 for (m = 0; m < 6; m++)
4119 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4120 s->prob.coef[i][j][k][l][m], 3);
4121 if (s->s.h.txfmmode == i)
4124 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4125 ff_thread_finish_setup(ctx);
4126 } else if (!s->s.h.refreshctx) {
4127 ff_thread_finish_setup(ctx);
// Rewind the per-frame residual/EOB cursors at the start of each pass.
4133 s->block = s->block_base;
4134 s->uvblock[0] = s->uvblock_base[0];
4135 s->uvblock[1] = s->uvblock_base[1];
4136 s->eob = s->eob_base;
4137 s->uveob[0] = s->uveob_base[0];
4138 s->uveob[1] = s->uveob_base[1];
// Parse tile headers: each tile except the last carries a 32-bit size.
4140 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4141 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4142 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4144 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4147 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4148 tile_row == s->s.h.tiling.tile_rows - 1) {
4151 tile_size = AV_RB32(data);
4155 if (tile_size > size) {
// On bitstream errors, mark the frame fully decoded so waiting
// frame threads are not left blocked.
4156 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4157 return AVERROR_INVALIDDATA;
4159 res = ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4162 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4163 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4164 return AVERROR_INVALIDDATA;
// Decode superblock rows; tiles in a row share the row loop so the
// loopfilter can run once per completed superblock row.
4171 for (row = s->tile_row_start; row < s->tile_row_end;
4172 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4173 struct VP9Filter *lflvl_ptr = s->lflvl;
4174 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4176 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4177 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4178 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
// Reset the left-edge contexts at each tile column boundary.
4181 memset(s->left_partition_ctx, 0, 8);
4182 memset(s->left_skip_ctx, 0, 8);
4183 if (s->s.h.keyframe || s->s.h.intraonly) {
4184 memset(s->left_mode_ctx, DC_PRED, 16);
4186 memset(s->left_mode_ctx, NEARESTMV, 8);
4188 memset(s->left_y_nnz_ctx, 0, 16);
4189 memset(s->left_uv_nnz_ctx, 0, 32);
4190 memset(s->left_segpred_ctx, 0, 8);
// Swap in this tile's range coder state for the row.
4192 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4195 for (col = s->tile_col_start;
4196 col < s->tile_col_end;
4197 col += 8, yoff2 += 64 * bytesperpixel,
4198 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4199 // FIXME integrate with lf code (i.e. zero after each
4200 // use, similar to invtxfm coefficients, or similar)
4202 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// Pass 2 replays stored decisions; pass 0/1 parse the bitstream.
4206 decode_sb_mem(ctx, row, col, lflvl_ptr,
4207 yoff2, uvoff2, BL_64X64);
4209 decode_sb(ctx, row, col, lflvl_ptr,
4210 yoff2, uvoff2, BL_64X64);
// Save the range coder state back for this tile's next row.
4214 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4222 // backup pre-loopfilter reconstruction data for intra
4223 // prediction of next row of sb64s
4224 if (row + 8 < s->rows) {
4225 memcpy(s->intra_pred_data[0],
4226 f->data[0] + yoff + 63 * ls_y,
4227 8 * s->cols * bytesperpixel);
4228 memcpy(s->intra_pred_data[1],
4229 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4230 8 * s->cols * bytesperpixel >> s->ss_h);
4231 memcpy(s->intra_pred_data[2],
4232 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4233 8 * s->cols * bytesperpixel >> s->ss_h);
4236 // loopfilter one row
4237 if (s->s.h.filter.level) {
4240 lflvl_ptr = s->lflvl;
4241 for (col = 0; col < s->cols;
4242 col += 8, yoff2 += 64 * bytesperpixel,
4243 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4244 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4248 // FIXME maybe we can make this more finegrained by running the
4249 // loopfilter per-block instead of after each sbrow
4250 // In fact that would also make intra pred left preparation easier?
4251 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
// After pass 1 (non-parallel context refresh): adapt probabilities
// and only then release consumer threads.
4255 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4257 ff_thread_finish_setup(ctx);
4259 } while (s->pass++ == 1);
4260 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
// Commit next_refs into the active reference set.
4264 for (i = 0; i < 8; i++) {
4265 if (s->s.refs[i].f->buf[0])
4266 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4267 if (s->next_refs[i].f->buf[0] &&
4268 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
// Output the decoded frame unless the header marked it invisible.
4272 if (!s->s.h.invisible) {
4273 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
4281 static void vp9_decode_flush(AVCodecContext *ctx)
4283 VP9Context *s = ctx->priv_data;
4286 for (i = 0; i < 3; i++)
4287 vp9_unref_frame(ctx, &s->s.frames[i]);
4288 for (i = 0; i < 8; i++)
4289 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4292 static int init_frames(AVCodecContext *ctx)
4294 VP9Context *s = ctx->priv_data;
4297 for (i = 0; i < 3; i++) {
4298 s->s.frames[i].tf.f = av_frame_alloc();
4299 if (!s->s.frames[i].tf.f) {
4300 vp9_decode_free(ctx);
4301 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4302 return AVERROR(ENOMEM);
4305 for (i = 0; i < 8; i++) {
4306 s->s.refs[i].f = av_frame_alloc();
4307 s->next_refs[i].f = av_frame_alloc();
4308 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4309 vp9_decode_free(ctx);
4310 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4311 return AVERROR(ENOMEM);
4318 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4320 VP9Context *s = ctx->priv_data;
4322 ctx->internal->allocate_progress = 1;
4324 s->s.h.filter.sharpness = -1;
4326 return init_frames(ctx);
4330 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4332 return init_frames(avctx);
4335 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4338 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4340 for (i = 0; i < 3; i++) {
4341 if (s->s.frames[i].tf.f->buf[0])
4342 vp9_unref_frame(dst, &s->s.frames[i]);
4343 if (ssrc->s.frames[i].tf.f->buf[0]) {
4344 if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
4348 for (i = 0; i < 8; i++) {
4349 if (s->s.refs[i].f->buf[0])
4350 ff_thread_release_buffer(dst, &s->s.refs[i]);
4351 if (ssrc->next_refs[i].f->buf[0]) {
4352 if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
4357 s->s.h.invisible = ssrc->s.h.invisible;
4358 s->s.h.keyframe = ssrc->s.h.keyframe;
4359 s->s.h.intraonly = ssrc->s.h.intraonly;
4360 s->ss_v = ssrc->ss_v;
4361 s->ss_h = ssrc->ss_h;
4362 s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4363 s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4364 s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4365 s->bytesperpixel = ssrc->bytesperpixel;
4366 s->gf_fmt = ssrc->gf_fmt;
4369 s->s.h.bpp = ssrc->s.h.bpp;
4370 s->bpp_index = ssrc->bpp_index;
4371 s->pix_fmt = ssrc->pix_fmt;
4372 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4373 memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4374 memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4375 sizeof(s->s.h.segmentation.feat));
4381 AVCodec ff_vp9_decoder = {
4383 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4384 .type = AVMEDIA_TYPE_VIDEO,
4385 .id = AV_CODEC_ID_VP9,
4386 .priv_data_size = sizeof(VP9Context),
4387 .init = vp9_decode_init,
4388 .close = vp9_decode_free,
4389 .decode = vp9_decode_frame,
4390 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4391 .flush = vp9_decode_flush,
4392 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4393 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4394 .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),