2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 #include "libavutil/avassert.h"
34 #include "libavutil/pixdesc.h"
36 #define VP9_SYNCCODE 0x498342
/* Per-frame bookkeeping: the reference-counted frame buffer plus one
 * extradata buffer that packs the segmentation map (one byte per 8x8
 * block) followed by the per-block motion-vector/reference pairs.
 * NOTE(review): this is a sampled paste — the struct's closing lines
 * (and the start of the following VP9Filter struct) are not visible. */
73 typedef struct VP9Frame {
75 AVBufferRef *extradata;
76 uint8_t *segmentation_map;
77 struct VP9mvrefPair *mv;
/* Loop-filter mask fragment (belongs to a VP9Filter struct whose
 * declaration line is outside this view — TODO confirm). */
83 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
84 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
/* Per-block decode state: segment id, intra/inter and compound flags,
 * reference indices, prediction modes, motion vectors per sub-block,
 * transform sizes and the partition type. */
87 typedef struct VP9Block {
88 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
89 enum FilterMode filter;
90 VP56mv mv[4 /* b_idx */][2 /* ref */];
92 enum TxfmMode tx, uvtx;
94 enum BlockPartition bp;
/* Main decoder context. Groups frame-header fields, tiling info,
 * probability/counts tables, the left/above prediction context caches
 * and block-reconstruction scratch buffers.
 * NOTE(review): sampled paste — many member lines and the nested
 * struct open/close braces are missing from this view. */
97 typedef struct VP9Context {
104 VP9Block *b_base, *b;
/* current position in 8x8-block units; the *7 variants are presumably
 * the low 3 bits (position within a 64x64 superblock) — TODO confirm */
106 int row, row7, col, col7;
108 ptrdiff_t y_stride, uv_stride;
/* frame-header flags */
111 uint8_t keyframe, last_keyframe;
112 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
114 uint8_t use_last_frame_mvs;
119 uint8_t refreshrefmask;
120 uint8_t highprecisionmvs;
121 enum FilterMode filtermode;
122 uint8_t allowcompinter;
125 uint8_t parallelmode;
129 uint8_t varcompref[2];
130 ThreadFrame refs[8], next_refs[8];
132 #define REF_FRAME_MVPAIR 1
133 #define REF_FRAME_SEGMAP 2
140 uint8_t mblim_lut[64];
148 int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
150 #define MAX_SEGMENT 8
154 uint8_t absolute_vals;
156 uint8_t ignore_refmap;
161 uint8_t skip_enabled;
/* tiling layout (log2 counts and derived col/row extents) */
170 unsigned log2_tile_cols, log2_tile_rows;
171 unsigned tile_cols, tile_rows;
172 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
174 unsigned sb_cols, sb_rows, rows, cols;
177 uint8_t coef[4][2][2][6][6][3];
181 uint8_t coef[4][2][2][6][6][11];
/* symbol-occurrence counters used for backward probability adaptation */
186 unsigned y_mode[4][10];
187 unsigned uv_mode[10][10];
188 unsigned filter[4][3];
189 unsigned mv_mode[7][4];
190 unsigned intra[4][2];
192 unsigned single_ref[5][2][2];
193 unsigned comp_ref[5][2];
194 unsigned tx32p[2][4];
195 unsigned tx16p[2][3];
198 unsigned mv_joint[4];
201 unsigned classes[11];
203 unsigned bits[10][2];
204 unsigned class0_fp[2][4];
206 unsigned class0_hp[2];
209 unsigned partition[4][4][4];
210 unsigned coef[4][2][2][6][6][3];
211 unsigned eob[4][2][2][6][6][2];
213 enum TxfmMode txfmmode;
214 enum CompPredMode comppredmode;
216 // contextual (left/above) cache
217 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
218 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
219 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
220 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
221 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
222 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
223 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
224 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
225 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
226 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
227 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
228 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
229 uint8_t *above_partition_ctx;
230 uint8_t *above_mode_ctx;
231 // FIXME maybe merge some of the below in a flags field?
232 uint8_t *above_y_nnz_ctx;
233 uint8_t *above_uv_nnz_ctx[2];
234 uint8_t *above_skip_ctx; // 1bit
235 uint8_t *above_txfm_ctx; // 2bit
236 uint8_t *above_segpred_ctx; // 1bit
237 uint8_t *above_intra_ctx; // 1bit
238 uint8_t *above_comp_ctx; // 1bit
239 uint8_t *above_ref_ctx; // 2bit
240 uint8_t *above_filter_ctx;
241 VP56mv (*above_mv_ctx)[2];
/* the above_* pointers are carved out of one allocation in update_size() */
244 uint8_t *intra_pred_data[3];
245 struct VP9Filter *lflvl;
246 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
248 // block reconstruction intermediates
249 int block_alloc_using_2pass;
250 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
251 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
252 struct { int x, y; } min_mv, max_mv;
253 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
254 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
/* per-reference 14-bit fixed-point MV scaling factors (set in
 * decode_frame_header when ref and frame dimensions differ) */
255 uint16_t mvscale[3][2];
256 uint8_t mvstep[3][2];
/* Block width/height lookup per block size, in two unit scales
 * (presumably 4x4-block units and 8x8-block units — TODO confirm);
 * index order follows the N_BS_SIZES enum, largest (64x64) first. */
259 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
261 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
262 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
264 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
265 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
/* Allocate a frame buffer plus its extradata side buffer.
 * The extradata holds sz bytes of segmentation map followed by
 * sz VP9mvrefPair entries, where sz = 64 * sb_cols * sb_rows
 * (one entry per 8x8 block). On extradata allocation failure the
 * already-acquired frame buffer is released before returning. */
269 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
271 VP9Context *s = ctx->priv_data;
274 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
276 sz = 64 * s->sb_cols * s->sb_rows;
277 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
278 ff_thread_release_buffer(ctx, &f->tf);
279 return AVERROR(ENOMEM);
/* segmentation map occupies the first sz bytes, mv pairs follow */
282 f->segmentation_map = f->extradata->data;
283 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
290 ff_thread_release_buffer(ctx, &f->tf);
291 av_buffer_unref(&f->extradata);
/* Create a new reference to src in dst: ref the frame buffer, then ref
 * the shared extradata buffer. If the extradata ref fails, the partially
 * set-up dst is cleaned with vp9_unref_frame() and ENOMEM returned.
 * The segmentation_map/uses_2pass pointers alias src's (shared buffer). */
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
298 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
300 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301 vp9_unref_frame(ctx, dst);
302 return AVERROR(ENOMEM);
305 dst->segmentation_map = src->segmentation_map;
307 dst->uses_2pass = src->uses_2pass;
/* (Re)initialize per-frame-size state: superblock/row/col counts, one
 * combined allocation carved into the various above_* context arrays,
 * intra prediction rows and the loop-filter level array. Early-returns
 * (line not visible here) when size and pixfmt are unchanged. Also
 * re-inits the DSP contexts when the bit depth changed. */
312 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
314 VP9Context *s = ctx->priv_data;
316 int bytesperpixel = s->bytesperpixel;
318 av_assert0(w > 0 && h > 0);
/* nothing to do if geometry and format are unchanged */
320 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
/* 64-pixel superblocks, 8-pixel block grid */
326 s->sb_cols = (w + 63) >> 6;
327 s->sb_rows = (h + 63) >> 6;
328 s->cols = (w + 7) >> 3;
329 s->rows = (h + 7) >> 3;
/* carve sub-arrays out of the single allocation below; 'n' is the
 * per-superblock-column element count */
331 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
332 av_freep(&s->intra_pred_data[0]);
333 // FIXME we slightly over-allocate here for subsampled chroma, but a little
334 // bit of padding shouldn't affect performance...
335 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
336 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
338 return AVERROR(ENOMEM);
339 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
340 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
341 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
342 assign(s->above_y_nnz_ctx, uint8_t *, 16);
343 assign(s->above_mode_ctx, uint8_t *, 16);
344 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
345 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
346 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
347 assign(s->above_partition_ctx, uint8_t *, 8);
348 assign(s->above_skip_ctx, uint8_t *, 8);
349 assign(s->above_txfm_ctx, uint8_t *, 8);
350 assign(s->above_segpred_ctx, uint8_t *, 8);
351 assign(s->above_intra_ctx, uint8_t *, 8);
352 assign(s->above_comp_ctx, uint8_t *, 8);
353 assign(s->above_ref_ctx, uint8_t *, 8);
354 assign(s->above_filter_ctx, uint8_t *, 8);
355 assign(s->lflvl, struct VP9Filter *, 1);
358 // these will be re-allocated a little later
359 av_freep(&s->b_base);
360 av_freep(&s->block_base);
/* bit depth changed: re-init bpp-dependent DSP function pointers */
362 if (s->bpp != s->last_bpp) {
363 ff_vp9dsp_init(&s->dsp, s->bpp);
364 ff_videodsp_init(&s->vdsp, s->bpp);
365 s->last_bpp = s->bpp;
/* Allocate (or keep) the coefficient/EOB scratch buffers. In 2-pass
 * mode one buffer set per superblock is needed (sbs = sb_cols*sb_rows);
 * in 1-pass mode a single set suffices. The uvblock/eob pointers are
 * carved out of one block_base allocation; chroma sizes are scaled by
 * the subsampling shifts ss_h/ss_v. */
371 static int update_block_buffers(AVCodecContext *ctx)
373 VP9Context *s = ctx->priv_data;
374 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
/* keep existing buffers when the pass mode did not change */
376 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
380 av_free(s->block_base);
381 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
382 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
383 if (s->frames[CUR_FRAME].uses_2pass) {
384 int sbs = s->sb_cols * s->sb_rows;
386 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
387 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
388 16 * 16 + 2 * chroma_eobs) * sbs);
389 if (!s->b_base || !s->block_base)
390 return AVERROR(ENOMEM);
391 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
392 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
393 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
394 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
395 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
/* 1-pass: a single block's worth of scratch */
397 s->b_base = av_malloc(sizeof(VP9Block));
398 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
399 16 * 16 + 2 * chroma_eobs);
400 if (!s->b_base || !s->block_base)
401 return AVERROR(ENOMEM);
402 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
403 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
404 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
405 s->uveob_base[0] = s->eob_base + 16 * 16;
406 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
408 s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
413 // for some reason the sign bit is at the end, not the start, of a bit sequence
414 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
416 int v = get_bits(gb, n);
417 return get_bits1(gb) ? -v : v;
/* Inverse of the "recenter nonnegative" mapping used by the
 * subexponential probability-delta coder: fold the coded distance v
 * back around the pivot m. Distances larger than 2*m cannot alternate
 * around the pivot and are passed through unchanged; otherwise odd
 * values map below the pivot and even values above it. */
static av_always_inline int inv_recenter_nonneg(int v, int m)
{
    if (v > 2 * m)
        return v;
    if (v & 1)
        return m - ((v + 1) >> 1);
    return m + (v >> 1);
}
425 // differential forward probability updates
/* Decode a differentially-coded probability update relative to the
 * current probability p (range [1,255]) and return the new value.
 * The coded symbol d selects an entry of inv_map_table[], which is then
 * re-centered around p via inv_recenter_nonneg().
 * NOTE(review): sampled paste — the 'int d;' declaration, some braces
 * and the table entries between 251 and the closing brace are not
 * visible in this view. */
426 static int update_prob(VP56RangeCoder *c, int p)
/* first 20 entries are the "cheap, rough" update steps (multiples of
 * 13 plus 7); the remainder enumerates the fine steps in order */
428 static const int inv_map_table[254] = {
429 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
430 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
431 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
432 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
433 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
434 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
435 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
436 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
437 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
438 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
439 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
440 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
441 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
442 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
443 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
444 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
445 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
446 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
451 /* This code is trying to do a differential probability update. For a
452 * current probability A in the range [1, 255], the difference to a new
453 * probability of any value can be expressed differentially as 1-A,255-A
454 * where some part of this (absolute range) exists both in positive as
455 * well as the negative part, whereas another part only exists in one
456 * half. We're trying to code this shared part differentially, i.e.
457 * times two where the value of the lowest bit specifies the sign, and
458 * the single part is then coded on top of this. This absolute difference
459 * then again has a value of [0,254], but a bigger value in this range
460 * indicates that we're further away from the original value A, so we
461 * can code this as a VLC code, since higher values are increasingly
462 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
463 * updates vs. the 'fine, exact' updates further down the range, which
464 * adds one extra dimension to this differential update model. */
/* variable-length decode of the table index d: 4/4/5/7-bit classes */
466 if (!vp8_rac_get(c)) {
467 d = vp8_rac_get_uint(c, 4) + 0;
468 } else if (!vp8_rac_get(c)) {
469 d = vp8_rac_get_uint(c, 4) + 16;
470 } else if (!vp8_rac_get(c)) {
471 d = vp8_rac_get_uint(c, 5) + 32;
473 d = vp8_rac_get_uint(c, 7);
/* 7-bit class: values >= 65 are doubled with a sign-like extra bit */
475 d = (d << 1) - 65 + vp8_rac_get(c);
/* re-center the mapped delta around p, mirrored for p > 128 */
479 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
480 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
/* Parse the colorspace/bit-depth/subsampling bits of the uncompressed
 * header and return the matching AVPixelFormat (negative AVERROR on
 * invalid combinations). Side effects: sets s->bpp, s->bytesperpixel,
 * s->ss_h/ss_v and ctx->colorspace/color_range.
 * NOTE(review): sampled paste — closing braces/returns of several
 * branches are not visible in this view. */
483 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
485 static const enum AVColorSpace colorspaces[8] = {
486 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
487 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
489 VP9Context *s = ctx->priv_data;
490 enum AVPixelFormat res;
/* profiles 0/1 are always 8-bit; profiles 2/3 signal 10 vs 12 bit */
491 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
494 s->bpp = 8 + bits * 2;
495 s->bytesperpixel = (7 + s->bpp) >> 3;
496 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
497 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
498 static const enum AVPixelFormat pix_fmt_rgb[3] = {
499 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
/* RGB is only valid in the odd (1/3) profiles */
501 if (ctx->profile & 1) {
502 s->ss_h = s->ss_v = 1;
503 res = pix_fmt_rgb[bits];
504 ctx->color_range = AVCOL_RANGE_JPEG;
506 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
508 return AVERROR_INVALIDDATA;
/* YUV: pixfmt indexed by [bit depth][vertical ss][horizontal ss] */
511 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
512 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
513 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
514 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
515 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
516 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
517 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
519 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
/* odd profiles code explicit subsampling bits; 4:2:0 is disallowed there */
520 if (ctx->profile & 1) {
521 s->ss_h = get_bits1(&s->gb);
522 s->ss_v = get_bits1(&s->gb);
523 if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
524 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
526 return AVERROR_INVALIDDATA;
527 } else if (get_bits1(&s->gb)) {
528 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
530 return AVERROR_INVALIDDATA;
/* even profiles are always 4:2:0 */
533 s->ss_h = s->ss_v = 1;
534 res = pix_fmt_for_ss[bits][1][1];
/* Parse the full VP9 frame header: the uncompressed part (profile,
 * frame type, size, references, loopfilter, quantizer, segmentation,
 * tiling) with a GetBitContext, then initialize the range coder over
 * the compressed part and apply the forward probability updates.
 * Returns the total header size in bytes (uncompressed + compressed)
 * or a negative AVERROR.
 * NOTE(review): sampled paste — declarations, some braces and else
 * branches are not visible in this view; comments added only. */
541 static int decode_frame_header(AVCodecContext *ctx,
542 const uint8_t *data, int size, int *ref)
544 VP9Context *s = ctx->priv_data;
545 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
546 enum AVPixelFormat fmt = ctx->pix_fmt;
548 const uint8_t *data2;
/* general header: frame marker, profile, show-existing-frame */
551 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
552 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
555 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
556 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
557 return AVERROR_INVALIDDATA;
559 ctx->profile = get_bits1(&s->gb);
560 ctx->profile |= get_bits1(&s->gb) << 1;
561 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
562 if (ctx->profile > 3) {
563 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
564 return AVERROR_INVALIDDATA;
/* show-existing-frame: only a 3-bit ref index follows */
566 if (get_bits1(&s->gb)) {
567 *ref = get_bits(&s->gb, 3);
570 s->last_keyframe = s->keyframe;
571 s->keyframe = !get_bits1(&s->gb);
572 last_invisible = s->invisible;
573 s->invisible = !get_bits1(&s->gb);
574 s->errorres = get_bits1(&s->gb);
575 s->use_last_frame_mvs = !s->errorres && !last_invisible;
/* keyframe path: sync code, colorspace, frame size */
577 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
578 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
579 return AVERROR_INVALIDDATA;
581 if ((fmt = read_colorspace_details(ctx)) < 0)
583 // for profile 1, here follows the subsampling bits
584 s->refreshrefmask = 0xff;
585 w = get_bits(&s->gb, 16) + 1;
586 h = get_bits(&s->gb, 16) + 1;
587 if (get_bits1(&s->gb)) // display size
588 skip_bits(&s->gb, 32);
/* inter/intra-only path */
590 s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
591 s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
593 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
594 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
595 return AVERROR_INVALIDDATA;
597 if (ctx->profile == 1) {
598 if ((fmt = read_colorspace_details(ctx)) < 0)
601 s->ss_h = s->ss_v = 1;
/* profile-0 intra-only frames are fixed 8-bit 4:2:0 BT.470BG */
604 s->bytesperpixel = 1;
605 fmt = AV_PIX_FMT_YUV420P;
606 ctx->colorspace = AVCOL_SPC_BT470BG;
607 ctx->color_range = AVCOL_RANGE_JPEG;
609 s->refreshrefmask = get_bits(&s->gb, 8);
610 w = get_bits(&s->gb, 16) + 1;
611 h = get_bits(&s->gb, 16) + 1;
612 if (get_bits1(&s->gb)) // display size
613 skip_bits(&s->gb, 32);
/* inter frame: three reference slots with per-ref sign bias */
615 s->refreshrefmask = get_bits(&s->gb, 8);
616 s->refidx[0] = get_bits(&s->gb, 3);
617 s->signbias[0] = get_bits1(&s->gb) && !s->errorres;
618 s->refidx[1] = get_bits(&s->gb, 3);
619 s->signbias[1] = get_bits1(&s->gb) && !s->errorres;
620 s->refidx[2] = get_bits(&s->gb, 3);
621 s->signbias[2] = get_bits1(&s->gb) && !s->errorres;
622 if (!s->refs[s->refidx[0]].f->data[0] ||
623 !s->refs[s->refidx[1]].f->data[0] ||
624 !s->refs[s->refidx[2]].f->data[0]) {
625 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
626 return AVERROR_INVALIDDATA;
/* frame size: either copied from one of the refs, or coded explicitly */
628 if (get_bits1(&s->gb)) {
629 w = s->refs[s->refidx[0]].f->width;
630 h = s->refs[s->refidx[0]].f->height;
631 } else if (get_bits1(&s->gb)) {
632 w = s->refs[s->refidx[1]].f->width;
633 h = s->refs[s->refidx[1]].f->height;
634 } else if (get_bits1(&s->gb)) {
635 w = s->refs[s->refidx[2]].f->width;
636 h = s->refs[s->refidx[2]].f->height;
638 w = get_bits(&s->gb, 16) + 1;
639 h = get_bits(&s->gb, 16) + 1;
641 // Note that in this code, "CUR_FRAME" is actually before we
642 // have formally allocated a frame, and thus actually represents
644 s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
645 s->frames[CUR_FRAME].tf.f->height == h;
646 if (get_bits1(&s->gb)) // display size
647 skip_bits(&s->gb, 32);
648 s->highprecisionmvs = get_bits1(&s->gb);
649 s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
/* compound prediction is allowed when the refs disagree in sign bias;
 * the fixed ref is the odd one out, the other two are variable */
651 s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
652 s->signbias[0] != s->signbias[2]);
653 if (s->allowcompinter) {
654 if (s->signbias[0] == s->signbias[1]) {
656 s->varcompref[0] = 0;
657 s->varcompref[1] = 1;
658 } else if (s->signbias[0] == s->signbias[2]) {
660 s->varcompref[0] = 0;
661 s->varcompref[1] = 2;
664 s->varcompref[0] = 1;
665 s->varcompref[1] = 2;
/* per-ref MV scaling (14-bit fixed point) for scaled references */
669 for (i = 0; i < 3; i++) {
670 AVFrame *ref = s->refs[s->refidx[i]].f;
671 int refw = ref->width, refh = ref->height;
673 if (ref->format != fmt) {
674 av_log(ctx, AV_LOG_ERROR,
675 "Ref pixfmt (%s) did not match current frame (%s)",
676 av_get_pix_fmt_name(ref->format),
677 av_get_pix_fmt_name(fmt));
678 return AVERROR_INVALIDDATA;
679 } else if (refw == w && refh == h) {
680 s->mvscale[i][0] = s->mvscale[i][1] = 0;
/* spec limits: ref may be at most 2x larger / 16x smaller than frame */
682 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
683 av_log(ctx, AV_LOG_ERROR,
684 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
686 return AVERROR_INVALIDDATA;
688 s->mvscale[i][0] = (refw << 14) / w;
689 s->mvscale[i][1] = (refh << 14) / h;
690 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
691 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
696 s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
697 s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
698 s->framectxid = c = get_bits(&s->gb, 2);
700 /* loopfilter header data */
701 if (s->keyframe || s->errorres || s->intraonly) {
702 // reset loopfilter defaults
703 s->lf_delta.ref[0] = 1;
704 s->lf_delta.ref[1] = 0;
705 s->lf_delta.ref[2] = -1;
706 s->lf_delta.ref[3] = -1;
707 s->lf_delta.mode[0] = 0;
708 s->lf_delta.mode[1] = 0;
710 s->filter.level = get_bits(&s->gb, 6);
711 sharp = get_bits(&s->gb, 3);
712 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
713 // the old cache values since they are still valid
714 if (s->filter.sharpness != sharp)
715 memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
716 s->filter.sharpness = sharp;
717 if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
718 if (get_bits1(&s->gb)) {
719 for (i = 0; i < 4; i++)
720 if (get_bits1(&s->gb))
721 s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
722 for (i = 0; i < 2; i++)
723 if (get_bits1(&s->gb))
724 s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
728 /* quantization header data */
729 s->yac_qi = get_bits(&s->gb, 8);
730 s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
731 s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
732 s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
733 s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
734 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
736 /* segmentation header info */
737 s->segmentation.ignore_refmap = 0;
738 if ((s->segmentation.enabled = get_bits1(&s->gb))) {
739 if ((s->segmentation.update_map = get_bits1(&s->gb))) {
740 for (i = 0; i < 7; i++)
741 s->prob.seg[i] = get_bits1(&s->gb) ?
742 get_bits(&s->gb, 8) : 255;
743 if ((s->segmentation.temporal = get_bits1(&s->gb))) {
744 for (i = 0; i < 3; i++)
745 s->prob.segpred[i] = get_bits1(&s->gb) ?
746 get_bits(&s->gb, 8) : 255;
/* a segmap carried over from the previous frame is unusable after a
 * size change; warn and ignore it instead of erroring out */
749 if ((!s->segmentation.update_map || s->segmentation.temporal) &&
750 (w != s->frames[CUR_FRAME].tf.f->width ||
751 h != s->frames[CUR_FRAME].tf.f->height)) {
752 av_log(ctx, AV_LOG_WARNING,
753 "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
754 s->segmentation.temporal, s->segmentation.update_map);
755 s->segmentation.ignore_refmap = 1;
756 //return AVERROR_INVALIDDATA;
/* per-segment feature data (quantizer/loopfilter/ref/skip) */
759 if (get_bits1(&s->gb)) {
760 s->segmentation.absolute_vals = get_bits1(&s->gb);
761 for (i = 0; i < 8; i++) {
762 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
763 s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
764 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
765 s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
766 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
767 s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
768 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
772 s->segmentation.feat[0].q_enabled = 0;
773 s->segmentation.feat[0].lf_enabled = 0;
774 s->segmentation.feat[0].skip_enabled = 0;
775 s->segmentation.feat[0].ref_enabled = 0;
778 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
779 for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
780 int qyac, qydc, quvac, quvdc, lflvl, sh;
782 if (s->segmentation.feat[i].q_enabled) {
783 if (s->segmentation.absolute_vals)
784 qyac = s->segmentation.feat[i].q_val;
786 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
790 qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
791 quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
792 quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
793 qyac = av_clip_uintp2(qyac, 8);
795 s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
796 s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
797 s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
798 s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
/* per-segment loopfilter levels, with ref/mode deltas applied;
 * sh doubles the delta weight when base level >= 32 */
800 sh = s->filter.level >= 32;
801 if (s->segmentation.feat[i].lf_enabled) {
802 if (s->segmentation.absolute_vals)
803 lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
805 lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
807 lflvl = s->filter.level;
809 if (s->lf_delta.enabled) {
810 s->segmentation.feat[i].lflvl[0][0] =
811 s->segmentation.feat[i].lflvl[0][1] =
812 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
813 for (j = 1; j < 4; j++) {
814 s->segmentation.feat[i].lflvl[j][0] =
815 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
816 s->lf_delta.mode[0]) * (1 << sh)), 6);
817 s->segmentation.feat[i].lflvl[j][1] =
818 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
819 s->lf_delta.mode[1]) * (1 << sh)), 6);
822 memset(s->segmentation.feat[i].lflvl, lflvl,
823 sizeof(s->segmentation.feat[i].lflvl));
/* reallocate size-dependent buffers, then parse the tiling layout */
828 if ((res = update_size(ctx, w, h, fmt)) < 0) {
829 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
832 for (s->tiling.log2_tile_cols = 0;
833 (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
834 s->tiling.log2_tile_cols++) ;
835 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
836 max = FFMAX(0, max - 1);
837 while (max > s->tiling.log2_tile_cols) {
838 if (get_bits1(&s->gb))
839 s->tiling.log2_tile_cols++;
843 s->tiling.log2_tile_rows = decode012(&s->gb);
844 s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
845 if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
846 s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
/* one range coder per tile column */
847 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
848 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
850 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
851 return AVERROR(ENOMEM);
/* keyframe/error-resilient/intra-only: reset all 4 probability contexts */
855 if (s->keyframe || s->errorres || s->intraonly) {
856 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
857 s->prob_ctx[3].p = vp9_default_probs;
858 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
859 sizeof(vp9_default_coef_probs));
860 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
861 sizeof(vp9_default_coef_probs));
862 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
863 sizeof(vp9_default_coef_probs));
864 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
865 sizeof(vp9_default_coef_probs));
868 // next 16 bits is size of the rest of the header (arith-coded)
869 size2 = get_bits(&s->gb, 16);
870 data2 = align_get_bits(&s->gb);
871 if (size2 > size - (data2 - data)) {
872 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
873 return AVERROR_INVALIDDATA;
875 ff_vp56_init_range_decoder(&s->c, data2, size2);
876 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
877 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
878 return AVERROR_INVALIDDATA;
/* reset adaptation counters (coef/eob only for intra frames) */
881 if (s->keyframe || s->intraonly) {
882 memset(s->counts.coef, 0, sizeof(s->counts.coef));
883 memset(s->counts.eob, 0, sizeof(s->counts.eob));
885 memset(&s->counts, 0, sizeof(s->counts));
887 // FIXME is it faster to not copy here, but do it down in the fw updates
888 // as explicit copies if the fw update is missing (and skip the copy upon
890 s->prob.p = s->prob_ctx[c].p;
/* txfm mode: forced TX_4X4 when lossless, otherwise coded */
894 s->txfmmode = TX_4X4;
896 s->txfmmode = vp8_rac_get_uint(&s->c, 2);
897 if (s->txfmmode == 3)
898 s->txfmmode += vp8_rac_get(&s->c);
900 if (s->txfmmode == TX_SWITCHABLE) {
901 for (i = 0; i < 2; i++)
902 if (vp56_rac_get_prob_branchy(&s->c, 252))
903 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
904 for (i = 0; i < 2; i++)
905 for (j = 0; j < 2; j++)
906 if (vp56_rac_get_prob_branchy(&s->c, 252))
907 s->prob.p.tx16p[i][j] =
908 update_prob(&s->c, s->prob.p.tx16p[i][j]);
909 for (i = 0; i < 2; i++)
910 for (j = 0; j < 3; j++)
911 if (vp56_rac_get_prob_branchy(&s->c, 252))
912 s->prob.p.tx32p[i][j] =
913 update_prob(&s->c, s->prob.p.tx32p[i][j]);
/* coefficient probability updates, per transform size */
918 for (i = 0; i < 4; i++) {
919 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
920 if (vp8_rac_get(&s->c)) {
921 for (j = 0; j < 2; j++)
922 for (k = 0; k < 2; k++)
923 for (l = 0; l < 6; l++)
924 for (m = 0; m < 6; m++) {
925 uint8_t *p = s->prob.coef[i][j][k][l][m];
926 uint8_t *r = ref[j][k][l][m];
927 if (m >= 3 && l == 0) // dc only has 3 pt
929 for (n = 0; n < 3; n++) {
930 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
931 p[n] = update_prob(&s->c, r[n]);
939 for (j = 0; j < 2; j++)
940 for (k = 0; k < 2; k++)
941 for (l = 0; l < 6; l++)
942 for (m = 0; m < 6; m++) {
943 uint8_t *p = s->prob.coef[i][j][k][l][m];
944 uint8_t *r = ref[j][k][l][m];
945 if (m > 3 && l == 0) // dc only has 3 pt
951 if (s->txfmmode == i)
/* remaining mode/ref/filter probability updates */
956 for (i = 0; i < 3; i++)
957 if (vp56_rac_get_prob_branchy(&s->c, 252))
958 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
959 if (!s->keyframe && !s->intraonly) {
960 for (i = 0; i < 7; i++)
961 for (j = 0; j < 3; j++)
962 if (vp56_rac_get_prob_branchy(&s->c, 252))
963 s->prob.p.mv_mode[i][j] =
964 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
966 if (s->filtermode == FILTER_SWITCHABLE)
967 for (i = 0; i < 4; i++)
968 for (j = 0; j < 2; j++)
969 if (vp56_rac_get_prob_branchy(&s->c, 252))
970 s->prob.p.filter[i][j] =
971 update_prob(&s->c, s->prob.p.filter[i][j]);
973 for (i = 0; i < 4; i++)
974 if (vp56_rac_get_prob_branchy(&s->c, 252))
975 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
977 if (s->allowcompinter) {
978 s->comppredmode = vp8_rac_get(&s->c);
980 s->comppredmode += vp8_rac_get(&s->c);
981 if (s->comppredmode == PRED_SWITCHABLE)
982 for (i = 0; i < 5; i++)
983 if (vp56_rac_get_prob_branchy(&s->c, 252))
985 update_prob(&s->c, s->prob.p.comp[i]);
987 s->comppredmode = PRED_SINGLEREF;
990 if (s->comppredmode != PRED_COMPREF) {
991 for (i = 0; i < 5; i++) {
992 if (vp56_rac_get_prob_branchy(&s->c, 252))
993 s->prob.p.single_ref[i][0] =
994 update_prob(&s->c, s->prob.p.single_ref[i][0]);
995 if (vp56_rac_get_prob_branchy(&s->c, 252))
996 s->prob.p.single_ref[i][1] =
997 update_prob(&s->c, s->prob.p.single_ref[i][1]);
1001 if (s->comppredmode != PRED_SINGLEREF) {
1002 for (i = 0; i < 5; i++)
1003 if (vp56_rac_get_prob_branchy(&s->c, 252))
1004 s->prob.p.comp_ref[i] =
1005 update_prob(&s->c, s->prob.p.comp_ref[i]);
1008 for (i = 0; i < 4; i++)
1009 for (j = 0; j < 9; j++)
1010 if (vp56_rac_get_prob_branchy(&s->c, 252))
1011 s->prob.p.y_mode[i][j] =
1012 update_prob(&s->c, s->prob.p.y_mode[i][j]);
1014 for (i = 0; i < 4; i++)
1015 for (j = 0; j < 4; j++)
1016 for (k = 0; k < 3; k++)
1017 if (vp56_rac_get_prob_branchy(&s->c, 252))
1018 s->prob.p.partition[3 - i][j][k] =
1019 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1021 // mv fields don't use the update_prob subexp model for some reason
1022 for (i = 0; i < 3; i++)
1023 if (vp56_rac_get_prob_branchy(&s->c, 252))
1024 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1026 for (i = 0; i < 2; i++) {
1027 if (vp56_rac_get_prob_branchy(&s->c, 252))
1028 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1030 for (j = 0; j < 10; j++)
1031 if (vp56_rac_get_prob_branchy(&s->c, 252))
1032 s->prob.p.mv_comp[i].classes[j] =
1033 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1035 if (vp56_rac_get_prob_branchy(&s->c, 252))
1036 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1038 for (j = 0; j < 10; j++)
1039 if (vp56_rac_get_prob_branchy(&s->c, 252))
1040 s->prob.p.mv_comp[i].bits[j] =
1041 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1044 for (i = 0; i < 2; i++) {
1045 for (j = 0; j < 2; j++)
1046 for (k = 0; k < 3; k++)
1047 if (vp56_rac_get_prob_branchy(&s->c, 252))
1048 s->prob.p.mv_comp[i].class0_fp[j][k] =
1049 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1051 for (j = 0; j < 3; j++)
1052 if (vp56_rac_get_prob_branchy(&s->c, 252))
1053 s->prob.p.mv_comp[i].fp[j] =
1054 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
/* high-precision MV probabilities only coded when hp MVs are enabled */
1057 if (s->highprecisionmvs) {
1058 for (i = 0; i < 2; i++) {
1059 if (vp56_rac_get_prob_branchy(&s->c, 252))
1060 s->prob.p.mv_comp[i].class0_hp =
1061 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1063 if (vp56_rac_get_prob_branchy(&s->c, 252))
1064 s->prob.p.mv_comp[i].hp =
1065 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
/* total header size consumed: uncompressed part + compressed part */
1070 return (data2 - data) + size2;
/* Clamp a motion vector to the valid range computed for the current
 * block (s->min_mv/s->max_mv). NOTE(review): the parameter list
 * continues on a line not visible in this sampled view — presumably
 * a trailing 'VP9Context *s' parameter; confirm against the full file. */
1073 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1076 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1077 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Build the candidate reference-MV (pmv) for reference slot `ref` by scanning,
// in priority order: sub-8x8 sibling MVs, the above/left edge contexts, spatial
// neighbours in this frame, the co-located MV in the previous frame, then the
// same neighbourhoods again allowing a *different* reference (sign-flipped when
// the two references have opposite sign bias). The first one/two distinct
// candidates found decide pmv (idx selects nearest vs. near).
// NOTE(review): many lines of this function (macro bodies, closing braces,
// early sections) are missing from this chunk — do not infer control flow
// beyond what is visible.
1080 static void find_ref_mvs(VP9Context *s,
1081 VP56mv *pmv, int ref, int z, int idx, int sb)
// Per-block-size list of up to 8 (col,row) neighbour offsets to probe,
// in decreasing priority.
1083 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1084 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1085 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1086 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1087 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1088 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1089 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1090 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1091 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1092 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1093 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1094 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1095 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1096 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1097 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1098 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1099 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1100 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1101 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1102 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1103 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1104 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1105 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1106 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1107 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1108 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1109 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1112 int row = s->row, col = s->col, row7 = s->row7;
1113 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
// Sentinel: no candidate remembered yet (packed x/y, both 0x8000).
1114 #define INVALID_MV 0x80008000U
1115 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
// Return an already-decoded sibling sub-8x8 MV verbatim (no clamping);
// remembers the first distinct value in `mem`.
// NOTE(review): macro body truncated in this chunk.
1118 #define RETURN_DIRECT_MV(mv) \
1120 uint32_t m = AV_RN32A(&mv); \
1124 } else if (mem == INVALID_MV) { \
1126 } else if (m != mem) { \
// For sub-8x8 sub-blocks 1-3, the MVs of earlier sub-blocks of this same
// block are the highest-priority candidates.
1133 if (sb == 2 || sb == 1) {
1134 RETURN_DIRECT_MV(b->mv[0][z]);
1135 } else if (sb == 3) {
1136 RETURN_DIRECT_MV(b->mv[2][z]);
1137 RETURN_DIRECT_MV(b->mv[1][z]);
1138 RETURN_DIRECT_MV(b->mv[0][z]);
// Generic candidate acceptance: clamp, dedupe against `mem`/`mem_sub8x8`,
// and return through pmv once enough distinct candidates were seen.
// NOTE(review): macro body truncated in this chunk; the in-tree "BUG"
// comment below is original and flags a known libvpx-compatibility quirk.
1141 #define RETURN_MV(mv) \
1146 av_assert2(idx == 1); \
1147 av_assert2(mem != INVALID_MV); \
1148 if (mem_sub8x8 == INVALID_MV) { \
1149 clamp_mv(&tmp, &mv, s); \
1150 m = AV_RN32A(&tmp); \
1155 mem_sub8x8 = AV_RN32A(&mv); \
1156 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1157 clamp_mv(&tmp, &mv, s); \
1158 m = AV_RN32A(&tmp); \
1162 /* BUG I'm pretty sure this isn't the intention */ \
1168 uint32_t m = AV_RN32A(&mv); \
1170 clamp_mv(pmv, &mv, s); \
1172 } else if (mem == INVALID_MV) { \
1174 } else if (m != mem) { \
1175 clamp_mv(pmv, &mv, s); \
1182 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1183 if (mv->ref[0] == ref) {
1184 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1185 } else if (mv->ref[1] == ref) {
1186 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
// Left neighbour edge context (only if not at the tile's left edge).
1189 if (col > s->tiling.tile_col_start) {
1190 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1191 if (mv->ref[0] == ref) {
1192 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1193 } else if (mv->ref[1] == ref) {
1194 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1202 // previously coded MVs in this neighbourhood, using same reference frame
1203 for (; i < 8; i++) {
1204 int c = p[i][0] + col, r = p[i][1] + row;
1206 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1207 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1209 if (mv->ref[0] == ref) {
1210 RETURN_MV(mv->mv[0]);
1211 } else if (mv->ref[1] == ref) {
1212 RETURN_MV(mv->mv[1]);
1217 // MV at this position in previous frame, using same reference frame
1218 if (s->use_last_frame_mvs) {
1219 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
// Frame-threading: wait for the previous frame's MVs for this row to be
// written before reading them (skip when single-pass already guarantees it).
1221 if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
1222 ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1223 if (mv->ref[0] == ref) {
1224 RETURN_MV(mv->mv[0]);
1225 } else if (mv->ref[1] == ref) {
1226 RETURN_MV(mv->mv[1]);
// Like RETURN_MV but negates the vector first when the candidate's
// reference has the opposite sign bias to `ref`.
1230 #define RETURN_SCALE_MV(mv, scale) \
1233 VP56mv mv_temp = { -mv.x, -mv.y }; \
1234 RETURN_MV(mv_temp); \
1240 // previously coded MVs in this neighbourhood, using different reference frame
1241 for (i = 0; i < 8; i++) {
1242 int c = p[i][0] + col, r = p[i][1] + row;
1244 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1245 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1247 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1248 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1250 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1251 // BUG - libvpx has this condition regardless of whether
1252 // we used the first ref MV and pre-scaling
1253 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1254 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1259 // MV at this position in previous frame, using different reference frame
1260 if (s->use_last_frame_mvs) {
1261 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1263 // no need to await_progress, because we already did that above
1264 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1265 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1267 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1268 // BUG - libvpx has this condition regardless of whether
1269 // we used the first ref MV and pre-scaling
1270 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1271 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
// Fallback path: clamp whatever is in pmv (possibly zero) into legal range.
1276 clamp_mv(pmv, pmv, s);
1279 #undef RETURN_SCALE_MV
// Decode one MV component (x or y) from the range coder for mv_comp[idx],
// updating the per-frame adaptation counters as each symbol is read.
// Returns the signed component delta (never 0; sign applied at the end).
// `hp` enables the extra high-precision bit.
// NOTE(review): several branch/else lines of this function are missing from
// this chunk; symbol read order below follows what is visible.
1282 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1284 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1285 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1286 s->prob.p.mv_comp[idx].classes);
1288 s->counts.mv_comp[idx].sign[sign]++;
1289 s->counts.mv_comp[idx].classes[c]++;
// Class >= 1: read `c` integer bits, then a 2-bit fractional part, then
// optionally the high-precision bit.
1293 for (n = 0, m = 0; m < c; m++) {
1294 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1296 s->counts.mv_comp[idx].bits[m][bit]++;
1299 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1301 s->counts.mv_comp[idx].fp[bit]++;
1303 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1304 s->counts.mv_comp[idx].hp[bit]++;
1308 // bug in libvpx - we count for bw entropy purposes even if the
1310 s->counts.mv_comp[idx].hp[1]++;
// Class 0: one class0 bit plus fractional (and optional hp) bits.
1314 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1315 s->counts.mv_comp[idx].class0[n]++;
1316 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1317 s->prob.p.mv_comp[idx].class0_fp[n]);
1318 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1319 n = (n << 3) | (bit << 1);
1321 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1322 s->counts.mv_comp[idx].class0_hp[bit]++;
1326 // bug in libvpx - we count for bw entropy purposes even if the
1328 s->counts.mv_comp[idx].class0_hp[1]++;
// Magnitude is n+1; apply the sign read first.
1332 return sign ? -(n + 1) : (n + 1);
// Fill mv[0] (and mv[1] for compound blocks) for prediction `mode` of
// sub-block `sb`: look up the reference MV via find_ref_mvs(), then for
// NEWMV add the decoded x/y deltas (joint-coded; read_mv_component order
// is y then x). ZEROMV short-circuits to zero vectors.
// NOTE(review): parts of this function (braces, the use-hp fallback paths)
// are missing from this chunk.
1335 static void fill_mv(VP9Context *s,
1336 VP56mv *mv, int mode, int sb)
1340 if (mode == ZEROMV) {
1345 // FIXME cache this value and reuse for other subblocks
1346 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1347 mode == NEWMV ? -1 : sb);
1348 // FIXME maybe move this code into find_ref_mvs()
// High precision only when enabled for the frame AND the predictor is
// small enough (|x|,|y| < 64, i.e. under 8 full pels).
1349 if ((mode == NEWMV || sb == -1) &&
1350 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1364 if (mode == NEWMV) {
1365 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1366 s->prob.p.mv_joint);
1368 s->counts.mv_joint[j]++;
1369 if (j >= MV_JOINT_V)
1370 mv[0].y += read_mv_component(s, 0, hp);
1372 mv[0].x += read_mv_component(s, 1, hp);
// Second reference (compound prediction): same procedure for mv[1].
1376 // FIXME cache this value and reuse for other subblocks
1377 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1378 mode == NEWMV ? -1 : sb);
1379 if ((mode == NEWMV || sb == -1) &&
1380 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1394 if (mode == NEWMV) {
1395 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1396 s->prob.p.mv_joint);
1398 s->counts.mv_joint[j]++;
1399 if (j >= MV_JOINT_V)
1400 mv[1].y += read_mv_component(s, 0, hp);
1402 mv[1].x += read_mv_component(s, 1, hp);
// Splat byte value `v` over a w x h region of a 2D context plane with the
// given stride, using the widest aligned store available per width class
// (16/32/64-bit replicated patterns).
// NOTE(review): the switch/loop scaffolding of this function is missing from
// this chunk; only the replicated-constant setup lines survive.
1408 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1409 ptrdiff_t stride, int v)
1419 int v16 = v * 0x0101;
1427 uint32_t v32 = v * 0x01010101;
1436 uint64_t v64 = v * 0x0101010101010101ULL;
1442 uint32_t v32 = v * 0x01010101;
1445 AV_WN32A(ptr + 4, v32);
// Decode all mode information for the current block: segment id, skip flag,
// intra/inter decision, transform size, intra modes or (for inter blocks)
// references, filter, prediction modes and motion vectors; then propagate
// everything into the above/left context arrays and the per-frame MV/ref
// planes used by later blocks and the next frame.
// NOTE(review): this chunk is missing many lines of the original function
// (else-branches, closing braces, some declarations); comments below only
// describe what the surviving lines show.
1454 static void decode_mode(AVCodecContext *ctx)
// Partition context written to the above/left arrays, per block size.
1456     static const uint8_t left_ctx[N_BS_SIZES] = {
1457 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1459 static const uint8_t above_ctx[N_BS_SIZES] = {
1460 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
// Largest transform size permitted for each block size.
1462 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1463 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1464 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1466 VP9Context *s = ctx->priv_data;
1468 int row = s->row, col = s->col, row7 = s->row7;
1469 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
// w4/h4 are clipped to the frame edge; bw4/bh4 are the nominal sizes.
1470 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1471 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1472 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1473 int vref, filter_id;
// --- segment id: explicit, temporally predicted, or inherited ---
1475 if (!s->segmentation.enabled) {
1477 } else if (s->keyframe || s->intraonly) {
1478 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1479 } else if (!s->segmentation.update_map ||
1480 (s->segmentation.temporal &&
1481 vp56_rac_get_prob_branchy(&s->c,
1482 s->prob.segpred[s->above_segpred_ctx[col] +
1483 s->left_segpred_ctx[row7]]))) {
1484 if (!s->errorres && !s->segmentation.ignore_refmap) {
1486 uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
// Frame threading: make sure the reference segmentation map rows exist.
1488 if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
1489 ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
// Predicted seg_id = minimum over the co-located reference-map region.
1490 for (y = 0; y < h4; y++) {
1491 int idx_base = (y + row) * 8 * s->sb_cols + col;
1492 for (x = 0; x < w4; x++)
1493 pred = FFMIN(pred, refsegmap[idx_base + x]);
1495 av_assert1(pred < 8);
1501 memset(&s->above_segpred_ctx[col], 1, w4);
1502 memset(&s->left_segpred_ctx[row7], 1, h4);
1504 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1507 memset(&s->above_segpred_ctx[col], 0, w4);
1508 memset(&s->left_segpred_ctx[row7], 0, h4);
// Store the chosen seg_id into the current frame's segmentation map.
1510 if (s->segmentation.enabled &&
1511 (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1512 setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1513 bw4, bh4, 8 * s->sb_cols, b->seg_id);
// --- skip flag: forced by the segment feature, or coded with a context
// derived from the above/left skip flags ---
1516 b->skip = s->segmentation.enabled &&
1517 s->segmentation.feat[b->seg_id].skip_enabled;
1519 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1520 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1521 s->counts.skip[c][b->skip]++;
// --- intra/inter decision ---
1524 if (s->keyframe || s->intraonly) {
1526 } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1527 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1531 if (have_a && have_l) {
1532 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1535 c = have_a ? 2 * s->above_intra_ctx[col] :
1536 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1538 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1539 s->counts.intra[c][bit]++;
// --- transform size: coded when TX_SWITCHABLE and the block has coefs ---
1543 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1547 c = (s->above_skip_ctx[col] ? max_tx :
1548 s->above_txfm_ctx[col]) +
1549 (s->left_skip_ctx[row7] ? max_tx :
1550 s->left_txfm_ctx[row7]) > max_tx;
1552 c = s->above_skip_ctx[col] ? 1 :
1553 (s->above_txfm_ctx[col] * 2 > max_tx);
1555 } else if (have_l) {
1556 c = s->left_skip_ctx[row7] ? 1 :
1557 (s->left_txfm_ctx[row7] * 2 > max_tx);
// Unary-coded tx size, capped by max_tx for this block size.
1563 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1565 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1567 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1569 s->counts.tx32p[c][b->tx]++;
1572 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1574 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1575 s->counts.tx16p[c][b->tx]++;
1578 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1579 s->counts.tx8p[c][b->tx]++;
1586 b->tx = FFMIN(max_tx, s->txfmmode);
// --- keyframe / intra-only: intra modes with above/left-conditioned
// default probabilities (per 4x4 sub-block for sizes below 8x8) ---
1589 if (s->keyframe || s->intraonly) {
1590 uint8_t *a = &s->above_mode_ctx[col * 2];
1591 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1594 if (b->bs > BS_8x8) {
1595 // FIXME the memory storage intermediates here aren't really
1596 // necessary, they're just there to make the code slightly
1598 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1599 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1600 if (b->bs != BS_8x4) {
1601 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1602 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1603 l[0] = a[1] = b->mode[1];
1605 l[0] = a[1] = b->mode[1] = b->mode[0];
1607 if (b->bs != BS_4x8) {
1608 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1609 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1610 if (b->bs != BS_8x4) {
1611 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1612 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1613 l[1] = a[1] = b->mode[3];
1615 l[1] = a[1] = b->mode[3] = b->mode[2];
1618 b->mode[2] = b->mode[0];
1619 l[1] = a[1] = b->mode[3] = b->mode[1];
1622 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 vp9_default_kf_ymode_probs[*a][*l]);
1624 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1625 // FIXME this can probably be optimized
1626 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1627 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1629 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1630 vp9_default_kf_uvmode_probs[b->mode[3]]);
// --- intra block in an inter frame: adaptive y_mode probabilities ---
1631 } else if (b->intra) {
1633 if (b->bs > BS_8x8) {
1634 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1635 s->prob.p.y_mode[0]);
1636 s->counts.y_mode[0][b->mode[0]]++;
1637 if (b->bs != BS_8x4) {
1638 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1639 s->prob.p.y_mode[0]);
1640 s->counts.y_mode[0][b->mode[1]]++;
1642 b->mode[1] = b->mode[0];
1644 if (b->bs != BS_4x8) {
1645 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1646 s->prob.p.y_mode[0]);
1647 s->counts.y_mode[0][b->mode[2]]++;
1648 if (b->bs != BS_8x4) {
1649 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1650 s->prob.p.y_mode[0]);
1651 s->counts.y_mode[0][b->mode[3]]++;
1653 b->mode[3] = b->mode[2];
1656 b->mode[2] = b->mode[0];
1657 b->mode[3] = b->mode[1];
1660 static const uint8_t size_group[10] = {
1661 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1663 int sz = size_group[b->bs];
1665 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1666 s->prob.p.y_mode[sz]);
1667 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1668 s->counts.y_mode[sz][b->mode[3]]++;
1670 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1671 s->prob.p.uv_mode[b->mode[3]]);
1672 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
// --- inter block: context LUT mapping (above_mode, left_mode) pairs to the
// inter-mode probability context ---
1674 static const uint8_t inter_mode_ctx_lut[14][14] = {
1675 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1676 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1677 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1678 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1679 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1680 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1681 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1682 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1683 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1684 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1685 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1686 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1687 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1688 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
// Reference selection: forced by the segment feature, or coded below.
1691 if (s->segmentation.feat[b->seg_id].ref_enabled) {
1692 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1694 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1696 // read comp_pred flag
1697 if (s->comppredmode != PRED_SWITCHABLE) {
1698 b->comp = s->comppredmode == PRED_COMPREF;
1702 // FIXME add intra as ref=0xff (or -1) to make these easier?
// Compound-prediction context from the above/left comp/intra/ref state.
1705 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1707 } else if (s->above_comp_ctx[col]) {
1708 c = 2 + (s->left_intra_ctx[row7] ||
1709 s->left_ref_ctx[row7] == s->fixcompref);
1710 } else if (s->left_comp_ctx[row7]) {
1711 c = 2 + (s->above_intra_ctx[col] ||
1712 s->above_ref_ctx[col] == s->fixcompref);
1714 c = (!s->above_intra_ctx[col] &&
1715 s->above_ref_ctx[col] == s->fixcompref) ^
1716 (!s->left_intra_ctx[row7] &&
1717 s->left_ref_ctx[row & 7] == s->fixcompref);
1720 c = s->above_comp_ctx[col] ? 3 :
1721 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1723 } else if (have_l) {
1724 c = s->left_comp_ctx[row7] ? 3 :
1725 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1729 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1730 s->counts.comp[c][b->comp]++;
1733 // read actual references
1734 // FIXME probably cache a few variables here to prevent repetitive
1735 // memory accesses below
1736 if (b->comp) /* two references */ {
// One reference is fixed (fixcompref); only the variable one is coded.
1737 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1739 b->ref[fix_idx] = s->fixcompref;
1740 // FIXME can this codeblob be replaced by some sort of LUT?
1743 if (s->above_intra_ctx[col]) {
1744 if (s->left_intra_ctx[row7]) {
1747 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1749 } else if (s->left_intra_ctx[row7]) {
1750 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1752 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1754 if (refl == refa && refa == s->varcompref[1]) {
1756 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1757 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1758 (refl == s->fixcompref && refa == s->varcompref[0])) {
1761 c = (refa == refl) ? 3 : 1;
1763 } else if (!s->left_comp_ctx[row7]) {
1764 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1767 c = (refl == s->varcompref[1] &&
1768 refa != s->varcompref[1]) ? 2 : 4;
1770 } else if (!s->above_comp_ctx[col]) {
1771 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1774 c = (refa == s->varcompref[1] &&
1775 refl != s->varcompref[1]) ? 2 : 4;
1778 c = (refl == refa) ? 4 : 2;
1782 if (s->above_intra_ctx[col]) {
1784 } else if (s->above_comp_ctx[col]) {
1785 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1787 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1790 } else if (have_l) {
1791 if (s->left_intra_ctx[row7]) {
1793 } else if (s->left_comp_ctx[row7]) {
1794 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1796 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1801 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1802 b->ref[var_idx] = s->varcompref[bit];
1803 s->counts.comp_ref[c][bit]++;
1804 } else /* single reference */ {
// First single_ref bit: LAST vs. GOLDEN/ALTREF.
1807 if (have_a && !s->above_intra_ctx[col]) {
1808 if (have_l && !s->left_intra_ctx[row7]) {
1809 if (s->left_comp_ctx[row7]) {
1810 if (s->above_comp_ctx[col]) {
1811 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1812 !s->above_ref_ctx[col]);
1814 c = (3 * !s->above_ref_ctx[col]) +
1815 (!s->fixcompref || !s->left_ref_ctx[row7]);
1817 } else if (s->above_comp_ctx[col]) {
1818 c = (3 * !s->left_ref_ctx[row7]) +
1819 (!s->fixcompref || !s->above_ref_ctx[col]);
1821 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1823 } else if (s->above_intra_ctx[col]) {
1825 } else if (s->above_comp_ctx[col]) {
1826 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1828 c = 4 * (!s->above_ref_ctx[col]);
1830 } else if (have_l && !s->left_intra_ctx[row7]) {
1831 if (s->left_intra_ctx[row7]) {
1833 } else if (s->left_comp_ctx[row7]) {
1834 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1836 c = 4 * (!s->left_ref_ctx[row7]);
1841 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1842 s->counts.single_ref[c][0][bit]++;
// Second single_ref bit: GOLDEN vs. ALTREF.
1846 // FIXME can this codeblob be replaced by some sort of LUT?
1849 if (s->left_intra_ctx[row7]) {
1850 if (s->above_intra_ctx[col]) {
1852 } else if (s->above_comp_ctx[col]) {
1853 c = 1 + 2 * (s->fixcompref == 1 ||
1854 s->above_ref_ctx[col] == 1);
1855 } else if (!s->above_ref_ctx[col]) {
1858 c = 4 * (s->above_ref_ctx[col] == 1);
1860 } else if (s->above_intra_ctx[col]) {
1861 if (s->left_intra_ctx[row7]) {
1863 } else if (s->left_comp_ctx[row7]) {
1864 c = 1 + 2 * (s->fixcompref == 1 ||
1865 s->left_ref_ctx[row7] == 1);
1866 } else if (!s->left_ref_ctx[row7]) {
1869 c = 4 * (s->left_ref_ctx[row7] == 1);
1871 } else if (s->above_comp_ctx[col]) {
1872 if (s->left_comp_ctx[row7]) {
1873 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1874 c = 3 * (s->fixcompref == 1 ||
1875 s->left_ref_ctx[row7] == 1);
1879 } else if (!s->left_ref_ctx[row7]) {
1880 c = 1 + 2 * (s->fixcompref == 1 ||
1881 s->above_ref_ctx[col] == 1);
1883 c = 3 * (s->left_ref_ctx[row7] == 1) +
1884 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1886 } else if (s->left_comp_ctx[row7]) {
1887 if (!s->above_ref_ctx[col]) {
1888 c = 1 + 2 * (s->fixcompref == 1 ||
1889 s->left_ref_ctx[row7] == 1);
1891 c = 3 * (s->above_ref_ctx[col] == 1) +
1892 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1894 } else if (!s->above_ref_ctx[col]) {
1895 if (!s->left_ref_ctx[row7]) {
1898 c = 4 * (s->left_ref_ctx[row7] == 1);
1900 } else if (!s->left_ref_ctx[row7]) {
1901 c = 4 * (s->above_ref_ctx[col] == 1);
1903 c = 2 * (s->left_ref_ctx[row7] == 1) +
1904 2 * (s->above_ref_ctx[col] == 1);
1907 if (s->above_intra_ctx[col] ||
1908 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1910 } else if (s->above_comp_ctx[col]) {
1911 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1913 c = 4 * (s->above_ref_ctx[col] == 1);
1916 } else if (have_l) {
1917 if (s->left_intra_ctx[row7] ||
1918 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1920 } else if (s->left_comp_ctx[row7]) {
1921 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1923 c = 4 * (s->left_ref_ctx[row7] == 1);
1928 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1929 s->counts.single_ref[c][1][bit]++;
1930 b->ref[0] = 1 + bit;
// --- inter prediction mode (for <=8x8 read once; per-sub-block below) ---
1935 if (b->bs <= BS_8x8) {
1936 if (s->segmentation.feat[b->seg_id].skip_enabled) {
1937 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1939 static const uint8_t off[10] = {
1940 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1943 // FIXME this needs to use the LUT tables from find_ref_mvs
1944 // because not all are -1,0/0,-1
1945 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1946 [s->left_mode_ctx[row7 + off[b->bs]]];
1948 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1949 s->prob.p.mv_mode[c]);
1950 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1951 s->counts.mv_mode[c][b->mode[0] - 10]++;
// --- interpolation filter: explicit per-block when switchable ---
1955 if (s->filtermode == FILTER_SWITCHABLE) {
1958 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1959 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1960 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1961 s->left_filter_ctx[row7] : 3;
1963 c = s->above_filter_ctx[col];
1965 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1966 c = s->left_filter_ctx[row7];
1971 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1972 s->prob.p.filter[c]);
1973 s->counts.filter[c][filter_id]++;
1974 b->filter = vp9_filter_lut[filter_id];
1976 b->filter = s->filtermode;
// --- sub-8x8: mode + MV per 4x4/4x8/8x4 sub-block, replicated as needed ---
1979 if (b->bs > BS_8x8) {
1980 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1982 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1983 s->prob.p.mv_mode[c]);
1984 s->counts.mv_mode[c][b->mode[0] - 10]++;
1985 fill_mv(s, b->mv[0], b->mode[0], 0);
1987 if (b->bs != BS_8x4) {
1988 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1989 s->prob.p.mv_mode[c]);
1990 s->counts.mv_mode[c][b->mode[1] - 10]++;
1991 fill_mv(s, b->mv[1], b->mode[1], 1);
1993 b->mode[1] = b->mode[0];
1994 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1995 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1998 if (b->bs != BS_4x8) {
1999 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2000 s->prob.p.mv_mode[c]);
2001 s->counts.mv_mode[c][b->mode[2] - 10]++;
2002 fill_mv(s, b->mv[2], b->mode[2], 2);
2004 if (b->bs != BS_8x4) {
2005 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2006 s->prob.p.mv_mode[c]);
2007 s->counts.mv_mode[c][b->mode[3] - 10]++;
2008 fill_mv(s, b->mv[3], b->mode[3], 3);
2010 b->mode[3] = b->mode[2];
2011 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
2012 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
2015 b->mode[2] = b->mode[0];
2016 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2017 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2018 b->mode[3] = b->mode[1];
2019 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2020 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
// Whole-block case: one MV pair, replicated to all four slots.
2023 fill_mv(s, b->mv[0], b->mode[0], -1);
2024 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2025 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2026 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2027 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2028 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2029 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
// Reference stored in the ref context: the variable one for compound.
2032 vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
// SPLAT_CTX: write `val` into n consecutive context bytes with one aligned
// store (two variants, presumably selected by pointer-width #if — the
// preprocessor conditionals are missing from this chunk).
2036 #define SPLAT_CTX(var, val, n) \
2038 case 1: var = val; break; \
2039 case 2: AV_WN16A(&var, val * 0x0101); break; \
2040 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2041 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2043 uint64_t v64 = val * 0x0101010101010101ULL; \
2044 AV_WN64A( &var, v64); \
2045 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2050 #define SPLAT_CTX(var, val, n) \
2052 case 1: var = val; break; \
2053 case 2: AV_WN16A(&var, val * 0x0101); break; \
2054 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2056 uint32_t v32 = val * 0x01010101; \
2057 AV_WN32A( &var, v32); \
2058 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2062 uint32_t v32 = val * 0x01010101; \
2063 AV_WN32A( &var, v32); \
2064 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2065 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2066 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2072 switch (bwh_tab[1][b->bs][0]) {
2073 #define SET_CTXS(dir, off, n) \
2075 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2076 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2077 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2078 if (!s->keyframe && !s->intraonly) { \
2079 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2080 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2081 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2083 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2084 if (s->filtermode == FILTER_SWITCHABLE) { \
2085 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2090 case 1: SET_CTXS(above, col, 1); break;
2091 case 2: SET_CTXS(above, col, 2); break;
2092 case 4: SET_CTXS(above, col, 4); break;
2093 case 8: SET_CTXS(above, col, 8); break;
2095 switch (bwh_tab[1][b->bs][1]) {
2096 case 1: SET_CTXS(left, row7, 1); break;
2097 case 2: SET_CTXS(left, row7, 2); break;
2098 case 4: SET_CTXS(left, row7, 4); break;
// Propagate this block's MVs into the above/left MV context arrays so
// find_ref_mvs() of later blocks can read them.
2099 case 8: SET_CTXS(left, row7, 8); break;
2104 if (!s->keyframe && !s->intraonly) {
2105 if (b->bs > BS_8x8) {
2106 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2108 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2109 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2110 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2111 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2112 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2113 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2114 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2115 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2117 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2119 for (n = 0; n < w4 * 2; n++) {
2120 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2121 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2123 for (n = 0; n < h4 * 2; n++) {
2124 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2125 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
// Finally, store refs + MVs into the per-frame MV plane (consumed by the
// next frame's use_last_frame_mvs path).
2131 for (y = 0; y < h4; y++) {
2132 int x, o = (row + y) * s->sb_cols * 8 + col;
2133 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2136 for (x = 0; x < w4; x++) {
2140 } else if (b->comp) {
2141 for (x = 0; x < w4; x++) {
2142 mv[x].ref[0] = b->ref[0];
2143 mv[x].ref[1] = b->ref[1];
2144 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2145 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2148 for (x = 0; x < w4; x++) {
2149 mv[x].ref[0] = b->ref[0];
2151 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2157 // FIXME merge cnt/eob arguments?
// Core coefficient decoder for one transform block: walks the scan order,
// decoding EOB / zero / one / larger-magnitude tokens (with the classic
// VP8-style cat1..cat6 extra-bit ladders), dequantizing with qmul and
// storing into `coef`. `cache` keeps per-position token magnitudes so the
// neighbour-based context (nb) can be computed for the next position.
// Returns (per the visible control flow) the number of coefficients read.
// NOTE(review): several lines (declarations of val/rc/m, some braces and
// break/continue paths) are missing from this chunk.
2158 static av_always_inline int
2159 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2160 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2161 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2162 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2163 const int16_t *band_counts, const int16_t *qmul)
2165 int i = 0, band = 0, band_left = band_counts[band];
2166 uint8_t *tp = p[0][nnz];
2167 uint8_t cache[1024];
2172 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2173 eob[band][nnz][val]++;
2178 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2179 cnt[band][nnz][0]++;
2181 band_left = band_counts[++band];
// Context for the next position: average of the two neighbour magnitudes.
2183 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2185 if (++i == n_coeffs)
2186 break; //invalid input; blocks should end with EOB
2191 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2192 cnt[band][nnz][1]++;
2196 // fill in p[3-10] (model fill) - only once per frame for each pos
// Probabilities 3..10 are derived from tp[2] via the pareto model table.
2198 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2200 cnt[band][nnz][2]++;
2201 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2202 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2203 cache[rc] = val = 2;
2205 val = 3 + vp56_rac_get_prob(c, tp[5]);
2208 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2210 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2211 val = 5 + vp56_rac_get_prob(c, 159);
2213 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2214 val += vp56_rac_get_prob(c, 145);
// cat3/4: 11..18 and 19..34 with fixed extra-bit probabilities.
2218 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2219 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2220 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2221 val += (vp56_rac_get_prob(c, 148) << 1);
2222 val += vp56_rac_get_prob(c, 140);
2224 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2225 val += (vp56_rac_get_prob(c, 155) << 2);
2226 val += (vp56_rac_get_prob(c, 140) << 1);
2227 val += vp56_rac_get_prob(c, 135);
2229 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2230 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2231 val += (vp56_rac_get_prob(c, 157) << 3);
2232 val += (vp56_rac_get_prob(c, 141) << 2);
2233 val += (vp56_rac_get_prob(c, 134) << 1);
2234 val += vp56_rac_get_prob(c, 130);
// cat6: extra leading bits for >8bpp content (bpp-dependent width).
2237 if (!is8bitsperpixel) {
2239 val += vp56_rac_get_prob(c, 255) << 17;
2240 val += vp56_rac_get_prob(c, 255) << 16;
2242 val += (vp56_rac_get_prob(c, 255) << 15);
2243 val += (vp56_rac_get_prob(c, 255) << 14);
2245 val += (vp56_rac_get_prob(c, 254) << 13);
2246 val += (vp56_rac_get_prob(c, 254) << 12);
2247 val += (vp56_rac_get_prob(c, 254) << 11);
2248 val += (vp56_rac_get_prob(c, 252) << 10);
2249 val += (vp56_rac_get_prob(c, 249) << 9);
2250 val += (vp56_rac_get_prob(c, 243) << 8);
2251 val += (vp56_rac_get_prob(c, 230) << 7);
2252 val += (vp56_rac_get_prob(c, 196) << 6);
2253 val += (vp56_rac_get_prob(c, 177) << 5);
2254 val += (vp56_rac_get_prob(c, 153) << 4);
2255 val += (vp56_rac_get_prob(c, 140) << 3);
2256 val += (vp56_rac_get_prob(c, 133) << 2);
2257 val += (vp56_rac_get_prob(c, 130) << 1);
2258 val += vp56_rac_get_prob(c, 129);
// STORE_COEF writes 16-bit coefs for 8bpp, 32-bit pairs otherwise.
// NOTE(review): macro body truncated in this chunk.
2262 #define STORE_COEF(c, i, v) do { \
2263 if (is8bitsperpixel) { \
2266 AV_WN32A(&c[i * 2], v); \
2270 band_left = band_counts[++band];
2272 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2274 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2275 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2277 } while (++i < n_coeffs);
// 8bpp, non-32x32 coefficient block: thin wrapper around
// decode_coeffs_b_generic() with is_tx32x32=0, is8bitsperpixel=1, bpp=8.
2282 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2283 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2284 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2285 const int16_t (*nb)[2], const int16_t *band_counts,
2286 const int16_t *qmul)
2288 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2289 nnz, scan, nb, band_counts, qmul);
2292 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2293 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2294 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2295 const int16_t (*nb)[2], const int16_t *band_counts,
2296 const int16_t *qmul)
2298 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2299 nnz, scan, nb, band_counts, qmul);
2302 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2303 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2304 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2305 const int16_t (*nb)[2], const int16_t *band_counts,
2306 const int16_t *qmul)
2308 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2309 nnz, scan, nb, band_counts, qmul);
2312 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2313 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2314 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2315 const int16_t (*nb)[2], const int16_t *band_counts,
2316 const int16_t *qmul)
2318 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2319 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all residual coefficients for the current block: the luma plane
 * first, then both chroma planes, using per-transform-size scan tables and
 * maintaining the above/left non-zero-coefficient context arrays (a/l).
 * total_coeff accumulates whether any coefficient was coded at all.
 * NOTE(review): this extract is elided — several lines of the original
 * function (switch headers, braces, the return) are not visible here.
 */
2322 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2324 VP9Context *s = ctx->priv_data;
2326 int row = s->row, col = s->col;
/* probability / counts / eob tables for the luma plane, keyed on intra-ness */
2327 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2328 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2329 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2330 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
/* clip the coefficient area to the visible part of the frame */
2331 int end_x = FFMIN(2 * (s->cols - col), w4);
2332 int end_y = FFMIN(2 * (s->rows - row), h4);
2333 int n, pl, x, y, res;
2334 int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
/* lossless uses a separate set of scan tables (offset by 4) */
2335 int tx = 4 * s->lossless + b->tx;
2336 const int16_t * const *yscans = vp9_scans[tx];
2337 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2338 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2339 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2340 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2341 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
/* number of coefficients per probability band, per transform size */
2342 static const int16_t band_counts[4][8] = {
2343 { 1, 2, 3, 4, 3, 16 - 13 },
2344 { 1, 2, 3, 4, 11, 64 - 21 },
2345 { 1, 2, 3, 4, 11, 256 - 21 },
2346 { 1, 2, 3, 4, 11, 1024 - 21 },
2348 const int16_t *y_band_counts = band_counts[b->tx];
2349 const int16_t *uv_band_counts = band_counts[b->uvtx];
2350 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2351 int total_coeff = 0;
/* MERGE/MERGE_CTX: collapse 'step' adjacent nnz-context entries into one
 * boolean per group before decoding at a coarser transform size */
2353 #define MERGE(la, end, step, rd) \
2354 for (n = 0; n < end; n += step) \
2355 la[n] = !!rd(&la[n])
2356 #define MERGE_CTX(step, rd) \
2358 MERGE(l, end_y, step, rd); \
2359 MERGE(a, end_x, step, rd); \
/* decode all luma sub-blocks at one transform size; 'v' selects the
 * plain vs. the 32x32 decode_coeffs_b variant via token pasting */
2362 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2363 for (n = 0, y = 0; y < end_y; y += step) { \
2364 for (x = 0; x < end_x; x += step, n += step * step) { \
2365 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2366 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2367 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2368 c, e, p, a[x] + l[y], yscans[txtp], \
2369 ynbs[txtp], y_band_counts, qmul[0]); \
2370 a[x] = l[y] = !!res; \
2371 total_coeff |= !!res; \
2373 AV_WN16A(&s->eob[n], res); \
/* SPLAT: re-expand a merged context entry back over its 'step' slots,
 * using aligned 32/64-bit stores when the full group is inside the frame */
2380 #define SPLAT(la, end, step, cond) \
2382 for (n = 1; n < end; n += step) \
2383 la[n] = la[n - 1]; \
2384 } else if (step == 4) { \
2386 for (n = 0; n < end; n += step) \
2387 AV_WN32A(&la[n], la[n] * 0x01010101); \
2389 for (n = 0; n < end; n += step) \
2390 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2392 } else /* step == 8 */ { \
2394 if (HAVE_FAST_64BIT) { \
2395 for (n = 0; n < end; n += step) \
2396 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2398 for (n = 0; n < end; n += step) { \
2399 uint32_t v32 = la[n] * 0x01010101; \
2400 AV_WN32A(&la[n], v32); \
2401 AV_WN32A(&la[n + 4], v32); \
2405 for (n = 0; n < end; n += step) \
2406 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2409 #define SPLAT_CTX(step) \
2411 SPLAT(a, end_x, step, end_x == w4); \
2412 SPLAT(l, end_y, step, end_y == h4); \
/* luma: one branch per transform size (presumably a switch on b->tx —
 * the switch header lines are elided from this extract) */
2418 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2421 MERGE_CTX(2, AV_RN16A);
2422 DECODE_Y_COEF_LOOP(2, 0,);
2426 MERGE_CTX(4, AV_RN32A);
2427 DECODE_Y_COEF_LOOP(4, 0,);
2431 MERGE_CTX(8, AV_RN64A);
2432 DECODE_Y_COEF_LOOP(8, 0, 32);
/* chroma version of the loop: always DCT_DCT scan, uses qmul[1] */
2437 #define DECODE_UV_COEF_LOOP(step, v) \
2438 for (n = 0, y = 0; y < end_y; y += step) { \
2439 for (x = 0; x < end_x; x += step, n += step * step) { \
2440 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2441 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2442 16 * step * step, c, e, p, a[x] + l[y], \
2443 uvscan, uvnb, uv_band_counts, qmul[1]); \
2444 a[x] = l[y] = !!res; \
2445 total_coeff |= !!res; \
2447 AV_WN16A(&s->uveob[pl][n], res); \
2449 s->uveob[pl][n] = res; \
/* switch the probability/count tables to the chroma (uv) set */
2454 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2455 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2456 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2461 for (pl = 0; pl < 2; pl++) {
2462 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2463 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2466 DECODE_UV_COEF_LOOP(1,);
2469 MERGE_CTX(2, AV_RN16A);
2470 DECODE_UV_COEF_LOOP(2,);
2474 MERGE_CTX(4, AV_RN32A);
2475 DECODE_UV_COEF_LOOP(4,);
2479 MERGE_CTX(8, AV_RN64A);
2480 DECODE_UV_COEF_LOOP(8, 32);
2489 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2491 return decode_coeffs(ctx, 1);
2494 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2496 return decode_coeffs(ctx, 0);
/*
 * Prepare intra prediction for one transform block: downgrade the requested
 * prediction mode according to which neighbours actually exist
 * (mode_conv[mode][have_left][have_top]), then build the top edge array (*a)
 * and the left edge array (l) from either the pre-loopfilter row buffer
 * (s->intra_pred_data, when at the top of an sb64 row) or the destination
 * picture, padding/replicating where neighbours are missing.
 * Returns the (possibly replaced) prediction mode.
 * NOTE(review): this extract is elided — some lines of the original function
 * (several closing braces and else-branches) are not visible here.
 */
2499 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2500 uint8_t *dst_edge, ptrdiff_t stride_edge,
2501 uint8_t *dst_inner, ptrdiff_t stride_inner,
2502 uint8_t *l, int col, int x, int w,
2503 int row, int y, enum TxfmMode tx,
2504 int p, int ss_h, int ss_v, int bytesperpixel)
/* neighbour availability; left availability respects the tile boundary */
2506 int have_top = row > 0 || y > 0;
2507 int have_left = col > s->tiling.tile_col_start || x > 0;
2508 int have_right = x < w - 1;
/* replacement mode when the required neighbour edge is unavailable */
2510 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2511 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2512 { DC_127_PRED, VERT_PRED } },
2513 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2514 { HOR_PRED, HOR_PRED } },
2515 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2516 { LEFT_DC_PRED, DC_PRED } },
2517 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2518 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2519 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2520 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2521 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2522 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2523 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2524 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2525 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2526 { DC_127_PRED, VERT_LEFT_PRED } },
2527 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2528 { HOR_UP_PRED, HOR_UP_PRED } },
2529 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2530 { HOR_PRED, TM_VP8_PRED } },
/* which edges each final prediction mode actually reads */
2532 static const struct {
2533 uint8_t needs_left:1;
2534 uint8_t needs_top:1;
2535 uint8_t needs_topleft:1;
2536 uint8_t needs_topright:1;
2537 uint8_t invert_left:1;
2538 } edges[N_INTRA_PRED_MODES] = {
2539 [VERT_PRED] = { .needs_top = 1 },
2540 [HOR_PRED] = { .needs_left = 1 },
2541 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2542 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2543 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2544 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2545 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2546 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2547 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2548 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2549 [LEFT_DC_PRED] = { .needs_left = 1 },
2550 [TOP_DC_PRED] = { .needs_top = 1 },
2551 [DC_128_PRED] = { 0 },
2552 [DC_127_PRED] = { 0 },
2553 [DC_129_PRED] = { 0 }
2556 av_assert2(mode >= 0 && mode < 10);
2557 mode = mode_conv[mode][have_left][have_top];
2558 if (edges[mode].needs_top) {
2559 uint8_t *top, *topleft;
2560 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2561 int n_px_need_tr = 0;
2563 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2566 // if top of sb64-row, use s->intra_pred_data[] instead of
2567 // dst[-stride] for intra prediction (it contains pre- instead of
2568 // post-loopfilter data)
2570 top = !(row & 7) && !y ?
2571 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2572 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2574 topleft = !(row & 7) && !y ?
2575 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2576 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2577 &dst_inner[-stride_inner];
/* fast path: the needed top run is directly usable in place */
2581 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2582 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2583 n_px_need + n_px_need_tr <= n_px_have) {
2587 if (n_px_need <= n_px_have) {
2588 memcpy(*a, top, n_px_need * bytesperpixel);
/* bpp-aware memset: 1-byte fill at 8 bpp, 16-bit replicate otherwise */
2590 #define memset_bpp(c, i1, v, i2, num) do { \
2591 if (bytesperpixel == 1) { \
2592 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2594 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2595 for (n = 0; n < (num); n++) { \
2596 AV_WN16A(&(c)[((i1) + n) * 2], val); \
/* fewer top pixels exist than needed: copy what we have, replicate last */
2600 memcpy(*a, top, n_px_have * bytesperpixel);
2601 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2604 #define memset_val(c, val, num) do { \
2605 if (bytesperpixel == 1) { \
2606 memset((c), (val), (num)); \
2609 for (n = 0; n < (num); n++) { \
2610 AV_WN16A(&(c)[n * 2], (val)); \
/* no top at all: fill with the mid-grey - 1 constant, bpp-scaled */
2614 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2616 if (edges[mode].needs_topleft) {
2617 if (have_left && have_top) {
2618 #define assign_bpp(c, i1, v, i2) do { \
2619 if (bytesperpixel == 1) { \
2620 (c)[(i1)] = (v)[(i2)]; \
2622 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2625 assign_bpp(*a, -1, topleft, -1);
2627 #define assign_val(c, i, v) do { \
2628 if (bytesperpixel == 1) { \
2631 AV_WN16A(&(c)[(i) * 2], (v)); \
2634 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
/* top-right 4 pixels for TX_4X4 modes that read past the block */
2637 if (tx == TX_4X4 && edges[mode].needs_topright) {
2638 if (have_top && have_right &&
2639 n_px_need + n_px_need_tr <= n_px_have) {
2640 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2642 memset_bpp(*a, 4, *a, 3, 4);
2647 if (edges[mode].needs_left) {
2649 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2650 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2651 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
/* HOR_UP stores the left column bottom-up (invert_left); others top-down */
2653 if (edges[mode].invert_left) {
2654 if (n_px_need <= n_px_have) {
2655 for (i = 0; i < n_px_need; i++)
2656 assign_bpp(l, i, &dst[i * stride], -1);
2658 for (i = 0; i < n_px_have; i++)
2659 assign_bpp(l, i, &dst[i * stride], -1);
2660 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2663 if (n_px_need <= n_px_have) {
2664 for (i = 0; i < n_px_need; i++)
2665 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2667 for (i = 0; i < n_px_have; i++)
2668 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2669 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
/* no left neighbour: fill with mid-grey + 1, bpp-scaled */
2673 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Intra reconstruction for the current block: for every luma transform
 * sub-block, fix up the mode/edges via check_intra_mode(), run the intra
 * predictor, then add the inverse-transformed residual; repeat for both
 * chroma planes with the uv transform size and DCT_DCT only.
 * y_off/uv_off are byte offsets of the block inside the current frame planes.
 * NOTE(review): this extract is elided — a few lines (braces, eob!=0 guards)
 * of the original function are not visible here.
 */
2680 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2681 ptrdiff_t uv_off, int bytesperpixel)
2683 VP9Context *s = ctx->priv_data;
2685 int row = s->row, col = s->col;
2686 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2687 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2688 int end_x = FFMIN(2 * (s->cols - col), w4);
2689 int end_y = FFMIN(2 * (s->rows - row), h4);
2690 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2691 int uvstep1d = 1 << b->uvtx, p;
/* dst: working buffer; dst_r: the real frame plane (used for edge pixels) */
2692 uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2693 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2694 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2696 for (n = 0, y = 0; y < end_y; y += step1d) {
2697 uint8_t *ptr = dst, *ptr_r = dst_r;
2698 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2699 ptr_r += 4 * step1d * bytesperpixel, n += step) {
/* sub-8x8 blocks carry a per-4x4 mode; otherwise use mode[0] */
2700 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2702 uint8_t *a = &a_buf[32];
2703 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2704 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2706 mode = check_intra_mode(s, mode, &a, ptr_r,
2707 s->frames[CUR_FRAME].tf.f->linesize[0],
2708 ptr, s->y_stride, l,
2709 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2710 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2712 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2713 s->block + 16 * n * bytesperpixel, eob);
2715 dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2716 dst += 4 * step1d * s->y_stride;
/* chroma: same walk at the uv transform granularity, DCT_DCT only */
2723 step = 1 << (b->uvtx * 2);
2724 for (p = 0; p < 2; p++) {
2725 dst = s->dst[1 + p];
2726 dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2727 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2728 uint8_t *ptr = dst, *ptr_r = dst_r;
2729 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2730 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2731 int mode = b->uvmode;
2732 uint8_t *a = &a_buf[32];
2733 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2735 mode = check_intra_mode(s, mode, &a, ptr_r,
2736 s->frames[CUR_FRAME].tf.f->linesize[1],
2737 ptr, s->uv_stride, l, col, x, w4, row, y,
2738 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2739 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2741 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2742 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2744 dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2745 dst += 4 * uvstep1d * s->uv_stride;
2750 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2752 intra_recon(ctx, y_off, uv_off, 1);
2755 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2757 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation when the reference frame has a different size
 * than the current frame (scaled prediction). Clips the motion vector,
 * scales position and MV by the 14-bit scale factors, waits for the
 * reference sbrow to be decoded (frame-threading), and falls back to
 * emulated_edge_mc when the reference read would cross the frame border.
 * NOTE(review): this extract is elided — declarations of mv/mx/my/th and
 * some braces are not visible here.
 */
2760 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2761 uint8_t *dst, ptrdiff_t dst_stride,
2762 const uint8_t *ref, ptrdiff_t ref_stride,
2763 ThreadFrame *ref_frame,
2764 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2765 int px, int py, int pw, int ph,
2766 int bw, int bh, int w, int h, int bytesperpixel,
2767 const uint16_t *scale, const uint8_t *step)
/* 14-bit fixed-point scale; 64-bit widening avoids intermediate overflow */
2769 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2771 int refbw_m1, refbh_m1;
2775 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2776 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2777 // BUG libvpx seems to scale the two components separately. This introduces
2778 // rounding errors but we have to reproduce them to be exactly compatible
2779 // with the output from libvpx...
2780 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2781 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2785 ref += y * ref_stride + x * bytesperpixel;
/* last reference row/col touched, given subpel position and step */
2788 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2789 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2790 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2791 // we use +7 because the last 7 pixels of each sbrow can be changed in
2792 // the longest loopfilter of the next sbrow
2793 th = (y + refbh_m1 + 4 + 7) >> 6;
2794 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2795 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2796 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2797 ref - 3 * ref_stride - 3 * bytesperpixel,
2799 refbw_m1 + 8, refbh_m1 + 8,
2800 x - 3, y - 3, w, h);
/* 288 = edge_emu_buffer stride used by the scaled MC path */
2801 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2804 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma variant of mc_luma_scaled(): handles both chroma planes at once
 * and reproduces the libvpx chroma-MV rounding bug (webm issue 820) when
 * the plane is subsampled in a given direction.
 * NOTE(review): this extract is elided — declarations of mv/mx/my/th, the
 * ss_h/ss_v branch headers and some braces are not visible here.
 */
2807 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2808 uint8_t *dst_u, uint8_t *dst_v,
2809 ptrdiff_t dst_stride,
2810 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2811 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2812 ThreadFrame *ref_frame,
2813 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2814 int px, int py, int pw, int ph,
2815 int bw, int bh, int w, int h, int bytesperpixel,
2816 const uint16_t *scale, const uint8_t *step)
2819 int refbw_m1, refbh_m1;
/* subsampled-x path: replicate the libvpx rounding behaviour exactly */
2824 // BUG https://code.google.com/p/webm/issues/detail?id=820
2825 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2826 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2828 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2829 mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2832 // BUG https://code.google.com/p/webm/issues/detail?id=820
2833 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2834 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2836 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2837 my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2842 ref_u += y * src_stride_u + x * bytesperpixel;
2843 ref_v += y * src_stride_v + x * bytesperpixel;
2846 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2847 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2848 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2849 // we use +7 because the last 7 pixels of each sbrow can be changed in
2850 // the longest loopfilter of the next sbrow
/* progress is tracked in luma rows, hence the extra ss_v shift */
2851 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2852 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2853 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
/* border case: both planes go through the edge-emulation buffer */
2854 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2855 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2857 refbw_m1 + 8, refbh_m1 + 8,
2858 x - 3, y - 3, w, h);
2859 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2860 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2862 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2863 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2865 refbw_m1 + 8, refbh_m1 + 8,
2866 x - 3, y - 3, w, h);
2867 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2868 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2870 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2871 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2875 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2876 px, py, pw, ph, bw, bh, w, h, i) \
2877 mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
2878 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2879 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2880 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2881 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2882 mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2883 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2884 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2886 #define FN(x) x##_scaled_8bpp
2887 #define BYTES_PER_PIXEL 1
2888 #include "vp9_mc_template.c"
2890 #undef BYTES_PER_PIXEL
2891 #define FN(x) x##_scaled_16bpp
2892 #define BYTES_PER_PIXEL 2
2893 #include "vp9_mc_template.c"
2895 #undef mc_chroma_dir
2897 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation when the reference frame matches the current
 * frame size (no scaling). Waits for the needed reference sbrow
 * (frame-threading) and uses emulated_edge_mc only when the subpel filter
 * footprint (3 pixels before / 4 after, per nonzero MV component) would
 * read outside the frame.
 * NOTE(review): this extract is elided — a few lines (braces) of the
 * original function are not visible here.
 */
2900 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2901 uint8_t *dst, ptrdiff_t dst_stride,
2902 const uint8_t *ref, ptrdiff_t ref_stride,
2903 ThreadFrame *ref_frame,
2904 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2905 int bw, int bh, int w, int h, int bytesperpixel)
2907 int mx = mv->x, my = mv->y, th;
2911 ref += y * ref_stride + x * bytesperpixel;
2914 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2915 // we use +7 because the last 7 pixels of each sbrow can be changed in
2916 // the longest loopfilter of the next sbrow
2917 th = (y + bh + 4 * !!my + 7) >> 6;
2918 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* !!mx / !!my: the 3/4-pixel margin is only needed for subpel motion */
2919 if (x < !!mx * 3 || y < !!my * 3 ||
2920 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2921 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2922 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2924 bw + !!mx * 7, bh + !!my * 7,
2925 x - !!mx * 3, y - !!my * 3, w, h);
/* 160 = edge_emu_buffer stride used by the unscaled MC path */
2926 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
/* MVs are in 1/8-pel; the mc functions take 1/16-pel, hence << 1 */
2929 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma variant of mc_luma_unscaled(): processes both chroma planes with
 * the same motion vector, pre-shifted by the subsampling factors.
 * NOTE(review): this extract is elided — a few lines (braces) of the
 * original function are not visible here.
 */
2932 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2933 uint8_t *dst_u, uint8_t *dst_v,
2934 ptrdiff_t dst_stride,
2935 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2936 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2937 ThreadFrame *ref_frame,
2938 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2939 int bw, int bh, int w, int h, int bytesperpixel)
/* double the MV component for any non-subsampled direction */
2941 int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2945 ref_u += y * src_stride_u + x * bytesperpixel;
2946 ref_v += y * src_stride_v + x * bytesperpixel;
2949 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2950 // we use +7 because the last 7 pixels of each sbrow can be changed in
2951 // the longest loopfilter of the next sbrow
/* progress is tracked in luma rows, hence the extra ss_v shift */
2952 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2953 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2954 if (x < !!mx * 3 || y < !!my * 3 ||
2955 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
/* border case: route each plane through the edge-emulation buffer */
2956 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2957 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2959 bw + !!mx * 7, bh + !!my * 7,
2960 x - !!mx * 3, y - !!my * 3, w, h);
2961 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2962 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2964 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2965 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2967 bw + !!mx * 7, bh + !!my * 7,
2968 x - !!mx * 3, y - !!my * 3, w, h);
2969 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2970 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2972 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2973 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2977 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2978 px, py, pw, ph, bw, bh, w, h, i) \
2979 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2980 mv, bw, bh, w, h, bytesperpixel)
2981 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2982 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2983 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2984 row, col, mv, bw, bh, w, h, bytesperpixel)
2986 #define FN(x) x##_8bpp
2987 #define BYTES_PER_PIXEL 1
2988 #include "vp9_mc_template.c"
2990 #undef BYTES_PER_PIXEL
2991 #define FN(x) x##_16bpp
2992 #define BYTES_PER_PIXEL 2
2993 #include "vp9_mc_template.c"
2994 #undef mc_luma_dir_dir
2995 #undef mc_chroma_dir_dir
2997 #undef BYTES_PER_PIXEL
/*
 * Inter reconstruction: run motion-compensated prediction (scaled variant
 * when the reference has nonzero mvscale factors), then add the decoded
 * residuals (always DCT_DCT for inter blocks) on both luma and chroma.
 * NOTE(review): this extract is elided — a few lines (braces, skip checks)
 * of the original function are not visible here.
 */
3000 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3002 VP9Context *s = ctx->priv_data;
3004 int row = s->row, col = s->col;
/* a nonzero mvscale[ref][0] marks a reference of a different resolution */
3006 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3007 if (bytesperpixel == 1) {
3008 inter_pred_scaled_8bpp(ctx);
3010 inter_pred_scaled_16bpp(ctx);
3013 if (bytesperpixel == 1) {
3014 inter_pred_8bpp(ctx);
3016 inter_pred_16bpp(ctx);
3020 /* mostly copied intra_recon() */
3022 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3023 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3024 int end_x = FFMIN(2 * (s->cols - col), w4);
3025 int end_y = FFMIN(2 * (s->rows - row), h4);
3026 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
3027 int uvstep1d = 1 << b->uvtx, p;
3028 uint8_t *dst = s->dst[0];
/* luma residual add, one inverse transform per sub-block */
3031 for (n = 0, y = 0; y < end_y; y += step1d) {
3033 for (x = 0; x < end_x; x += step1d,
3034 ptr += 4 * step1d * bytesperpixel, n += step) {
3035 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3038 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3039 s->block + 16 * n * bytesperpixel, eob);
3041 dst += 4 * s->y_stride * step1d;
/* chroma residual add at the uv transform granularity */
3047 step = 1 << (b->uvtx * 2);
3048 for (p = 0; p < 2; p++) {
3049 dst = s->dst[p + 1];
3050 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3052 for (x = 0; x < end_x; x += uvstep1d,
3053 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3054 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3057 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3058 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3060 dst += 4 * uvstep1d * s->uv_stride;
3066 static void inter_recon_8bpp(AVCodecContext *ctx)
3068 inter_recon(ctx, 1);
3071 static void inter_recon_16bpp(AVCodecContext *ctx)
3073 inter_recon(ctx, 2);
/*
 * Accumulate loop-filter edge bitmasks for one block into mask[2][8][4]:
 * mask[0] holds column (vertical-edge) masks, mask[1] row (horizontal-edge)
 * masks, per 8-pixel row, per filter width class (16/8/4/inner4).
 * row_and_7/col_and_7 are the block position within its 64x64 superblock;
 * w/h the block size in 8-pixel units; col_end/row_end flag odd partial
 * edges at the frame border for subsampled planes.
 * NOTE(review): this extract is elided — some lines (the TX_4X4+subsampling
 * adjustment body, step1d init, some braces) are not visible here.
 */
3076 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3077 int row_and_7, int col_and_7,
3078 int w, int h, int col_end, int row_end,
3079 enum TxfmMode tx, int skip_inter)
3081 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3082 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3084 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3085 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3086 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3087 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3089 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3090 // edges. This means that for UV, we work on two subsampled blocks at
3091 // a time, and we only use the topleft block's mode information to set
3092 // things like block strength. Thus, for any block size smaller than
3093 // 16x16, ignore the odd portion of the block.
3094 if (tx == TX_4X4 && (ss_v | ss_h)) {
/* TX_4X4 inter blocks with coefficients: filter every 4-px edge */
3109 if (tx == TX_4X4 && !skip_inter) {
3110 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3111 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3112 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3114 for (y = row_and_7; y < h + row_and_7; y++) {
3115 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3117 mask[0][y][1] |= m_row_8;
3118 mask[0][y][2] |= m_row_4;
3119 // for odd lines, if the odd col is not being filtered,
3120 // skip odd row also:
3127 // if a/c are even row/col and b/d are odd, and d is skipped,
3128 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3129 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3130 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3132 mask[1][y][col_mask_id] |= m_col;
/* inner 4x4 edges inside the block */
3135 mask[0][y][3] |= m_col;
3137 if (ss_h && (col_end & 1))
3138 mask[1][y][3] |= (t << (w - 1)) - t;
3140 mask[1][y][3] |= m_col;
/* larger transforms (or skipped TX_4X4): only block-boundary edges */
3144 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3147 int mask_id = (tx == TX_8X8);
3148 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3149 int l2 = tx + ss_h - 1, step1d;
3150 int m_row = m_col & masks[l2];
3152 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3153 // 8wd loopfilter to prevent going off the visible edge.
3154 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3155 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3156 int m_row_8 = m_row - m_row_16;
3158 for (y = row_and_7; y < h + row_and_7; y++) {
3159 mask[0][y][0] |= m_row_16;
3160 mask[0][y][1] |= m_row_8;
3163 for (y = row_and_7; y < h + row_and_7; y++)
3164 mask[0][y][mask_id] |= m_row;
/* same odd-edge special case, vertical direction */
3169 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3170 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3171 mask[1][y][0] |= m_col;
3172 if (y - row_and_7 == h - 1)
3173 mask[1][y][1] |= m_col;
3175 for (y = row_and_7; y < h + row_and_7; y += step1d)
3176 mask[1][y][mask_id] |= m_col;
3178 } else if (tx != TX_4X4) {
3181 mask_id = (tx == TX_8X8) || (h == ss_v);
3182 mask[1][row_and_7][mask_id] |= m_col;
3183 mask_id = (tx == TX_8X8) || (w == ss_h);
3184 for (y = row_and_7; y < h + row_and_7; y++)
3185 mask[0][y][mask_id] |= t;
/* skipped TX_4X4: only the outer block boundary, width per 32-px rule */
3187 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3189 for (y = row_and_7; y < h + row_and_7; y++) {
3190 mask[0][y][2] |= t4;
3191 mask[0][y][1] |= t8;
3193 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/*
 * Decode and reconstruct one block: set the MV clipping range, decode block
 * mode/coefficients, reconstruct (intra or inter, with a temporary-buffer
 * path when the block overhangs the picture/stride), then record loop-filter
 * level and edge masks for this block.
 * NOTE(review): this extract is elided — the parallel/skip early-out paths,
 * several branch headers and braces of the original function are missing.
 */
3198 static void decode_b(AVCodecContext *ctx, int row, int col,
3199 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3200 enum BlockLevel bl, enum BlockPartition bp)
3202 VP9Context *s = ctx->priv_data;
3204 enum BlockSize bs = bl * 3 + bp;
3205 int bytesperpixel = s->bytesperpixel;
3206 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3208 AVFrame *f = s->frames[CUR_FRAME].tf.f;
/* MV range in 1/8-pel units: 128 subpel + block offset in the frame */
3214 s->min_mv.x = -(128 + col * 64);
3215 s->min_mv.y = -(128 + row * 64);
3216 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3217 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
/* chroma tx is one step smaller when the block is too narrow/short for
 * the luma tx in a subsampled direction */
3223 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3224 (s->ss_v && h4 * 2 == (1 << b->tx)));
3229 if (bytesperpixel == 1) {
3230 has_coeffs = decode_coeffs_8bpp(ctx);
3232 has_coeffs = decode_coeffs_16bpp(ctx);
/* an all-zero inter sub-8x8 block behaves like a skipped block for
 * the contexts below */
3234 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3236 memset(&s->above_skip_ctx[col], 1, w4);
3237 memset(&s->left_skip_ctx[s->row7], 1, h4);
/* clear 1/2/4/8/16 context bytes with one aligned store */
3242 #define SPLAT_ZERO_CTX(v, n) \
3244 case 1: v = 0; break; \
3245 case 2: AV_ZERO16(&v); break; \
3246 case 4: AV_ZERO32(&v); break; \
3247 case 8: AV_ZERO64(&v); break; \
3248 case 16: AV_ZERO128(&v); break; \
3250 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3252 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3253 if (s->ss_##dir2) { \
3254 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3255 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3257 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3258 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3263 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3264 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3265 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3266 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3269 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3270 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3271 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3272 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
/* advance the per-block coefficient/eob buffers */
3277 s->block += w4 * h4 * 64 * bytesperpixel;
3278 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3279 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3280 s->eob += 4 * w4 * h4;
3281 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3282 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3288 // emulated overhangs if the stride of the target buffer can't hold. This
3289 // allows to support emu-edge and so on even if we have large block
3291 emu[0] = (col + w4) * 8 > f->linesize[0] ||
3292 (row + h4) > s->rows;
3293 emu[1] = (col + w4) * 4 > f->linesize[1] ||
3294 (row + h4) > s->rows;
/* overhanging block: reconstruct into s->tmp_* and copy back below */
3296 s->dst[0] = s->tmp_y;
3299 s->dst[0] = f->data[0] + yoff;
3300 s->y_stride = f->linesize[0];
3303 s->dst[1] = s->tmp_uv[0];
3304 s->dst[2] = s->tmp_uv[1];
3307 s->dst[1] = f->data[1] + uvoff;
3308 s->dst[2] = f->data[2] + uvoff;
3309 s->uv_stride = f->linesize[1];
3313 intra_recon_16bpp(ctx, yoff, uvoff);
3315 intra_recon_8bpp(ctx, yoff, uvoff);
3319 inter_recon_16bpp(ctx);
3321 inter_recon_8bpp(ctx);
/* copy the luma temporary buffer back, in power-of-two width chunks
 * via the identity (fullpel, size-indexed) mc functions */
3325 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3327 for (n = 0; o < w; n++) {
3332 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3333 s->tmp_y + o, 128, h, 0, 0);
3334 o += bw * bytesperpixel;
/* same copy-back for the two chroma planes */
3339 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3340 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3342 for (n = s->ss_h; o < w; n++) {
3347 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3348 s->tmp_uv[0] + o, 128, h, 0, 0);
3349 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3350 s->tmp_uv[1] + o, 128, h, 0, 0);
3351 o += bw * bytesperpixel;
3356 // pick filter level and find edges to apply filter to
3357 if (s->filter.level &&
3358 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3359 [b->mode[3] != ZEROMV]) > 0) {
3360 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3361 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3363 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3364 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3365 if (s->ss_h || s->ss_v)
3366 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3367 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3368 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3369 b->uvtx, skip_inter);
/* lazily fill the sharpness-dependent limit LUTs for this level */
3371 if (!s->filter.lim_lut[lvl]) {
3372 int sharp = s->filter.sharpness;
3376 limit >>= (sharp + 3) >> 2;
3377 limit = FFMIN(limit, 9 - sharp);
3379 limit = FFMAX(limit, 1);
3381 s->filter.lim_lut[lvl] = limit;
3382 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
/* advance coefficient buffers past this block (non-parallel path) */
3388 s->block += w4 * h4 * 64 * bytesperpixel;
3389 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3390 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3391 s->eob += 4 * w4 * h4;
3392 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3393 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
// Recursively decode one partition level of a superblock (BL_64X64 down to
// BL_8X8). The partition mode is read from the bitstream, conditioned on the
// above/left partition contexts; leaf blocks go to decode_b(), PARTITION_SPLIT
// recurses one level deeper. yoff/uvoff are byte offsets into the luma/chroma
// planes of the current frame.
// NOTE(review): this listing is elided — several lines (braces, switch header,
// some case labels / assignments) are not visible here.
3397 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3398 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3400 VP9Context *s = ctx->priv_data;
// partition probability context: bit 0 from the row above, bit 1 from the
// column to the left (one bit per level, hence the (3 - bl) shift)
3401 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3402 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// keyframes / intra-only frames use the fixed default partition probabilities
3403 const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
3404 s->prob.p.partition[bl][c];
3405 enum BlockPartition bp;
// hbs = half the current block size, in units of 8x8 blocks
3406 ptrdiff_t hbs = 4 >> bl;
3407 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3408 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3409 int bytesperpixel = s->bytesperpixel;
// smallest level: read the mode and decode a single leaf block
3412 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3413 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3414 } else if (col + hbs < s->cols) { // FIXME why not <=?
3415 if (row + hbs < s->rows) { // FIXME why not <=?
// block fully inside the frame: all four partition modes are possible
3416 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3418 case PARTITION_NONE:
3419 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// horizontal split: top half, then bottom half one half-block further down
3422 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3423 yoff += hbs * 8 * y_stride;
3424 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3425 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// vertical split: left half, then right half one half-block to the right
3428 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3429 yoff += hbs * 8 * bytesperpixel;
3430 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3431 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3433 case PARTITION_SPLIT:
// recurse into all four quadrants at the next-finer level
3434 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3435 decode_sb(ctx, row, col + hbs, lflvl,
3436 yoff + 8 * hbs * bytesperpixel,
3437 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3438 yoff += hbs * 8 * y_stride;
3439 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3440 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3441 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3442 yoff + 8 * hbs * bytesperpixel,
3443 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// block is clipped by the bottom frame edge: only a split into the two top
// quadrants, or a single (vertically clipped) block, are possible
3448 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3449 bp = PARTITION_SPLIT;
3450 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3451 decode_sb(ctx, row, col + hbs, lflvl,
3452 yoff + 8 * hbs * bytesperpixel,
3453 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3456 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// block is clipped by the right frame edge: split into the two left
// quadrants, or a single (horizontally clipped) block
3458 } else if (row + hbs < s->rows) { // FIXME why not <=?
3459 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3460 bp = PARTITION_SPLIT;
3461 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3462 yoff += hbs * 8 * y_stride;
3463 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3464 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3467 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// clipped on both edges: forced split, only the top-left quadrant exists
3470 bp = PARTITION_SPLIT;
3471 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// record the chosen partition for backward probability adaptation
3473 s->counts.partition[bl][c][bp]++;
// Second-pass variant of decode_sb(): instead of reading partition modes from
// the bitstream, it replays the block structure recorded during the first
// pass (b->bl / b->bp from s->b), walking the stored block array in the same
// recursive order. Used for 2-pass frame-threaded decoding.
3476 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3477 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3479 VP9Context *s = ctx->priv_data;
// hbs = half the current block size, in units of 8x8 blocks
3481 ptrdiff_t hbs = 4 >> bl;
3482 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3483 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3484 int bytesperpixel = s->bytesperpixel;
// leaf at the smallest level: must have been stored as an 8x8 block
3487 av_assert2(b->bl == BL_8X8);
3488 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3489 } else if (s->b->bl == bl) {
// stored block lives at this level: decode it, plus its H/V sibling if the
// recorded partition split it in two and the sibling is inside the frame
3490 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3491 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3492 yoff += hbs * 8 * y_stride;
3493 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3494 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3495 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3496 yoff += hbs * 8 * bytesperpixel;
3497 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3498 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// stored block is at a finer level: recurse into the quadrants that exist
// (edge-clipped quadrants are skipped, mirroring decode_sb)
3501 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3502 if (col + hbs < s->cols) { // FIXME why not <=?
3503 if (row + hbs < s->rows) {
3504 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3505 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3506 yoff += hbs * 8 * y_stride;
3507 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3508 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3509 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3510 yoff + 8 * hbs * bytesperpixel,
3511 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3513 yoff += hbs * 8 * bytesperpixel;
3514 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3515 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3517 } else if (row + hbs < s->rows) {
3518 yoff += hbs * 8 * y_stride;
3519 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3520 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Loop-filter the vertical edges (between horizontally adjacent blocks) of
// one 64x64 superblock for a single plane. lvl holds per-8x8-block filter
// levels; mask[y][i] is a bitmask of columns in row y whose edge needs a
// size-i filter (0=16px, 1=8px, 2=4px, 3=inner-4px — see VP9Filter); ls is
// the plane linesize, ss_h/ss_v the chroma subsampling shifts.
3525 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3526 uint8_t *lvl, uint8_t (*mask)[4],
3527 uint8_t *dst, ptrdiff_t ls)
3529 int y, x, bytesperpixel = s->bytesperpixel;
3531 // filter edges between columns (e.g. block1 | block2)
// process two 8px rows per iteration (one for subsampled chroma)
3532 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3533 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
// hm1/hm2: any-size edge masks for the upper/lower 8px row; *3 = inner-4 masks
3534 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3535 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3536 unsigned hm = hm1 | hm2 | hm13 | hm23;
// walk columns left to right; stop once no mask bit at/after x remains
3538 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
// L = filter level for this block, H = L >> 4 (threshold derived from the
// level); E/I come from the precomputed edge/interior limit LUTs
3541 int L = *l, H = L >> 4;
3542 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3544 if (hmask1[0] & x) {
3545 if (hmask2[0] & x) {
// two vertically adjacent 16-wide edges with identical level: one 16px call
3546 av_assert2(l[8 << ss_v] == L);
3547 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3549 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3551 } else if (hm2 & x) {
// different levels above/below: pack both into E/I and use the mix2 filter
3554 E |= s->filter.mblim_lut[L] << 8;
3555 I |= s->filter.lim_lut[L] << 8;
3556 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3558 [0](ptr, ls, E, I, H);
3560 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3561 [0](ptr, ls, E, I, H);
3563 } else if (hm2 & x) {
// only the lower 8px row needs filtering at this column
3564 int L = l[8 << ss_v], H = L >> 4;
3565 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3567 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3568 [0](ptr + 8 * ls, ls, E, I, H);
// inner 4px edges (offset half a block to the right of the block edge)
3576 int L = *l, H = L >> 4;
3577 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3582 E |= s->filter.mblim_lut[L] << 8;
3583 I |= s->filter.lim_lut[L] << 8;
3584 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3586 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3588 } else if (hm23 & x) {
3589 int L = l[8 << ss_v], H = L >> 4;
3590 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3592 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
// Loop-filter the horizontal edges (between vertically adjacent blocks) of
// one 64x64 superblock for a single plane — the row-direction counterpart of
// filter_plane_cols(). mask[y][i] bits select columns needing a size-i filter
// (0=16px, 1=8px, 2=4px, 3=inner-4px); lvl holds per-8x8 filter levels.
3600 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3601 uint8_t *lvl, uint8_t (*mask)[4],
3602 uint8_t *dst, ptrdiff_t ls)
3604 int y, x, bytesperpixel = s->bytesperpixel;
3607 // filter edges between rows (e.g. ------)
3609 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3610 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
// vm: any-size edge mask for this row; vm3: inner-4px edge mask
3611 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// step two 8px columns at a time; mask bits for the second column are the
// same bits shifted left by (1 + ss_h)
3613 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
// L = filter level, H = L >> 4; E/I from the precomputed limit LUTs
3616 int L = *l, H = L >> 4;
3617 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3620 if (vmask[0] & (x << (1 + ss_h))) {
// two horizontally adjacent 16-wide edges with equal level: one 16px call
3621 av_assert2(l[1 + ss_h] == L);
3622 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3624 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3626 } else if (vm & (x << (1 + ss_h))) {
// differing levels left/right: pack both into E/I for the mix2 filter
3629 E |= s->filter.mblim_lut[L] << 8;
3630 I |= s->filter.lim_lut[L] << 8;
3631 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3632 [!!(vmask[1] & (x << (1 + ss_h)))]
3633 [1](ptr, ls, E, I, H);
3635 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3636 [1](ptr, ls, E, I, H);
3638 } else if (vm & (x << (1 + ss_h))) {
// only the right-hand 8px column needs filtering here
3639 int L = l[1 + ss_h], H = L >> 4;
3640 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3642 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3643 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// inner 4px edges (offset half a block below the block edge)
3648 int L = *l, H = L >> 4;
3649 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3651 if (vm3 & (x << (1 + ss_h))) {
3654 E |= s->filter.mblim_lut[L] << 8;
3655 I |= s->filter.lim_lut[L] << 8;
3656 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3658 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3660 } else if (vm3 & (x << (1 + ss_h))) {
3661 int L = l[1 + ss_h], H = L >> 4;
3662 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3664 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
// Apply the in-loop deblocking filter to one 64x64 superblock: first the luma
// plane (masks in lflvl->mask[0]), then both chroma planes. For subsampled
// chroma the uv masks live in lflvl->mask[1] (index s->ss_h | s->ss_v, cf.
// the VP9Filter mask layout: [0=y,1=uv][0=col,1=row]).
3677 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3678 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3680 VP9Context *s = ctx->priv_data;
3681 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3682 uint8_t *dst = f->data[0] + yoff;
3683 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3684 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3687 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3688 // if you think of them as acting on a 8x8 block max, we can interleave
3689 // each v/h within the single x loop, but that only works if we work on
3690 // 8 pixel blocks, and we won't always do that (we want at least 16px
3691 // to use SSE2 optimizations, perhaps 32 for AVX2)
// luma: column edges first, then row edges (order matters for deblocking)
3693 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3694 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
// both chroma planes share the same level array and uv masks
3696 for (p = 0; p < 2; p++) {
3697 dst = f->data[1 + p] + uvoff;
3698 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3699 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
// Compute the [start, end) extent of tile number idx when n superblocks are
// divided into 1 << log2_n tiles. Results are clamped to n and scaled by 8,
// converting 64px-superblock units into 8x8-block (mi) units.
3703 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3705 int sb_start = ( idx * n) >> log2_n;
3706 int sb_end = ((idx + 1) * n) >> log2_n;
3707 *start = FFMIN(sb_start, n) << 3;
3708 *end = FFMIN(sb_end, n) << 3;
// Backward-adapt one binary probability *p toward the probability observed in
// this frame (ct0 "zero" events out of ct0+ct1 total), blending old and new
// values by update_factor scaled with how many events were seen (saturating
// at max_count). NOTE(review): the early-out for ct == 0 and the load of the
// old probability into p1 are on lines elided from this listing.
3711 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3712 int max_count, int update_factor)
3714 unsigned ct = ct0 + ct1, p2, p1;
// p2 = observed probability in 1/256 units, rounded, clipped to [1, 255]
3720 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3721 p2 = av_clip(p2, 1, 255);
// scale the blend weight by how much evidence we have, capped at max_count
3722 ct = FFMIN(ct, max_count);
3723 update_factor = FASTDIV(update_factor * ct, max_count);
3725 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3726 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
// End-of-frame backward probability adaptation: fold this frame's symbol
// counts (s->counts.*) into the frame context s->prob_ctx[s->framectxid],
// one adapt_prob() call per tree node. Sections mirror the syntax elements:
// coefficients, skip, intra/inter, compound prediction, references,
// partitioning, tx sizes, interpolation filter, mv modes/joints/components,
// and finally luma/chroma intra modes.
3729 static void adapt_probs(VP9Context *s)
3732 prob_context *p = &s->prob_ctx[s->framectxid].p;
// coefficient probs adapt faster (112) after a keyframe/intra/reset frame
3733 int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
// coefficient probabilities, indexed [tx size][plane type][ref][band][ctx]
3736 for (i = 0; i < 4; i++)
3737 for (j = 0; j < 2; j++)
3738 for (k = 0; k < 2; k++)
3739 for (l = 0; l < 6; l++)
3740 for (m = 0; m < 6; m++) {
3741 uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3742 unsigned *e = s->counts.eob[i][j][k][l][m];
3743 unsigned *c = s->counts.coef[i][j][k][l][m];
3745 if (l == 0 && m >= 3) // dc only has 3 pt
3748 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3749 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3750 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// keyframes / intra-only frames: copy the skip/tx probs over unchanged
// (there are no inter-frame statistics to adapt the remaining probs from)
3753 if (s->keyframe || s->intraonly) {
3754 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3755 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3756 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3757 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// skip flag
3762 for (i = 0; i < 3; i++)
3763 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// intra vs inter flag
3766 for (i = 0; i < 4; i++)
3767 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// compound-vs-single prediction mode (only when signalled per block)
3770 if (s->comppredmode == PRED_SWITCHABLE) {
3771 for (i = 0; i < 5; i++)
3772 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// compound reference selection
3776 if (s->comppredmode != PRED_SINGLEREF) {
3777 for (i = 0; i < 5; i++)
3778 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3779 s->counts.comp_ref[i][1], 20, 128);
// single reference selection (2-node tree)
3782 if (s->comppredmode != PRED_COMPREF) {
3783 for (i = 0; i < 5; i++) {
3784 uint8_t *pp = p->single_ref[i];
3785 unsigned (*c)[2] = s->counts.single_ref[i];
3787 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3788 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3792 // block partitioning
3793 for (i = 0; i < 4; i++)
3794 for (j = 0; j < 4; j++) {
3795 uint8_t *pp = p->partition[i][j];
3796 unsigned *c = s->counts.partition[i][j];
3798 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3799 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3800 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// transform size selection (per max-allowed-size tree)
3804 if (s->txfmmode == TX_SWITCHABLE) {
3805 for (i = 0; i < 2; i++) {
3806 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3808 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3809 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3810 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3811 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3812 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3813 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3817 // interpolation filter
3818 if (s->filtermode == FILTER_SWITCHABLE) {
3819 for (i = 0; i < 4; i++) {
3820 uint8_t *pp = p->filter[i];
3821 unsigned *c = s->counts.filter[i];
3823 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3824 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// inter prediction modes (ZEROMV/NEARESTMV/NEARMV/NEWMV tree)
3829 for (i = 0; i < 7; i++) {
3830 uint8_t *pp = p->mv_mode[i];
3831 unsigned *c = s->counts.mv_mode[i];
3833 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3834 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3835 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// mv joint (which of the two mv components are nonzero)
3840 uint8_t *pp = p->mv_joint;
3841 unsigned *c = s->counts.mv_joint;
3843 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3844 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3845 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// per-component (row/col) mv probabilities
3849 for (i = 0; i < 2; i++) {
3851 unsigned *c, (*c2)[2], sum;
3853 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3854 s->counts.mv_comp[i].sign[1], 20, 128);
// magnitude class tree: each node's probability is adapted against the
// running total of the remaining classes
3856 pp = p->mv_comp[i].classes;
3857 c = s->counts.mv_comp[i].classes;
3858 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3859 adapt_prob(&pp[0], c[0], sum, 20, 128);
3861 adapt_prob(&pp[1], c[1], sum, 20, 128);
3863 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3864 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3866 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3867 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3869 adapt_prob(&pp[6], c[6], sum, 20, 128);
3870 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3871 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3872 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3874 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3875 s->counts.mv_comp[i].class0[1], 20, 128);
// integer magnitude bits
3876 pp = p->mv_comp[i].bits;
3877 c2 = s->counts.mv_comp[i].bits;
3878 for (j = 0; j < 10; j++)
3879 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// fractional-pel part, separately for class0 and the general case
3881 for (j = 0; j < 2; j++) {
3882 pp = p->mv_comp[i].class0_fp[j];
3883 c = s->counts.mv_comp[i].class0_fp[j];
3884 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3885 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3886 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3888 pp = p->mv_comp[i].fp;
3889 c = s->counts.mv_comp[i].fp;
3890 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3891 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3892 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// eighth-pel bit only adapts when high-precision mvs were enabled
3894 if (s->highprecisionmvs) {
3895 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3896 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3897 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3898 s->counts.mv_comp[i].hp[1], 20, 128);
// luma intra mode tree; sum tracks the not-yet-coded remainder at each node
3903 for (i = 0; i < 4; i++) {
3904 uint8_t *pp = p->y_mode[i];
3905 unsigned *c = s->counts.y_mode[i], sum, s2;
3907 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3908 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3909 sum -= c[TM_VP8_PRED];
3910 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3911 sum -= c[VERT_PRED];
3912 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3913 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3915 adapt_prob(&pp[3], s2, sum, 20, 128);
3917 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3918 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3919 sum -= c[DIAG_DOWN_LEFT_PRED];
3920 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3921 sum -= c[VERT_LEFT_PRED];
3922 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3923 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// chroma intra mode tree (same structure, conditioned on the luma mode)
3927 for (i = 0; i < 10; i++) {
3928 uint8_t *pp = p->uv_mode[i];
3929 unsigned *c = s->counts.uv_mode[i], sum, s2;
3931 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3932 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3933 sum -= c[TM_VP8_PRED];
3934 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3935 sum -= c[VERT_PRED];
3936 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3937 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3939 adapt_prob(&pp[3], s2, sum, 20, 128);
3941 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3942 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3943 sum -= c[DIAG_DOWN_LEFT_PRED];
3944 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3945 sum -= c[VERT_LEFT_PRED];
3946 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3947 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Release the decoder's per-resolution scratch buffers: the intra-prediction
// edge rows, the per-block struct array and the coefficient block base.
// (av_freep also NULLs the pointers, so this is safe to call repeatedly.)
3951 static void free_buffers(VP9Context *s)
3953 av_freep(&s->intra_pred_data[0]);
3954 av_freep(&s->b_base);
3955 av_freep(&s->block_base);
// Codec close callback: unref and free the three internal frames
// (CUR_FRAME and the MVPAIR/SEGMAP reference frames) and all eight
// reference slots of both the current and the next generation.
3958 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3960 VP9Context *s = ctx->priv_data;
3963 for (i = 0; i < 3; i++) {
3964 if (s->frames[i].tf.f->data[0])
3965 vp9_unref_frame(ctx, &s->frames[i]);
3966 av_frame_free(&s->frames[i].tf.f);
3968 for (i = 0; i < 8; i++) {
3969 if (s->refs[i].f->data[0])
3970 ff_thread_release_buffer(ctx, &s->refs[i]);
3971 av_frame_free(&s->refs[i].f);
3972 if (s->next_refs[i].f->data[0])
3973 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3974 av_frame_free(&s->next_refs[i].f);
// Main decode entry point: parse the frame header, manage the internal and
// reference frame buffers, decode all tiles superblock-row by superblock-row
// (optionally in two passes for frame threading), run the loop filter, and
// output the frame if it is visible.
3984 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3985 int *got_frame, AVPacket *pkt)
3987 const uint8_t *data = pkt->data;
3988 int size = pkt->size;
3989 VP9Context *s = ctx->priv_data;
3990 int res, tile_row, tile_col, i, ref, row, col;
// keep the previous segmentation map when the header doesn't update it
3991 int retain_segmap_ref = s->segmentation.enabled && !s->segmentation.update_map;
3992 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3996 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0: "show existing frame" — output a reference frame directly,
// carry the ref slots over unchanged, and decode nothing
3998 } else if (res == 0) {
3999 if (!s->refs[ref].f->data[0]) {
4000 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4001 return AVERROR_INVALIDDATA;
4003 if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
4005 ((AVFrame *)frame)->pkt_pts = pkt->pts;
4006 ((AVFrame *)frame)->pkt_dts = pkt->dts;
4007 for (i = 0; i < 8; i++) {
4008 if (s->next_refs[i].f->data[0])
4009 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4010 if (s->refs[i].f->data[0] &&
4011 (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
// rotate the internal frames: previous CUR_FRAME becomes the SEGMAP and
// MVPAIR reference frames (for segmentation prediction and mv prediction),
// then allocate a fresh CUR_FRAME
4020 if (!retain_segmap_ref) {
4021 if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
4022 vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4023 if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4024 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
4027 if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
4028 vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
4029 if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4030 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
4032 if (s->frames[CUR_FRAME].tf.f->data[0])
4033 vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
4034 if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
4036 f = s->frames[CUR_FRAME].tf.f;
4037 f->key_frame = s->keyframe;
4038 f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4039 ls_y = f->linesize[0];
4040 ls_uv =f->linesize[1];
// set up next_refs: slots flagged in refreshrefmask point at the new frame,
// the rest keep their existing reference
4043 for (i = 0; i < 8; i++) {
4044 if (s->next_refs[i].f->data[0])
4045 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4046 if (s->refreshrefmask & (1 << i)) {
4047 res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
4048 } else if (s->refs[i].f->data[0]) {
4049 res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
4055 // main tile decode loop
// reset the above-edge (top row) contexts for the new frame
4056 bytesperpixel = s->bytesperpixel;
4057 memset(s->above_partition_ctx, 0, s->cols);
4058 memset(s->above_skip_ctx, 0, s->cols);
4059 if (s->keyframe || s->intraonly) {
4060 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4062 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4064 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4065 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4066 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4067 memset(s->above_segpred_ctx, 0, s->cols);
// two-pass decoding is used for frame threading unless the frame was coded
// in parallel mode (probabilities not adapted from counts)
4068 s->pass = s->frames[CUR_FRAME].uses_2pass =
4069 ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
4070 if ((res = update_block_buffers(ctx)) < 0) {
4071 av_log(ctx, AV_LOG_ERROR,
4072 "Failed to allocate block buffers\n");
// parallel mode: the frame context must be stored up front (no backward
// adaptation), which also lets dependent threads start early
4075 if (s->refreshctx && s->parallelmode) {
4078 for (i = 0; i < 4; i++) {
4079 for (j = 0; j < 2; j++)
4080 for (k = 0; k < 2; k++)
4081 for (l = 0; l < 6; l++)
4082 for (m = 0; m < 6; m++)
4083 memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
4084 s->prob.coef[i][j][k][l][m], 3);
4085 if (s->txfmmode == i)
4088 s->prob_ctx[s->framectxid].p = s->prob.p;
4089 ff_thread_finish_setup(ctx);
4090 } else if (!s->refreshctx) {
4091 ff_thread_finish_setup(ctx);
// (re)start of a decoding pass: rewind all coefficient/eob buffers
4097 s->block = s->block_base;
4098 s->uvblock[0] = s->uvblock_base[0];
4099 s->uvblock[1] = s->uvblock_base[1];
4100 s->eob = s->eob_base;
4101 s->uveob[0] = s->uveob_base[0];
4102 s->uveob[1] = s->uveob_base[1];
4104 for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
4105 set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
4106 tile_row, s->tiling.log2_tile_rows, s->sb_rows);
// one range decoder per tile column; each tile is prefixed by its size
// except the last tile, which uses the remaining packet bytes
4108 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4111 if (tile_col == s->tiling.tile_cols - 1 &&
4112 tile_row == s->tiling.tile_rows - 1) {
4115 tile_size = AV_RB32(data);
4119 if (tile_size > size) {
4120 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4121 return AVERROR_INVALIDDATA;
4123 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4124 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4125 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4126 return AVERROR_INVALIDDATA;
// decode one superblock row (64px high) across all tile columns
4133 for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
4134 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4135 struct VP9Filter *lflvl_ptr = s->lflvl;
4136 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4138 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4139 set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
4140 tile_col, s->tiling.log2_tile_cols, s->sb_cols);
// reset the left-edge contexts at the start of each tile
4143 memset(s->left_partition_ctx, 0, 8);
4144 memset(s->left_skip_ctx, 0, 8);
4145 if (s->keyframe || s->intraonly) {
4146 memset(s->left_mode_ctx, DC_PRED, 16);
4148 memset(s->left_mode_ctx, NEARESTMV, 8);
4150 memset(s->left_y_nnz_ctx, 0, 16);
4151 memset(s->left_uv_nnz_ctx, 0, 32);
4152 memset(s->left_segpred_ctx, 0, 8);
// swap in this tile's range-decoder state
4154 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4157 for (col = s->tiling.tile_col_start;
4158 col < s->tiling.tile_col_end;
4159 col += 8, yoff2 += 64 * bytesperpixel,
4160 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4161 // FIXME integrate with lf code (i.e. zero after each
4162 // use, similar to invtxfm coefficients, or similar)
4164 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// pass 2 replays stored block structure; pass 0/1 parses the bitstream
4168 decode_sb_mem(ctx, row, col, lflvl_ptr,
4169 yoff2, uvoff2, BL_64X64);
4171 decode_sb(ctx, row, col, lflvl_ptr,
4172 yoff2, uvoff2, BL_64X64);
// save the range-decoder state back for the next superblock row
4176 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4184 // backup pre-loopfilter reconstruction data for intra
4185 // prediction of next row of sb64s
4186 if (row + 8 < s->rows) {
4187 memcpy(s->intra_pred_data[0],
4188 f->data[0] + yoff + 63 * ls_y,
4189 8 * s->cols * bytesperpixel);
4190 memcpy(s->intra_pred_data[1],
4191 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4192 8 * s->cols * bytesperpixel >> s->ss_h);
4193 memcpy(s->intra_pred_data[2],
4194 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4195 8 * s->cols * bytesperpixel >> s->ss_h);
4198 // loopfilter one row
4199 if (s->filter.level) {
4202 lflvl_ptr = s->lflvl;
4203 for (col = 0; col < s->cols;
4204 col += 8, yoff2 += 64 * bytesperpixel,
4205 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4206 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4210 // FIXME maybe we can make this more finegrained by running the
4211 // loopfilter per-block instead of after each sbrow
4212 // In fact that would also make intra pred left preparation easier?
4213 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
// after pass 1 of a 2-pass decode: adapt probabilities, then release
// dependent threads before running pass 2
4217 if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
4219 ff_thread_finish_setup(ctx);
4221 } while (s->pass++ == 1);
4222 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
// promote next_refs to refs for the following frame
4225 for (i = 0; i < 8; i++) {
4226 if (s->refs[i].f->data[0])
4227 ff_thread_release_buffer(ctx, &s->refs[i]);
4228 ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
// only visible frames are returned to the caller
4231 if (!s->invisible) {
4232 if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
// Flush callback (e.g. on seek): drop all internal frames and all held
// reference-frame buffers without freeing the AVFrame shells themselves.
4240 static void vp9_decode_flush(AVCodecContext *ctx)
4242 VP9Context *s = ctx->priv_data;
4245 for (i = 0; i < 3; i++)
4246 vp9_unref_frame(ctx, &s->frames[i]);
4247 for (i = 0; i < 8; i++)
4248 ff_thread_release_buffer(ctx, &s->refs[i]);
// Allocate the AVFrame shells for the three internal frames and all eight
// reference slots (current and next generation). On any allocation failure
// everything already allocated is torn down via vp9_decode_free().
4251 static int init_frames(AVCodecContext *ctx)
4253 VP9Context *s = ctx->priv_data;
4256 for (i = 0; i < 3; i++) {
4257 s->frames[i].tf.f = av_frame_alloc();
4258 if (!s->frames[i].tf.f) {
4259 vp9_decode_free(ctx);
4260 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4261 return AVERROR(ENOMEM);
4264 for (i = 0; i < 8; i++) {
4265 s->refs[i].f = av_frame_alloc();
4266 s->next_refs[i].f = av_frame_alloc();
4267 if (!s->refs[i].f || !s->next_refs[i].f) {
4268 vp9_decode_free(ctx);
4269 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4270 return AVERROR(ENOMEM);
// Codec init: enable per-frame progress allocation for frame threading,
// mark the loop-filter sharpness as uninitialized (-1 forces the limit LUTs
// to be rebuilt on first use), and allocate the frame shells.
4277 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4279 VP9Context *s = ctx->priv_data;
4281 ctx->internal->allocate_progress = 1;
4283 s->filter.sharpness = -1;
4285 return init_frames(ctx);
// Frame-threading worker init: each thread copy only needs its own frame
// shells; all other state is synced via update_thread_context().
4288 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4290 return init_frames(avctx);
// Frame-threading sync: copy decoding state from the source thread context
// into dst — frame and reference buffers (by reference), plus the scalar and
// probability state the next frame's header parsing depends on.
4293 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4296 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4298 // detect size changes in other threads
4299 if (s->intra_pred_data[0] &&
4300 (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
// re-reference the source thread's internal frames
4304 for (i = 0; i < 3; i++) {
4305 if (s->frames[i].tf.f->data[0])
4306 vp9_unref_frame(dst, &s->frames[i]);
4307 if (ssrc->frames[i].tf.f->data[0]) {
4308 if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
// our refs become the source thread's next_refs (its post-frame state)
4312 for (i = 0; i < 8; i++) {
4313 if (s->refs[i].f->data[0])
4314 ff_thread_release_buffer(dst, &s->refs[i]);
4315 if (ssrc->next_refs[i].f->data[0]) {
4316 if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
// scalar state needed to parse the next frame header
4321 s->invisible = ssrc->invisible;
4322 s->keyframe = ssrc->keyframe;
4323 s->ss_v = ssrc->ss_v;
4324 s->ss_h = ssrc->ss_h;
4325 s->segmentation.enabled = ssrc->segmentation.enabled;
4326 s->segmentation.update_map = ssrc->segmentation.update_map;
4327 s->bytesperpixel = ssrc->bytesperpixel;
4329 s->bpp_index = ssrc->bpp_index;
// probability contexts and loop-filter deltas carry across frames
4330 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4331 memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4332 if (ssrc->segmentation.enabled) {
4333 memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4334 sizeof(s->segmentation.feat));
// Supported VP9 profiles, terminated by FF_PROFILE_UNKNOWN as lavc requires.
4340 static const AVProfile profiles[] = {
4341     { FF_PROFILE_VP9_0, "Profile 0" },
4342     { FF_PROFILE_VP9_1, "Profile 1" },
4343     { FF_PROFILE_VP9_2, "Profile 2" },
4344     { FF_PROFILE_VP9_3, "Profile 3" },
4345     { FF_PROFILE_UNKNOWN },
// Decoder registration: direct rendering and frame-threading capable, with
// the thread-copy/update callbacks compiled in only when threading is enabled.
4348 AVCodec ff_vp9_decoder = {
4350     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4351     .type                  = AVMEDIA_TYPE_VIDEO,
4352     .id                    = AV_CODEC_ID_VP9,
4353     .priv_data_size        = sizeof(VP9Context),
4354     .init                  = vp9_decode_init,
4355     .close                 = vp9_decode_free,
4356     .decode                = vp9_decode_frame,
4357     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4358     .flush                 = vp9_decode_flush,
4359     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4360     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4361     .profiles              = NULL_IF_CONFIG_SMALL(profiles),