2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 #include "libavutil/avassert.h"
34 #include "libavutil/pixdesc.h"
36 #define VP9_SYNCCODE 0x498342
73 typedef struct VP9Frame {
    // single refcounted buffer backing both segmentation_map and mv
    // (allocated together in vp9_alloc_frame())
75     AVBufferRef *extradata;
    // one segment id per 8x8 block; points into extradata->data
76     uint8_t *segmentation_map;
    // per-block motion-vector/reference pairs; placed right after the
    // segmentation map inside extradata (see vp9_alloc_frame())
77     struct VP9mvrefPair *mv;
    // NOTE(review): the mask[] member below appears to belong to a separate
    // loop-filter struct whose opening declaration is not visible in this
    // chunk — confirm against the full file before relying on it here.
83     uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
84 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
87 typedef struct VP9Block {
    // per-block mode decisions parsed from the bitstream:
    // segment id, intra/inter flag, compound flag, up to 2 references,
    // up to 4 sub-block modes, chroma mode, skip flag
88     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
89     enum FilterMode filter;
    // up to 4 sub-block motion vectors, each with up to 2 refs (compound)
90     VP56mv mv[4 /* b_idx */][2 /* ref */];
    // luma and chroma transform sizes for this block
92     enum TxfmMode tx, uvtx;
    // partition type of the enclosing split
94     enum BlockPartition bp;
97 typedef struct VP9Context {
    // NOTE(review): this chunk shows only a subset of the struct's members;
    // fields between the visible ones are elided from this view.
    // per-block decode state: base of the block array and current block
    // (allocated in update_block_buffers())
104     VP9Block *b_base, *b;
    // current position in 8x8-block units; row7/col7 presumably row&7/col&7
    // within the superblock — confirm against the full file
106     int row, row7, col, col7;
108     ptrdiff_t y_stride, uv_stride;
    // frame header state (parsed in decode_frame_header())
111     uint8_t keyframe, last_keyframe;
    // bpp comes from read_colorspace_details(); last_bpp detects a bit-depth
    // change so update_size() can re-init the DSP contexts
112     uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
114     uint8_t use_last_frame_mvs;
    // 8-bit mask of reference slots refreshed by this frame
119     uint8_t refreshrefmask;
120     uint8_t highprecisionmvs;
121     enum FilterMode filtermode;
    // set when reference sign biases differ, enabling compound prediction
122     uint8_t allowcompinter;
125     uint8_t parallelmode;
    // the two variable compound references derived from the sign biases
129     uint8_t varcompref[2];
    // 8 reference slots; next_refs receives the post-decode slot assignment
130     ThreadFrame refs[8], next_refs[8];
132 #define REF_FRAME_MVPAIR 1
133 #define REF_FRAME_SEGMAP 2
    // loop-filter limit LUT; invalidated when sharpness changes
140     uint8_t mblim_lut[64];
    // signed quantizer deltas relative to yac_qi
148     int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
150 #define MAX_SEGMENT 8
154     uint8_t absolute_vals;
156     uint8_t ignore_refmap;
161     uint8_t skip_enabled;
    // tiling layout derived from the uncompressed header
170     unsigned log2_tile_cols, log2_tile_rows;
171     unsigned tile_cols, tile_rows;
172     unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    // frame dimensions in 64x64 superblocks (sb_*) and 8x8 blocks (rows/cols)
174     unsigned sb_cols, sb_rows, rows, cols;
    // NOTE(review): the two coef[] declarations below have different inner
    // dimensions ([3] vs [11]) and belong to different nested structs
    // (saved probability contexts vs. the active probability set) whose
    // enclosing declarations are elided from this chunk.
177     uint8_t coef[4][2][2][6][6][3];
181     uint8_t coef[4][2][2][6][6][11];
    // symbol counts gathered during decode for backward probability adaptation
186     unsigned y_mode[4][10];
187     unsigned uv_mode[10][10];
188     unsigned filter[4][3];
189     unsigned mv_mode[7][4];
190     unsigned intra[4][2];
192     unsigned single_ref[5][2][2];
193     unsigned comp_ref[5][2];
194     unsigned tx32p[2][4];
195     unsigned tx16p[2][3];
198     unsigned mv_joint[4];
201     unsigned classes[11];
203     unsigned bits[10][2];
204     unsigned class0_fp[2][4];
206     unsigned class0_hp[2];
209     unsigned partition[4][4][4];
210     unsigned coef[4][2][2][6][6][3];
211     unsigned eob[4][2][2][6][6][2];
213     enum TxfmMode txfmmode;
214     enum CompPredMode comppredmode;
216     // contextual (left/above) cache
217     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
218     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
219     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
220     DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
221     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
222     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
223     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
224     DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
225     DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
226     DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
227     DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
228     DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // the above_* pointers all point into one shared allocation carved up by
    // the assign() macro in update_size()
229     uint8_t *above_partition_ctx;
230     uint8_t *above_mode_ctx;
231     // FIXME maybe merge some of the below in a flags field?
232     uint8_t *above_y_nnz_ctx;
233     uint8_t *above_uv_nnz_ctx[2];
234     uint8_t *above_skip_ctx; // 1bit
235     uint8_t *above_txfm_ctx; // 2bit
236     uint8_t *above_segpred_ctx; // 1bit
237     uint8_t *above_intra_ctx; // 1bit
238     uint8_t *above_comp_ctx; // 1bit
239     uint8_t *above_ref_ctx; // 2bit
240     uint8_t *above_filter_ctx;
241     VP56mv (*above_mv_ctx)[2];
    // whole-frame cache: top row of reconstructed pixels per plane
244     uint8_t *intra_pred_data[3];
245     struct VP9Filter *lflvl;
246     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
248     // block reconstruction intermediates
249     int block_alloc_using_2pass;
250     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
251     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    // mv clamping range for the current block (used by clamp_mv())
252     struct { int x, y; } min_mv, max_mv;
253     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
254     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    // per-reference 14-bit fixed-point mv scaling factors and integer steps
    // (set in decode_frame_header() for scaled references)
255     uint16_t mvscale[3][2];
256     uint8_t mvstep[3][2];
// Block {width, height} per BS_* size (BS_64x64 first):
// bwh_tab[0] is in units of 4 pixels, bwh_tab[1] in units of 8 pixels
// (sub-8x8 sizes clamp to 1 in the second table).
259 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
261     { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
262     { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
264     { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
265     { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
// Allocate the frame buffer plus the per-frame side data (segmentation map
// and mv/ref pairs) in one refcounted extradata buffer.
// Returns 0 on success, a negative AVERROR on failure.
269 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
271     VP9Context *s = ctx->priv_data;
    // get the AVFrame via the frame-threading-aware allocator
274     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
    // one segmentation-map byte per 8x8 block: 64 per 64x64 superblock
276     sz = 64 * s->sb_cols * s->sb_rows;
    // single zeroed allocation backing both the map and the mv array
277     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
        // undo the frame allocation on OOM so the caller sees a clean state
278         ff_thread_release_buffer(ctx, &f->tf);
279         return AVERROR(ENOMEM);
    // carve the two views out of the shared buffer
282     f->segmentation_map = f->extradata->data;
283     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
// Release a VP9Frame: drop the frame buffer and the side-data buffer.
// Unreffing extradata also invalidates segmentation_map/mv, which point
// into it, so the map pointer is cleared explicitly.
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
290     ff_thread_release_buffer(ctx, &f->tf);
291     av_buffer_unref(&f->extradata);
292     f->segmentation_map = NULL;
// Make dst a new reference to src: ref the frame, then ref the shared
// extradata buffer, then copy the raw side-data pointers (they point into
// the refcounted extradata, so sharing them is safe).
// Returns 0 on success, a negative AVERROR on failure.
295 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
299     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
301     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
        // buffer ref failed: roll back the frame ref taken above
302         vp9_unref_frame(ctx, dst);
303         return AVERROR(ENOMEM);
306     dst->segmentation_map = src->segmentation_map;
308     dst->uses_2pass = src->uses_2pass;
// (Re)initialize all size-dependent decoder state for a w x h frame in
// pixel format fmt: superblock/block counts, the shared "above" context
// allocation, and (on bit-depth change) the DSP contexts.
// Returns 0 on success or a negative AVERROR.
313 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
315     VP9Context *s = ctx->priv_data;
317     int bytesperpixel = s->bytesperpixel;
319     av_assert0(w > 0 && h > 0);
    // nothing to do when size and pixel format are unchanged and the
    // context allocation already exists
321     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
    // derived dimensions: 64x64 superblocks and 8x8 blocks, rounded up
327     s->sb_cols = (w + 63) >> 6;
328     s->sb_rows = (h + 63) >> 6;
329     s->cols = (w + 7) >> 3;
330     s->rows = (h + 7) >> 3;
    // carve per-sb-column slices for each above_* array out of one malloc;
    // 'n' is the per-superblock-column element count
332 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
333     av_freep(&s->intra_pred_data[0]);
334     // FIXME we slightly over-allocate here for subsampled chroma, but a little
335     // bit of padding shouldn't affect performance...
336     p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
337 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
339         return AVERROR(ENOMEM);
340     assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
341     assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
342     assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
343     assign(s->above_y_nnz_ctx, uint8_t *, 16);
344     assign(s->above_mode_ctx, uint8_t *, 16);
345     assign(s->above_mv_ctx, VP56mv(*)[2], 16);
346     assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
347     assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
348     assign(s->above_partition_ctx, uint8_t *, 8);
349     assign(s->above_skip_ctx, uint8_t *, 8);
350     assign(s->above_txfm_ctx, uint8_t *, 8);
351     assign(s->above_segpred_ctx, uint8_t *, 8);
352     assign(s->above_intra_ctx, uint8_t *, 8);
353     assign(s->above_comp_ctx, uint8_t *, 8);
354     assign(s->above_ref_ctx, uint8_t *, 8);
355     assign(s->above_filter_ctx, uint8_t *, 8);
356     assign(s->lflvl, struct VP9Filter *, 1);
359     // these will be re-allocated a little later
360     av_freep(&s->b_base);
361     av_freep(&s->block_base);
    // bit depth changed: the DSP function tables depend on bpp
363     if (s->bpp != s->last_bpp) {
364         ff_vp9dsp_init(&s->dsp, s->bpp);
365         ff_videodsp_init(&s->vdsp, s->bpp);
366         s->last_bpp = s->bpp;
// (Re)allocate the per-block decode buffers (VP9Block array, coefficient
// blocks, EOB arrays). In 2-pass mode every superblock needs its own slot;
// in 1-pass mode a single superblock's worth is reused.
// Returns 0 on success or AVERROR(ENOMEM).
372 static int update_block_buffers(AVCodecContext *ctx)
374     VP9Context *s = ctx->priv_data;
375     int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
    // keep existing buffers when they exist and match the current pass mode
377     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
381     av_free(s->block_base);
    // chroma sizes shrink with each active subsampling dimension
382     chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
383     chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
384     if (s->frames[CUR_FRAME].uses_2pass) {
385         int sbs = s->sb_cols * s->sb_rows;
        // 2-pass: one VP9Block per 8x8 block, one coef/eob slab per superblock
387         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
388         s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
389 16 * 16 + 2 * chroma_eobs) * sbs);
390         if (!s->b_base || !s->block_base)
391             return AVERROR(ENOMEM);
        // carve luma/chroma coefficient and EOB regions out of block_base
392         s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
393         s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
394         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
395         s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
396         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
        // 1-pass: a single reusable block/coef/eob slab
398         s->b_base = av_malloc(sizeof(VP9Block));
399         s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
400 16 * 16 + 2 * chroma_eobs);
401         if (!s->b_base || !s->block_base)
402             return AVERROR(ENOMEM);
403         s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
404         s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
405         s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
406         s->uveob_base[0] = s->eob_base + 16 * 16;
407         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
    // remember which layout the allocation used for the early-out above
409     s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
414 // for some reason the sign bit is at the end, not the start, of a bit sequence
415 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
417 int v = get_bits(gb, n);
418 return get_bits1(gb) ? -v : v;
421 static av_always_inline int inv_recenter_nonneg(int v, int m)
423 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
426 // differential forward probability updates
// Decode a differentially-coded probability update relative to the current
// probability p (range [1,255]) and return the new probability.
427 static int update_prob(VP56RangeCoder *c, int p)
    // maps the decoded VLC index back to an absolute delta; the first 20
    // entries are the 'cheap, rough' update values (see comment below)
429     static const int inv_map_table[255] = {
430         7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
431         189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
432         10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
433         25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
434         40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
435         55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
436         70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
437         86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
438         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
439         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
440         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
441         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
442         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
443         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
444         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
445         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
446         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
447         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
452     /* This code is trying to do a differential probability update. For a
453      * current probability A in the range [1, 255], the difference to a new
454      * probability of any value can be expressed differentially as 1-A,255-A
455      * where some part of this (absolute range) exists both in positive as
456      * well as the negative part, whereas another part only exists in one
457      * half. We're trying to code this shared part differentially, i.e.
458      * times two where the value of the lowest bit specifies the sign, and
459      * the single part is then coded on top of this. This absolute difference
460      * then again has a value of [0,254], but a bigger value in this range
461      * indicates that we're further away from the original value A, so we
462      * can code this as a VLC code, since higher values are increasingly
463      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
464      * updates vs. the 'fine, exact' updates further down the range, which
465      * adds one extra dimension to this differential update model. */
    // read the table index 'd' with a 3-level escape code:
    // 4-bit [0,15], 4-bit [16,31], 5-bit [32,63], then a 7-bit tail
467     if (!vp8_rac_get(c)) {
468         d = vp8_rac_get_uint(c, 4) + 0;
469     } else if (!vp8_rac_get(c)) {
470         d = vp8_rac_get_uint(c, 4) + 16;
471     } else if (!vp8_rac_get(c)) {
472         d = vp8_rac_get_uint(c, 5) + 32;
        // last level: 7-bit value expanded by one extra bit to cover the
        // remaining index range
474         d = vp8_rac_get_uint(c, 7);
476         d = (d << 1) - 65 + vp8_rac_get(c);
478     av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
    // recenter around p, mirroring for the upper half so the result stays
    // within [1,255]
481     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
482 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse the colorspace/bit-depth/subsampling part of the frame header and
// derive the AVPixelFormat. Also sets s->bpp/bytesperpixel/ss_h/ss_v and
// ctx->colorspace/color_range. Returns the pixel format or a negative
// AVERROR on invalid combinations.
485 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
487     static const enum AVColorSpace colorspaces[8] = {
488         AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
489         AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
491     VP9Context *s = ctx->priv_data;
492     enum AVPixelFormat res;
    // profiles 0/1 are 8-bit only; profiles 2/3 signal 10 or 12 bit here
493     int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
496     s->bpp = 8 + bits * 2;
497     s->bytesperpixel = (7 + s->bpp) >> 3;
498     ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
499     if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
500         static const enum AVPixelFormat pix_fmt_rgb[3] = {
501             AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
        // RGB is only valid in odd profiles (1 and 3); always full range,
        // never subsampled
503         if (ctx->profile & 1) {
504             s->ss_h = s->ss_v = 0;
505             res = pix_fmt_rgb[bits];
506             ctx->color_range = AVCOL_RANGE_JPEG;
507             if (get_bits1(&s->gb)) {
508                 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
509                 return AVERROR_INVALIDDATA;
512             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
514             return AVERROR_INVALIDDATA;
        // YUV path: pick the format from bit depth and subsampling flags
517         static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
518             { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
519               { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
520             { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
521               { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
522             { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
523               { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
525         ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
        // odd profiles code explicit subsampling; 4:2:0 is reserved there
526         if (ctx->profile & 1) {
527             s->ss_h = get_bits1(&s->gb);
528             s->ss_v = get_bits1(&s->gb);
529             if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
530                 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
532                 return AVERROR_INVALIDDATA;
533             } else if (get_bits1(&s->gb)) {
534                 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
536                 return AVERROR_INVALIDDATA;
            // even profiles are implicitly 4:2:0
539             s->ss_h = s->ss_v = 1;
540             res = pix_fmt_for_ss[bits][1][1];
/**
 * Parse the VP9 uncompressed frame header and the arithmetic-coded
 * (compressed) probability-update header.
 *
 * On success returns the total header size in bytes ((data2 - data) + size2);
 * on failure returns a negative AVERROR. When the bitstream signals
 * "show existing frame", *ref is set to the reference slot to display
 * (the early return for that path is not visible in this chunk — confirm
 * against the full file).
 */
547 static int decode_frame_header(AVCodecContext *ctx,
548 const uint8_t *data, int size, int *ref)
550     VP9Context *s = ctx->priv_data;
551     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
552     enum AVPixelFormat fmt = ctx->pix_fmt;
554     const uint8_t *data2;
    /* general header */
557     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
558         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
561     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
562         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
563         return AVERROR_INVALIDDATA;
    // profile is two bits, plus one extra bit when both are set (profile 3);
    // anything above 3 is reserved
565     ctx->profile = get_bits1(&s->gb);
566     ctx->profile |= get_bits1(&s->gb) << 1;
567     if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
568     if (ctx->profile > 3) {
569         av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
570         return AVERROR_INVALIDDATA;
    // "show existing frame": directly display reference slot *ref
572     if (get_bits1(&s->gb)) {
573         *ref = get_bits(&s->gb, 3);
576     s->last_keyframe = s->keyframe;
577     s->keyframe = !get_bits1(&s->gb);
578     last_invisible = s->invisible;
579     s->invisible = !get_bits1(&s->gb);
580     s->errorres = get_bits1(&s->gb);
    // last-frame mvs are only usable without error-resilience and when the
    // previous frame was shown
581     s->use_last_frame_mvs = !s->errorres && !last_invisible;
    /* keyframe path: sync code, colorspace, coded size */
583         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
584             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
585             return AVERROR_INVALIDDATA;
587         if ((fmt = read_colorspace_details(ctx)) < 0)
589         // for profile 1, here follows the subsampling bits
590         s->refreshrefmask = 0xff;
591         w = get_bits(&s->gb, 16) + 1;
592         h = get_bits(&s->gb, 16) + 1;
593         if (get_bits1(&s->gb)) // display size
594             skip_bits(&s->gb, 32);
    /* non-keyframe path */
596         s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
597         s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
        /* intra-only frame: sync code plus (profile >= 1) explicit
         * colorspace; profile 0 intra-only is implicitly 8-bit 4:2:0 */
599             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
600                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
601                 return AVERROR_INVALIDDATA;
603             if (ctx->profile >= 1) {
604                 if ((fmt = read_colorspace_details(ctx)) < 0)
607                 s->ss_h = s->ss_v = 1;
610                 s->bytesperpixel = 1;
611                 fmt = AV_PIX_FMT_YUV420P;
612                 ctx->colorspace = AVCOL_SPC_BT470BG;
613                 ctx->color_range = AVCOL_RANGE_JPEG;
615             s->refreshrefmask = get_bits(&s->gb, 8);
616             w = get_bits(&s->gb, 16) + 1;
617             h = get_bits(&s->gb, 16) + 1;
618             if (get_bits1(&s->gb)) // display size
619                 skip_bits(&s->gb, 32);
        /* inter frame: three references with per-reference sign bias */
621             s->refreshrefmask = get_bits(&s->gb, 8);
622             s->refidx[0] = get_bits(&s->gb, 3);
623             s->signbias[0] = get_bits1(&s->gb) && !s->errorres;
624             s->refidx[1] = get_bits(&s->gb, 3);
625             s->signbias[1] = get_bits1(&s->gb) && !s->errorres;
626             s->refidx[2] = get_bits(&s->gb, 3);
627             s->signbias[2] = get_bits1(&s->gb) && !s->errorres;
628             if (!s->refs[s->refidx[0]].f->data[0] ||
629 !s->refs[s->refidx[1]].f->data[0] ||
630 !s->refs[s->refidx[2]].f->data[0]) {
631                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
632                 return AVERROR_INVALIDDATA;
            // frame size: either copied from one of the references
            // (one flag per ref) or coded explicitly
634             if (get_bits1(&s->gb)) {
635                 w = s->refs[s->refidx[0]].f->width;
636                 h = s->refs[s->refidx[0]].f->height;
637             } else if (get_bits1(&s->gb)) {
638                 w = s->refs[s->refidx[1]].f->width;
639                 h = s->refs[s->refidx[1]].f->height;
640             } else if (get_bits1(&s->gb)) {
641                 w = s->refs[s->refidx[2]].f->width;
642                 h = s->refs[s->refidx[2]].f->height;
644                 w = get_bits(&s->gb, 16) + 1;
645                 h = get_bits(&s->gb, 16) + 1;
647             // Note that in this code, "CUR_FRAME" is actually before we
648             // have formally allocated a frame, and thus actually represents
650             s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
651 s->frames[CUR_FRAME].tf.f->height == h;
652             if (get_bits1(&s->gb)) // display size
653                 skip_bits(&s->gb, 32);
654             s->highprecisionmvs = get_bits1(&s->gb);
655             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
            // compound prediction is only possible when the references do
            // not all share the same sign bias
657             s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
658 s->signbias[0] != s->signbias[2]);
659             if (s->allowcompinter) {
660                 if (s->signbias[0] == s->signbias[1]) {
662                     s->varcompref[0] = 0;
663                     s->varcompref[1] = 1;
664                 } else if (s->signbias[0] == s->signbias[2]) {
666                     s->varcompref[0] = 0;
667                     s->varcompref[1] = 2;
670                     s->varcompref[0] = 1;
671                     s->varcompref[1] = 2;
            // per-reference scaling: 14-bit fixed-point ratios plus integer
            // step sizes; refs must be within [w/16, 2w] x [h/16, 2h]
675             for (i = 0; i < 3; i++) {
676                 AVFrame *ref = s->refs[s->refidx[i]].f;
677                 int refw = ref->width, refh = ref->height;
679                 if (ref->format != fmt) {
680                     av_log(ctx, AV_LOG_ERROR,
681                            "Ref pixfmt (%s) did not match current frame (%s)",
682                            av_get_pix_fmt_name(ref->format),
683                            av_get_pix_fmt_name(fmt));
684                     return AVERROR_INVALIDDATA;
685                 } else if (refw == w && refh == h) {
686                     s->mvscale[i][0] = s->mvscale[i][1] = 0;
688                     if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
689                         av_log(ctx, AV_LOG_ERROR,
690                                "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
692                         return AVERROR_INVALIDDATA;
694                     s->mvscale[i][0] = (refw << 14) / w;
695                     s->mvscale[i][1] = (refh << 14) / h;
696                     s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
697                     s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
    // error-resilient frames force no context refresh and parallel mode
702     s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
703     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
704     s->framectxid = c = get_bits(&s->gb, 2);
706     /* loopfilter header data */
707     if (s->keyframe || s->errorres || s->intraonly) {
708         // reset loopfilter defaults
709         s->lf_delta.ref[0] = 1;
710         s->lf_delta.ref[1] = 0;
711         s->lf_delta.ref[2] = -1;
712         s->lf_delta.ref[3] = -1;
713         s->lf_delta.mode[0] = 0;
714         s->lf_delta.mode[1] = 0;
715         memset(s->segmentation.feat, 0, sizeof(s->segmentation.feat));
717     s->filter.level = get_bits(&s->gb, 6);
718     sharp = get_bits(&s->gb, 3);
719     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
720     // the old cache values since they are still valid
721     if (s->filter.sharpness != sharp)
722         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
723     s->filter.sharpness = sharp;
724     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
725         if (get_bits1(&s->gb)) {
726             for (i = 0; i < 4; i++)
727                 if (get_bits1(&s->gb))
728                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
729             for (i = 0; i < 2; i++)
730                 if (get_bits1(&s->gb))
731                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
735     /* quantization header data */
736     s->yac_qi = get_bits(&s->gb, 8);
737     s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
738     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
739     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    // lossless mode is signalled implicitly by an all-zero quantizer set
740     s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
741 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
743         ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
745     /* segmentation header info */
746     s->segmentation.ignore_refmap = 0;
747     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
748         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
            // 255 means "use the default probability" for tree/pred probs
749             for (i = 0; i < 7; i++)
750                 s->prob.seg[i] = get_bits1(&s->gb) ?
751 get_bits(&s->gb, 8) : 255;
752             if ((s->segmentation.temporal = get_bits1(&s->gb))) {
753                 for (i = 0; i < 3; i++)
754                     s->prob.segpred[i] = get_bits1(&s->gb) ?
755 get_bits(&s->gb, 8) : 255;
        // a reference segmap cannot be reused across a size change; decode
        // continues with the reference map ignored instead of erroring out
758         if ((!s->segmentation.update_map || s->segmentation.temporal) &&
759 (w != s->frames[CUR_FRAME].tf.f->width ||
760 h != s->frames[CUR_FRAME].tf.f->height)) {
761             av_log(ctx, AV_LOG_WARNING,
762                    "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
763                    s->segmentation.temporal, s->segmentation.update_map);
764             s->segmentation.ignore_refmap = 1;
765             //return AVERROR_INVALIDDATA;
        // per-segment feature data (quantizer, loopfilter, ref, skip)
768         if (get_bits1(&s->gb)) {
769             s->segmentation.absolute_vals = get_bits1(&s->gb);
770             for (i = 0; i < 8; i++) {
771                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
772                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
773                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
774                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
775                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
776                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
777                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
782     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
783     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
784         int qyac, qydc, quvac, quvdc, lflvl, sh;
786         if (s->segmentation.enabled && s->segmentation.feat[i].q_enabled) {
787             if (s->segmentation.absolute_vals)
788                 qyac = av_clip_uintp2(s->segmentation.feat[i].q_val, 8);
790                 qyac = av_clip_uintp2(s->yac_qi + s->segmentation.feat[i].q_val, 8);
794         qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
795         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
796         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
797         qyac = av_clip_uintp2(qyac, 8);
        // look up the actual quantizer multipliers for the current bit depth
799         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
800         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
801         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
802         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
        // delta shift: deltas are scaled up when the base filter level >= 32
804         sh = s->filter.level >= 32;
805         if (s->segmentation.enabled && s->segmentation.feat[i].lf_enabled) {
806             if (s->segmentation.absolute_vals)
807                 lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
809                 lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
811             lflvl = s->filter.level;
813         if (s->lf_delta.enabled) {
814             s->segmentation.feat[i].lflvl[0][0] =
815             s->segmentation.feat[i].lflvl[0][1] =
816                 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
817             for (j = 1; j < 4; j++) {
818                 s->segmentation.feat[i].lflvl[j][0] =
819                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
820 s->lf_delta.mode[0]) * (1 << sh)), 6);
821                 s->segmentation.feat[i].lflvl[j][1] =
822                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
823 s->lf_delta.mode[1]) * (1 << sh)), 6);
826             memset(s->segmentation.feat[i].lflvl, lflvl,
827                    sizeof(s->segmentation.feat[i].lflvl));
    /* tiling info */
832     if ((res = update_size(ctx, w, h, fmt)) < 0) {
833         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
    // minimum tile-column count keeps each tile <= 64 superblocks wide;
    // maximum keeps each tile >= 4 superblocks wide
836     for (s->tiling.log2_tile_cols = 0;
837          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
838          s->tiling.log2_tile_cols++) ;
839     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
840     max = FFMAX(0, max - 1);
    // increment log2_tile_cols one flag bit at a time up to the maximum
841     while (max > s->tiling.log2_tile_cols) {
842         if (get_bits1(&s->gb))
843             s->tiling.log2_tile_cols++;
847     s->tiling.log2_tile_rows = decode012(&s->gb);
848     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
    // tile-column count changed: grow the per-tile range coder array
849     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
850         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
851         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
852                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
854             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
855             return AVERROR(ENOMEM);
    // context reset: all four contexts on keyframe/errorres/resetctx==3,
    // only the current context on intra-only with resetctx==2
859     if (s->keyframe || s->errorres || (s->intraonly && s->resetctx == 3)) {
860         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
861                            s->prob_ctx[3].p = vp9_default_probs;
862         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
863                sizeof(vp9_default_coef_probs));
864         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
865                sizeof(vp9_default_coef_probs));
866         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
867                sizeof(vp9_default_coef_probs));
868         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
869                sizeof(vp9_default_coef_probs));
870     } else if (s->intraonly && s->resetctx == 2) {
871         s->prob_ctx[c].p = vp9_default_probs;
872         memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
873                sizeof(vp9_default_coef_probs));
876     // next 16 bits is size of the rest of the header (arith-coded)
877     size2 = get_bits(&s->gb, 16);
878     data2 = align_get_bits(&s->gb);
879     if (size2 > size - (data2 - data)) {
880         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
881         return AVERROR_INVALIDDATA;
883     ff_vp56_init_range_decoder(&s->c, data2, size2);
884     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
885         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
886         return AVERROR_INVALIDDATA;
    // reset adaptation counters: intra frames only gather coef/eob counts
889     if (s->keyframe || s->intraonly) {
890         memset(s->counts.coef, 0, sizeof(s->counts.coef));
891         memset(s->counts.eob, 0, sizeof(s->counts.eob));
893         memset(&s->counts, 0, sizeof(s->counts));
895     // FIXME is it faster to not copy here, but do it down in the fw updates
896     // as explicit copies if the fw update is missing (and skip the copy upon
898     s->prob.p = s->prob_ctx[c].p;
    // txfm updates
902         s->txfmmode = TX_4X4;
904         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
905         if (s->txfmmode == 3)
906             s->txfmmode += vp8_rac_get(&s->c);
908         if (s->txfmmode == TX_SWITCHABLE) {
            // each update is gated on a probability-252 branch bit
909             for (i = 0; i < 2; i++)
910                 if (vp56_rac_get_prob_branchy(&s->c, 252))
911                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
912             for (i = 0; i < 2; i++)
913                 for (j = 0; j < 2; j++)
914                     if (vp56_rac_get_prob_branchy(&s->c, 252))
915                         s->prob.p.tx16p[i][j] =
916                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
917             for (i = 0; i < 2; i++)
918                 for (j = 0; j < 3; j++)
919                     if (vp56_rac_get_prob_branchy(&s->c, 252))
920                         s->prob.p.tx32p[i][j] =
921                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
    // coef updates, one set per transform size up to txfmmode
926     for (i = 0; i < 4; i++) {
927         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
928         if (vp8_rac_get(&s->c)) {
929             for (j = 0; j < 2; j++)
930                 for (k = 0; k < 2; k++)
931                     for (l = 0; l < 6; l++)
932                         for (m = 0; m < 6; m++) {
933                             uint8_t *p = s->prob.coef[i][j][k][l][m];
934                             uint8_t *r = ref[j][k][l][m];
935                             if (m >= 3 && l == 0) // dc only has 3 pt
937                             for (n = 0; n < 3; n++) {
938                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
939                                     p[n] = update_prob(&s->c, r[n]);
            // no update flag: copy the saved context probabilities as-is
947             for (j = 0; j < 2; j++)
948                 for (k = 0; k < 2; k++)
949                     for (l = 0; l < 6; l++)
950                         for (m = 0; m < 6; m++) {
951                             uint8_t *p = s->prob.coef[i][j][k][l][m];
952                             uint8_t *r = ref[j][k][l][m];
953                             if (m > 3 && l == 0) // dc only has 3 pt
959         if (s->txfmmode == i)
    // mode updates
964     for (i = 0; i < 3; i++)
965         if (vp56_rac_get_prob_branchy(&s->c, 252))
966             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    // the remaining updates only apply to inter frames
967     if (!s->keyframe && !s->intraonly) {
968         for (i = 0; i < 7; i++)
969             for (j = 0; j < 3; j++)
970                 if (vp56_rac_get_prob_branchy(&s->c, 252))
971                     s->prob.p.mv_mode[i][j] =
972                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
974         if (s->filtermode == FILTER_SWITCHABLE)
975             for (i = 0; i < 4; i++)
976                 for (j = 0; j < 2; j++)
977                     if (vp56_rac_get_prob_branchy(&s->c, 252))
978                         s->prob.p.filter[i][j] =
979                             update_prob(&s->c, s->prob.p.filter[i][j]);
981         for (i = 0; i < 4; i++)
982             if (vp56_rac_get_prob_branchy(&s->c, 252))
983                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
985         if (s->allowcompinter) {
986             s->comppredmode = vp8_rac_get(&s->c);
988                 s->comppredmode += vp8_rac_get(&s->c);
989             if (s->comppredmode == PRED_SWITCHABLE)
990                 for (i = 0; i < 5; i++)
991                     if (vp56_rac_get_prob_branchy(&s->c, 252))
993                             update_prob(&s->c, s->prob.p.comp[i]);
995             s->comppredmode = PRED_SINGLEREF;
998         if (s->comppredmode != PRED_COMPREF) {
999             for (i = 0; i < 5; i++) {
1000                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1001                     s->prob.p.single_ref[i][0] =
1002                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
1003                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1004                     s->prob.p.single_ref[i][1] =
1005                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
1009         if (s->comppredmode != PRED_SINGLEREF) {
1010             for (i = 0; i < 5; i++)
1011                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1012                     s->prob.p.comp_ref[i] =
1013                         update_prob(&s->c, s->prob.p.comp_ref[i]);
1016         for (i = 0; i < 4; i++)
1017             for (j = 0; j < 9; j++)
1018                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1019                     s->prob.p.y_mode[i][j] =
1020                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
1022         for (i = 0; i < 4; i++)
1023             for (j = 0; j < 4; j++)
1024                 for (k = 0; k < 3; k++)
1025                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1026                         s->prob.p.partition[3 - i][j][k] =
1027                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1029         // mv fields don't use the update_prob subexp model for some reason
1030         for (i = 0; i < 3; i++)
1031             if (vp56_rac_get_prob_branchy(&s->c, 252))
1032                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1034         for (i = 0; i < 2; i++) {
1035             if (vp56_rac_get_prob_branchy(&s->c, 252))
1036                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1038             for (j = 0; j < 10; j++)
1039                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1040                     s->prob.p.mv_comp[i].classes[j] =
1041                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1043             if (vp56_rac_get_prob_branchy(&s->c, 252))
1044                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1046             for (j = 0; j < 10; j++)
1047                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1048                     s->prob.p.mv_comp[i].bits[j] =
1049                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1052         for (i = 0; i < 2; i++) {
1053             for (j = 0; j < 2; j++)
1054                 for (k = 0; k < 3; k++)
1055                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1056                         s->prob.p.mv_comp[i].class0_fp[j][k] =
1057                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1059             for (j = 0; j < 3; j++)
1060                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1061                     s->prob.p.mv_comp[i].fp[j] =
1062                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        // high-precision mv probabilities are only coded when enabled
1065         if (s->highprecisionmvs) {
1066             for (i = 0; i < 2; i++) {
1067                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1068                     s->prob.p.mv_comp[i].class0_hp =
1069                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1071                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1072                     s->prob.p.mv_comp[i].hp =
1073                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
    // total bytes consumed: uncompressed header plus compressed header
1078     return (data2 - data) + size2;
// Clamp a motion vector component-wise into the currently allowed search
// range (s->min_mv / s->max_mv, set up per block elsewhere in the decoder).
// NOTE(review): the excerpt is missing the trailing parameter line of the
// signature (presumably "VP9Context *s)") and the function braces — confirm
// against the full file before relying on this view.
1081 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1084 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1085 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Build the motion-vector prediction (pmv) for reference frame 'ref':
// scan spatial neighbours (above/left contexts and the mv_ref_blk_off
// offsets), then the co-located position in the previous frame, first for
// candidates using the same reference frame, then (sign-flipped via
// RETURN_SCALE_MV) for candidates using a different one.  The RETURN_*
// macros exit the function as soon as enough distinct candidates are found.
// NOTE(review): this excerpt is missing interleaved lines (macro bodies,
// braces, else branches); comments below only describe what is visible.
1088 static void find_ref_mvs(VP9Context *s,
1089 VP56mv *pmv, int ref, int z, int idx, int sb)
// Per-block-size neighbour offsets as {col,row} deltas used by the spatial
// candidate scan below.
1091 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1092 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1093 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1094 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1095 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1096 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1097 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1098 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1099 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1100 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1101 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1102 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1103 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1104 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1105 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1106 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1107 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1108 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1109 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1110 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1111 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1112 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1113 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1114 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1115 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1116 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1117 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1120 int row = s->row, col = s->col, row7 = s->row7;
1121 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
// Sentinel packed-MV value: 0x8000,0x8000 cannot occur as a real clamped MV.
1122 #define INVALID_MV 0x80008000U
1123 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
// Candidate from an already-decoded sub-8x8 sub-block of *this* block; no
// clamping needed since it was produced by this block's own decode.
1126 #define RETURN_DIRECT_MV(mv) \
1128 uint32_t m = AV_RN32A(&mv); \
1132 } else if (mem == INVALID_MV) { \
1134 } else if (m != mem) { \
// sb = sub-block index within an 8x8 block; for sb 1/2/3 prefer earlier
// sub-block MVs of the same block as the strongest predictors.
1141 if (sb == 2 || sb == 1) {
1142 RETURN_DIRECT_MV(b->mv[0][z]);
1143 } else if (sb == 3) {
1144 RETURN_DIRECT_MV(b->mv[2][z]);
1145 RETURN_DIRECT_MV(b->mv[1][z]);
1146 RETURN_DIRECT_MV(b->mv[0][z]);
// General candidate: clamp, then accept if it differs from what has been
// collected so far (mem / mem_sub8x8 track previously seen candidates).
1149 #define RETURN_MV(mv) \
1154 av_assert2(idx == 1); \
1155 av_assert2(mem != INVALID_MV); \
1156 if (mem_sub8x8 == INVALID_MV) { \
1157 clamp_mv(&tmp, &mv, s); \
1158 m = AV_RN32A(&tmp); \
1163 mem_sub8x8 = AV_RN32A(&mv); \
1164 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1165 clamp_mv(&tmp, &mv, s); \
1166 m = AV_RN32A(&tmp); \
1170 /* BUG I'm pretty sure this isn't the intention */ \
1176 uint32_t m = AV_RN32A(&mv); \
1178 clamp_mv(pmv, &mv, s); \
1180 } else if (mem == INVALID_MV) { \
1182 } else if (m != mem) { \
1183 clamp_mv(pmv, &mv, s); \
// Above neighbour (row - 1): use the cached above_mv_ctx entry matching ref.
1190 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1191 if (mv->ref[0] == ref) {
1192 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1193 } else if (mv->ref[1] == ref) {
1194 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
// Left neighbour (col - 1), only if inside the current tile.
1197 if (col > s->tiling.tile_col_start) {
1198 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1199 if (mv->ref[0] == ref) {
1200 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1201 } else if (mv->ref[1] == ref) {
1202 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1210 // previously coded MVs in this neighbourhood, using same reference frame
1211 for (; i < 8; i++) {
1212 int c = p[i][0] + col, r = p[i][1] + row;
1214 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1215 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1217 if (mv->ref[0] == ref) {
1218 RETURN_MV(mv->mv[0]);
1219 } else if (mv->ref[1] == ref) {
1220 RETURN_MV(mv->mv[1]);
1225 // MV at this position in previous frame, using same reference frame
1226 if (s->use_last_frame_mvs) {
1227 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
// Frame-threading: wait until the reference thread has decoded this row
// before reading its MVs (skipped when decoding in 2-pass mode).
1229 if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
1230 ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1231 if (mv->ref[0] == ref) {
1232 RETURN_MV(mv->mv[0]);
1233 } else if (mv->ref[1] == ref) {
1234 RETURN_MV(mv->mv[1]);
// Candidate from a different reference frame: flip the sign when the two
// references point in opposite temporal directions (signbias mismatch).
1238 #define RETURN_SCALE_MV(mv, scale) \
1241 VP56mv mv_temp = { -mv.x, -mv.y }; \
1242 RETURN_MV(mv_temp); \
1248 // previously coded MVs in this neighbourhood, using different reference frame
1249 for (i = 0; i < 8; i++) {
1250 int c = p[i][0] + col, r = p[i][1] + row;
1252 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1253 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1255 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1256 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1258 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1259 // BUG - libvpx has this condition regardless of whether
1260 // we used the first ref MV and pre-scaling
1261 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1262 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1267 // MV at this position in previous frame, using different reference frame
1268 if (s->use_last_frame_mvs) {
1269 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1271 // no need to await_progress, because we already did that above
1272 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1273 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1275 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1276 // BUG - libvpx has this condition regardless of whether
1277 // we used the first ref MV and pre-scaling
1278 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1279 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
// No usable candidate found: fall through with whatever is in pmv, clamped.
1284 clamp_mv(pmv, pmv, s);
1287 #undef RETURN_SCALE_MV
// Decode one motion-vector component delta (idx 0 = vertical, 1 = horizontal,
// matching the order it is called with in fill_mv) from the range coder:
// sign, magnitude class, then class-dependent integer bits, fractional (fp)
// bits and optional high-precision (hp) bit.  Also accumulates per-symbol
// counts for backward probability adaptation.  Returns the signed delta.
// NOTE(review): branch structure lines are missing from this excerpt.
1290 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1292 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1293 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1294 s->prob.p.mv_comp[idx].classes);
1296 s->counts.mv_comp[idx].sign[sign]++;
1297 s->counts.mv_comp[idx].classes[c]++;
// Classes >= 1: read 'c' raw integer bits, then fp (and hp when enabled).
1301 for (n = 0, m = 0; m < c; m++) {
1302 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1304 s->counts.mv_comp[idx].bits[m][bit]++;
1307 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1309 s->counts.mv_comp[idx].fp[bit]++;
1311 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1312 s->counts.mv_comp[idx].hp[bit]++;
1316 // bug in libvpx - we count for bw entropy purposes even if the
1318 s->counts.mv_comp[idx].hp[1]++;
// Class 0: single class0 bit plus its own fp/hp models.
1322 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1323 s->counts.mv_comp[idx].class0[n]++;
1324 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1325 s->prob.p.mv_comp[idx].class0_fp[n]);
1326 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1327 n = (n << 3) | (bit << 1);
1329 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1330 s->counts.mv_comp[idx].class0_hp[bit]++;
1334 // bug in libvpx - we count for bw entropy purposes even if the
1336 s->counts.mv_comp[idx].class0_hp[1]++;
// Magnitude is n+1 eighth-pel units; apply the decoded sign.
1340 return sign ? -(n + 1) : (n + 1);
// Derive the final motion vector(s) for one (sub-)block: predict via
// find_ref_mvs() for each active reference, then, for NEWMV, add the coded
// MV residual (joint flag + per-component deltas).  mv[0]/mv[1] correspond
// to the first/second reference of a (possibly compound) prediction.
// NOTE(review): the ZEROMV early path and some else branches are missing
// from this excerpt.
1343 static void fill_mv(VP9Context *s,
1344 VP56mv *mv, int mode, int sb)
1348 if (mode == ZEROMV) {
1353 // FIXME cache this value and reuse for other subblocks
1354 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1355 mode == NEWMV ? -1 : sb);
1356 // FIXME maybe move this code into find_ref_mvs()
// hp (use-high-precision) is disabled for large MVs even when the header
// enabled it; the (missing) branch presumably rounds the predictor down.
1357 if ((mode == NEWMV || sb == -1) &&
1358 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1372 if (mode == NEWMV) {
1373 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1374 s->prob.p.mv_joint);
1376 s->counts.mv_joint[j]++;
// Component order: 0 = vertical (y), 1 = horizontal (x).
1377 if (j >= MV_JOINT_V)
1378 mv[0].y += read_mv_component(s, 0, hp);
1380 mv[0].x += read_mv_component(s, 1, hp);
// Second reference of a compound prediction: same procedure as above.
1384 // FIXME cache this value and reuse for other subblocks
1385 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1386 mode == NEWMV ? -1 : sb);
1387 if ((mode == NEWMV || sb == -1) &&
1388 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1402 if (mode == NEWMV) {
1403 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1404 s->prob.p.mv_joint);
1406 s->counts.mv_joint[j]++;
1407 if (j >= MV_JOINT_V)
1408 mv[1].y += read_mv_component(s, 0, hp);
1410 mv[1].x += read_mv_component(s, 1, hp);
// Fill a w x h byte rectangle at 'ptr' (row stride 'stride') with value 'v',
// using width-specialized aligned stores (the surrounding switch/loop lines
// are missing from this excerpt).
1416 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1417 ptrdiff_t stride, int v)
1427 int v16 = v * 0x0101;
1435 uint32_t v32 = v * 0x01010101;
1444 uint64_t v64 = v * 0x0101010101010101ULL;
// Fallback path visible here writes two 32-bit halves per 8-byte span.
1450 uint32_t v32 = v * 0x01010101;
1453 AV_WN32A(ptr + 4, v32);
// Decode all mode information for one block: segment id, skip flag,
// intra/inter decision, transform size, intra prediction modes or inter
// reference frames + prediction modes + interpolation filter + motion
// vectors, and finally propagate everything into the above/left context
// arrays and the per-frame MV reference buffer used by later blocks and
// the next frame.
// NOTE(review): this excerpt is missing many interleaved lines (braces,
// else branches); comments only describe what is visible.
1462 static void decode_mode(AVCodecContext *ctx)
// Partition context written to the left/above arrays, indexed by block size.
1464 static const uint8_t left_ctx[N_BS_SIZES] = {
1465 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1467 static const uint8_t above_ctx[N_BS_SIZES] = {
1468 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
// Largest transform size allowed for each block size.
1470 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1471 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1472 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1474 VP9Context *s = ctx->priv_data;
1476 int row = s->row, col = s->col, row7 = s->row7;
1477 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
// w4/h4: block extent in 4x4 units, clipped to the frame edge.
1478 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1479 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1480 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1481 int vref, filter_id;
// --- segment id -------------------------------------------------------
1483 if (!s->segmentation.enabled) {
1485 } else if (s->keyframe || s->intraonly) {
1486 b->seg_id = !s->segmentation.update_map ? 0 :
1487 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
// Temporal prediction of the segment id from the previous frame's map.
1488 } else if (!s->segmentation.update_map ||
1489 (s->segmentation.temporal &&
1490 vp56_rac_get_prob_branchy(&s->c,
1491 s->prob.segpred[s->above_segpred_ctx[col] +
1492 s->left_segpred_ctx[row7]]))) {
1493 if (!s->errorres && !s->segmentation.ignore_refmap) {
1495 uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
// Frame-threading: make sure the reference segmap row is decoded.
1497 if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
1498 ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
// Predicted seg_id is the minimum over the co-located reference area.
1499 for (y = 0; y < h4; y++) {
1500 int idx_base = (y + row) * 8 * s->sb_cols + col;
1501 for (x = 0; x < w4; x++)
1502 pred = FFMIN(pred, refsegmap[idx_base + x]);
1504 av_assert1(pred < 8);
1510 memset(&s->above_segpred_ctx[col], 1, w4);
1511 memset(&s->left_segpred_ctx[row7], 1, h4);
1513 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1516 memset(&s->above_segpred_ctx[col], 0, w4);
1517 memset(&s->left_segpred_ctx[row7], 0, h4);
// Persist the decoded seg_id into the current frame's segmentation map.
1519 if (s->segmentation.enabled &&
1520 (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1521 setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1522 bw4, bh4, 8 * s->sb_cols, b->seg_id);
// --- skip flag --------------------------------------------------------
1525 b->skip = s->segmentation.enabled &&
1526 s->segmentation.feat[b->seg_id].skip_enabled;
1528 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1529 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1530 s->counts.skip[c][b->skip]++;
// --- intra/inter flag -------------------------------------------------
1533 if (s->keyframe || s->intraonly) {
1535 } else if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
1536 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1540 if (have_a && have_l) {
1541 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1544 c = have_a ? 2 * s->above_intra_ctx[col] :
1545 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1547 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1548 s->counts.intra[c][bit]++;
// --- transform size ---------------------------------------------------
1552 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1556 c = (s->above_skip_ctx[col] ? max_tx :
1557 s->above_txfm_ctx[col]) +
1558 (s->left_skip_ctx[row7] ? max_tx :
1559 s->left_txfm_ctx[row7]) > max_tx;
1561 c = s->above_skip_ctx[col] ? 1 :
1562 (s->above_txfm_ctx[col] * 2 > max_tx);
1564 } else if (have_l) {
1565 c = s->left_skip_ctx[row7] ? 1 :
1566 (s->left_txfm_ctx[row7] * 2 > max_tx);
// Unary-coded tx size, capped by max_tx (switch body partly missing here).
1572 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1574 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1576 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1578 s->counts.tx32p[c][b->tx]++;
1581 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1583 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1584 s->counts.tx16p[c][b->tx]++;
1587 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1588 s->counts.tx8p[c][b->tx]++;
1595 b->tx = FFMIN(max_tx, s->txfmmode);
// --- keyframe/intraonly: intra modes with default KF probabilities ----
1598 if (s->keyframe || s->intraonly) {
1599 uint8_t *a = &s->above_mode_ctx[col * 2];
1600 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1603 if (b->bs > BS_8x8) {
1604 // FIXME the memory storage intermediates here aren't really
1605 // necessary, they're just there to make the code slightly
1607 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1608 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1609 if (b->bs != BS_8x4) {
1610 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1611 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1612 l[0] = a[1] = b->mode[1];
1614 l[0] = a[1] = b->mode[1] = b->mode[0];
1616 if (b->bs != BS_4x8) {
1617 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1618 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1619 if (b->bs != BS_8x4) {
1620 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1621 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1622 l[1] = a[1] = b->mode[3];
1624 l[1] = a[1] = b->mode[3] = b->mode[2];
1627 b->mode[2] = b->mode[0];
1628 l[1] = a[1] = b->mode[3] = b->mode[1];
1631 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1632 vp9_default_kf_ymode_probs[*a][*l]);
1633 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1634 // FIXME this can probably be optimized
1635 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1636 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1638 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1639 vp9_default_kf_uvmode_probs[b->mode[3]]);
// --- inter-frame intra block: adaptive y_mode probabilities -----------
1640 } else if (b->intra) {
1642 if (b->bs > BS_8x8) {
1643 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1644 s->prob.p.y_mode[0]);
1645 s->counts.y_mode[0][b->mode[0]]++;
1646 if (b->bs != BS_8x4) {
1647 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1648 s->prob.p.y_mode[0]);
1649 s->counts.y_mode[0][b->mode[1]]++;
1651 b->mode[1] = b->mode[0];
1653 if (b->bs != BS_4x8) {
1654 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1655 s->prob.p.y_mode[0]);
1656 s->counts.y_mode[0][b->mode[2]]++;
1657 if (b->bs != BS_8x4) {
1658 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1659 s->prob.p.y_mode[0]);
1660 s->counts.y_mode[0][b->mode[3]]++;
1662 b->mode[3] = b->mode[2];
1665 b->mode[2] = b->mode[0];
1666 b->mode[3] = b->mode[1];
1669 static const uint8_t size_group[10] = {
1670 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1672 int sz = size_group[b->bs];
1674 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1675 s->prob.p.y_mode[sz]);
1676 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1677 s->counts.y_mode[sz][b->mode[3]]++;
1679 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1680 s->prob.p.uv_mode[b->mode[3]]);
1681 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
// --- inter block: references, filter, prediction modes, MVs -----------
// Context LUT mapping (above_mode, left_mode) to the inter-mode context.
1683 static const uint8_t inter_mode_ctx_lut[14][14] = {
1684 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1685 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1686 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1687 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1688 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1689 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1690 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1691 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1692 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1693 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1694 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1695 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1696 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1697 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
// Segment feature may pin the reference frame directly.
1700 if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
1701 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1703 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1705 // read comp_pred flag
1706 if (s->comppredmode != PRED_SWITCHABLE) {
1707 b->comp = s->comppredmode == PRED_COMPREF;
1711 // FIXME add intra as ref=0xff (or -1) to make these easier?
1714 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1716 } else if (s->above_comp_ctx[col]) {
1717 c = 2 + (s->left_intra_ctx[row7] ||
1718 s->left_ref_ctx[row7] == s->fixcompref);
1719 } else if (s->left_comp_ctx[row7]) {
1720 c = 2 + (s->above_intra_ctx[col] ||
1721 s->above_ref_ctx[col] == s->fixcompref);
1723 c = (!s->above_intra_ctx[col] &&
1724 s->above_ref_ctx[col] == s->fixcompref) ^
1725 (!s->left_intra_ctx[row7] &&
// NOTE(review): row & 7 == row7 here, so this matches the other accesses.
1726 s->left_ref_ctx[row & 7] == s->fixcompref);
1729 c = s->above_comp_ctx[col] ? 3 :
1730 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1732 } else if (have_l) {
1733 c = s->left_comp_ctx[row7] ? 3 :
1734 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1738 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1739 s->counts.comp[c][b->comp]++;
1742 // read actual references
1743 // FIXME probably cache a few variables here to prevent repetitive
1744 // memory accesses below
1745 if (b->comp) /* two references */ {
1746 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1748 b->ref[fix_idx] = s->fixcompref;
1749 // FIXME can this codeblob be replaced by some sort of LUT?
1752 if (s->above_intra_ctx[col]) {
1753 if (s->left_intra_ctx[row7]) {
1756 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1758 } else if (s->left_intra_ctx[row7]) {
1759 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1761 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1763 if (refl == refa && refa == s->varcompref[1]) {
1765 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1766 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1767 (refl == s->fixcompref && refa == s->varcompref[0])) {
1770 c = (refa == refl) ? 3 : 1;
1772 } else if (!s->left_comp_ctx[row7]) {
1773 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1776 c = (refl == s->varcompref[1] &&
1777 refa != s->varcompref[1]) ? 2 : 4;
1779 } else if (!s->above_comp_ctx[col]) {
1780 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1783 c = (refa == s->varcompref[1] &&
1784 refl != s->varcompref[1]) ? 2 : 4;
1787 c = (refl == refa) ? 4 : 2;
1791 if (s->above_intra_ctx[col]) {
1793 } else if (s->above_comp_ctx[col]) {
1794 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1796 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1799 } else if (have_l) {
1800 if (s->left_intra_ctx[row7]) {
1802 } else if (s->left_comp_ctx[row7]) {
1803 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1805 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1810 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1811 b->ref[var_idx] = s->varcompref[bit];
1812 s->counts.comp_ref[c][bit]++;
1813 } else /* single reference */ {
// First single_ref bit: LAST vs GOLDEN/ALTREF.
1816 if (have_a && !s->above_intra_ctx[col]) {
1817 if (have_l && !s->left_intra_ctx[row7]) {
1818 if (s->left_comp_ctx[row7]) {
1819 if (s->above_comp_ctx[col]) {
1820 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1821 !s->above_ref_ctx[col]);
1823 c = (3 * !s->above_ref_ctx[col]) +
1824 (!s->fixcompref || !s->left_ref_ctx[row7]);
1826 } else if (s->above_comp_ctx[col]) {
1827 c = (3 * !s->left_ref_ctx[row7]) +
1828 (!s->fixcompref || !s->above_ref_ctx[col]);
1830 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1832 } else if (s->above_intra_ctx[col]) {
1834 } else if (s->above_comp_ctx[col]) {
1835 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1837 c = 4 * (!s->above_ref_ctx[col]);
1839 } else if (have_l && !s->left_intra_ctx[row7]) {
// NOTE(review): this left_intra check is dead — the enclosing condition
// already guarantees !s->left_intra_ctx[row7] (matches upstream FFmpeg).
1840 if (s->left_intra_ctx[row7]) {
1842 } else if (s->left_comp_ctx[row7]) {
1843 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1845 c = 4 * (!s->left_ref_ctx[row7]);
1850 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1851 s->counts.single_ref[c][0][bit]++;
// Second single_ref bit: GOLDEN vs ALTREF.
1855 // FIXME can this codeblob be replaced by some sort of LUT?
1858 if (s->left_intra_ctx[row7]) {
1859 if (s->above_intra_ctx[col]) {
1861 } else if (s->above_comp_ctx[col]) {
1862 c = 1 + 2 * (s->fixcompref == 1 ||
1863 s->above_ref_ctx[col] == 1);
1864 } else if (!s->above_ref_ctx[col]) {
1867 c = 4 * (s->above_ref_ctx[col] == 1);
1869 } else if (s->above_intra_ctx[col]) {
1870 if (s->left_intra_ctx[row7]) {
1872 } else if (s->left_comp_ctx[row7]) {
1873 c = 1 + 2 * (s->fixcompref == 1 ||
1874 s->left_ref_ctx[row7] == 1);
1875 } else if (!s->left_ref_ctx[row7]) {
1878 c = 4 * (s->left_ref_ctx[row7] == 1);
1880 } else if (s->above_comp_ctx[col]) {
1881 if (s->left_comp_ctx[row7]) {
1882 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1883 c = 3 * (s->fixcompref == 1 ||
1884 s->left_ref_ctx[row7] == 1);
1888 } else if (!s->left_ref_ctx[row7]) {
1889 c = 1 + 2 * (s->fixcompref == 1 ||
1890 s->above_ref_ctx[col] == 1);
1892 c = 3 * (s->left_ref_ctx[row7] == 1) +
1893 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1895 } else if (s->left_comp_ctx[row7]) {
1896 if (!s->above_ref_ctx[col]) {
1897 c = 1 + 2 * (s->fixcompref == 1 ||
1898 s->left_ref_ctx[row7] == 1);
1900 c = 3 * (s->above_ref_ctx[col] == 1) +
1901 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1903 } else if (!s->above_ref_ctx[col]) {
1904 if (!s->left_ref_ctx[row7]) {
1907 c = 4 * (s->left_ref_ctx[row7] == 1);
1909 } else if (!s->left_ref_ctx[row7]) {
1910 c = 4 * (s->above_ref_ctx[col] == 1);
1912 c = 2 * (s->left_ref_ctx[row7] == 1) +
1913 2 * (s->above_ref_ctx[col] == 1);
1916 if (s->above_intra_ctx[col] ||
1917 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1919 } else if (s->above_comp_ctx[col]) {
1920 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1922 c = 4 * (s->above_ref_ctx[col] == 1);
1925 } else if (have_l) {
1926 if (s->left_intra_ctx[row7] ||
1927 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1929 } else if (s->left_comp_ctx[row7]) {
1930 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1932 c = 4 * (s->left_ref_ctx[row7] == 1);
1937 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1938 s->counts.single_ref[c][1][bit]++;
1939 b->ref[0] = 1 + bit;
// --- inter prediction mode + interpolation filter + MVs ---------------
1944 if (b->bs <= BS_8x8) {
1945 if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].skip_enabled) {
1946 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1948 static const uint8_t off[10] = {
1949 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1952 // FIXME this needs to use the LUT tables from find_ref_mvs
1953 // because not all are -1,0/0,-1
1954 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1955 [s->left_mode_ctx[row7 + off[b->bs]]];
1957 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1958 s->prob.p.mv_mode[c]);
1959 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
// Inter modes start at NEARESTMV (10) in the enum, hence the -10 bias.
1960 s->counts.mv_mode[c][b->mode[0] - 10]++;
1964 if (s->filtermode == FILTER_SWITCHABLE) {
1967 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1968 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1969 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1970 s->left_filter_ctx[row7] : 3;
1972 c = s->above_filter_ctx[col];
1974 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1975 c = s->left_filter_ctx[row7];
1980 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1981 s->prob.p.filter[c]);
1982 s->counts.filter[c][filter_id]++;
1983 b->filter = vp9_filter_lut[filter_id];
1985 b->filter = s->filtermode;
// Sub-8x8 blocks: one mode/MV per 4x4 or 4x8/8x4 sub-block.
1988 if (b->bs > BS_8x8) {
1989 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1991 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1992 s->prob.p.mv_mode[c]);
1993 s->counts.mv_mode[c][b->mode[0] - 10]++;
1994 fill_mv(s, b->mv[0], b->mode[0], 0);
1996 if (b->bs != BS_8x4) {
1997 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1998 s->prob.p.mv_mode[c]);
1999 s->counts.mv_mode[c][b->mode[1] - 10]++;
2000 fill_mv(s, b->mv[1], b->mode[1], 1);
2002 b->mode[1] = b->mode[0];
2003 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2004 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2007 if (b->bs != BS_4x8) {
2008 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2009 s->prob.p.mv_mode[c]);
2010 s->counts.mv_mode[c][b->mode[2] - 10]++;
2011 fill_mv(s, b->mv[2], b->mode[2], 2);
2013 if (b->bs != BS_8x4) {
2014 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2015 s->prob.p.mv_mode[c]);
2016 s->counts.mv_mode[c][b->mode[3] - 10]++;
2017 fill_mv(s, b->mv[3], b->mode[3], 3);
2019 b->mode[3] = b->mode[2];
2020 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
2021 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
2024 b->mode[2] = b->mode[0];
2025 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2026 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2027 b->mode[3] = b->mode[1];
2028 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2029 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
// >= 8x8: a single MV (pair) replicated across the four sub-slots.
2032 fill_mv(s, b->mv[0], b->mode[0], -1);
2033 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2034 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2035 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2036 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2037 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2038 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
// Reference written into the ref context arrays below.
2041 vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
// --- splat decoded info into above/left context arrays ----------------
// Two SPLAT_CTX variants: with/without fast 64-bit unaligned stores.
2045 #define SPLAT_CTX(var, val, n) \
2047 case 1: var = val; break; \
2048 case 2: AV_WN16A(&var, val * 0x0101); break; \
2049 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2050 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2052 uint64_t v64 = val * 0x0101010101010101ULL; \
2053 AV_WN64A( &var, v64); \
2054 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2059 #define SPLAT_CTX(var, val, n) \
2061 case 1: var = val; break; \
2062 case 2: AV_WN16A(&var, val * 0x0101); break; \
2063 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2065 uint32_t v32 = val * 0x01010101; \
2066 AV_WN32A( &var, v32); \
2067 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2071 uint32_t v32 = val * 0x01010101; \
2072 AV_WN32A( &var, v32); \
2073 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2074 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2075 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2081 switch (bwh_tab[1][b->bs][0]) {
2082 #define SET_CTXS(dir, off, n) \
2084 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2085 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2086 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2087 if (!s->keyframe && !s->intraonly) { \
2088 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2089 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2090 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2092 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2093 if (s->filtermode == FILTER_SWITCHABLE) { \
2094 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2099 case 1: SET_CTXS(above, col, 1); break;
2100 case 2: SET_CTXS(above, col, 2); break;
2101 case 4: SET_CTXS(above, col, 4); break;
2102 case 8: SET_CTXS(above, col, 8); break;
2104 switch (bwh_tab[1][b->bs][1]) {
2105 case 1: SET_CTXS(left, row7, 1); break;
2106 case 2: SET_CTXS(left, row7, 2); break;
2107 case 4: SET_CTXS(left, row7, 4); break;
2108 case 8: SET_CTXS(left, row7, 8); break;
// --- update left/above MV contexts used by find_ref_mvs ---------------
2113 if (!s->keyframe && !s->intraonly) {
2114 if (b->bs > BS_8x8) {
2115 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2117 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2118 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2119 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2120 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2121 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2122 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2123 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2124 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2126 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2128 for (n = 0; n < w4 * 2; n++) {
2129 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2130 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2132 for (n = 0; n < h4 * 2; n++) {
2133 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2134 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
// --- store refs/MVs into the per-frame buffer for the next frame ------
2140 for (y = 0; y < h4; y++) {
2141 int x, o = (row + y) * s->sb_cols * 8 + col;
2142 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2145 for (x = 0; x < w4; x++) {
2149 } else if (b->comp) {
2150 for (x = 0; x < w4; x++) {
2151 mv[x].ref[0] = b->ref[0];
2152 mv[x].ref[1] = b->ref[1];
2153 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2154 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2157 for (x = 0; x < w4; x++) {
2158 mv[x].ref[0] = b->ref[0];
2160 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2166 // FIXME merge cnt/eob arguments?
// Decode the quantized coefficients of one transform block from the range
// coder into 'coef' (zig-zag 'scan' order, neighbour contexts from 'nb').
// Compile-time flags (is_tx32x32, is8bitsperpixel) select the qmul halving
// and the coefficient storage width; cnt/eob accumulate symbol statistics
// for backward adaptation.  Returns are not visible in this excerpt
// (presumably the number of decoded coefficients / EOB position — confirm
// against the full file).
2167 static av_always_inline int
2168 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2169 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2170 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2171 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2172 const int16_t *band_counts, const int16_t *qmul)
2174 int i = 0, band = 0, band_left = band_counts[band];
2175 uint8_t *tp = p[0][nnz];
// Per-position magnitude cache feeding the nnz context of later positions.
2176 uint8_t cache[1024];
2181 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2182 eob[band][nnz][val]++;
2187 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2188 cnt[band][nnz][0]++;
2190 band_left = band_counts[++band];
2192 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2194 if (++i == n_coeffs)
2195 break; //invalid input; blocks should end with EOB
2200 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2201 cnt[band][nnz][1]++;
2205 // fill in p[3-10] (model fill) - only once per frame for each pos
2207 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2209 cnt[band][nnz][2]++;
// Magnitude decoding: small values explicitly, larger ones via the
// cat1..cat6 escape categories with fixed per-bit probabilities.
2210 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2211 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2212 cache[rc] = val = 2;
2214 val = 3 + vp56_rac_get_prob(c, tp[5]);
2217 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2219 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2220 val = 5 + vp56_rac_get_prob(c, 159);
2222 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2223 val += vp56_rac_get_prob(c, 145);
2227 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2228 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2229 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2230 val += (vp56_rac_get_prob(c, 148) << 1);
2231 val += vp56_rac_get_prob(c, 140);
2233 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2234 val += (vp56_rac_get_prob(c, 155) << 2);
2235 val += (vp56_rac_get_prob(c, 140) << 1);
2236 val += vp56_rac_get_prob(c, 135);
2238 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2239 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2240 val += (vp56_rac_get_prob(c, 157) << 3);
2241 val += (vp56_rac_get_prob(c, 141) << 2);
2242 val += (vp56_rac_get_prob(c, 134) << 1);
2243 val += vp56_rac_get_prob(c, 130);
// cat6: high bit-depth streams carry two extra magnitude bits.
2246 if (!is8bitsperpixel) {
2248 val += vp56_rac_get_prob(c, 255) << 17;
2249 val += vp56_rac_get_prob(c, 255) << 16;
2251 val += (vp56_rac_get_prob(c, 255) << 15);
2252 val += (vp56_rac_get_prob(c, 255) << 14);
2254 val += (vp56_rac_get_prob(c, 254) << 13);
2255 val += (vp56_rac_get_prob(c, 254) << 12);
2256 val += (vp56_rac_get_prob(c, 254) << 11);
2257 val += (vp56_rac_get_prob(c, 252) << 10);
2258 val += (vp56_rac_get_prob(c, 249) << 9);
2259 val += (vp56_rac_get_prob(c, 243) << 8);
2260 val += (vp56_rac_get_prob(c, 230) << 7);
2261 val += (vp56_rac_get_prob(c, 196) << 6);
2262 val += (vp56_rac_get_prob(c, 177) << 5);
2263 val += (vp56_rac_get_prob(c, 153) << 4);
2264 val += (vp56_rac_get_prob(c, 140) << 3);
2265 val += (vp56_rac_get_prob(c, 133) << 2);
2266 val += (vp56_rac_get_prob(c, 130) << 1);
2267 val += vp56_rac_get_prob(c, 129);
// 8bpp coefficients fit in int16_t; >8bpp streams store 32-bit values
// (two int16_t slots per coefficient).
2271 #define STORE_COEF(c, i, v) do { \
2272 if (is8bitsperpixel) { \
2275 AV_WN32A(&c[i * 2], v); \
2279 band_left = band_counts[++band];
// 32x32 transforms use qmul/2 per the VP9 dequantization rules.
2281 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2283 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2284 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2286 } while (++i < n_coeffs);
/* Thin wrapper around decode_coeffs_b_generic() for 8 bits-per-pixel content.
 * The three scalar arguments (0, 1, 8) presumably select non-32x32 transform,
 * 8bpp path, and bpp=8 respectively — confirm against the generic's signature,
 * which is not visible in this chunk. */
2291 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2292                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2293                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2294                                 const int16_t (*nb)[2], const int16_t *band_counts,
2295                                 const int16_t *qmul)
2297     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2298                                    nnz, scan, nb, band_counts, qmul);
/* 8bpp coefficient decoding for 32x32 transforms: identical to the _8bpp
 * wrapper except the first scalar argument is 1 (the "32" variant flag —
 * presumably halves dequantized values, see the /2 in the generic's
 * STORE_COEF path above; confirm against decode_coeffs_b_generic). */
2301 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2302                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2303                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2304                                 const int16_t (*nb)[2], const int16_t *band_counts,
2305                                 const int16_t *qmul)
2307     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2308                                    nnz, scan, nb, band_counts, qmul);
/* High-bit-depth (10/12-bit) coefficient decoding, non-32x32 transforms.
 * Passes is8bitsperpixel=0 and the actual stream bit depth s->bpp so the
 * generic decoder can extend the extra-bit chain (the <<16/<<17 reads). */
2311 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2312                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2313                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2314                                 const int16_t (*nb)[2], const int16_t *band_counts,
2315                                 const int16_t *qmul)
2317     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2318                                    nnz, scan, nb, band_counts, qmul);
/* High-bit-depth coefficient decoding for 32x32 transforms (is32=1,
 * is8bitsperpixel=0, bpp taken from the context). */
2321 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2322                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2323                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2324                                 const int16_t (*nb)[2], const int16_t *band_counts,
2325                                 const int16_t *qmul)
2327     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2328                                    nnz, scan, nb, band_counts, qmul);
/* Decode all residual coefficients for the current block (luma then both
 * chroma planes), updating the per-4x4 non-zero-context arrays (a/l) used as
 * entropy context by neighbouring blocks.  Returns via total_coeff whether
 * any non-zero coefficient was seen (exact return expression is elided from
 * this view).  Templated on is8bitsperpixel so the compiler specializes the
 * 8bpp and 16bpp paths. */
2331 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2333     VP9Context *s = ctx->priv_data;
2335     int row = s->row, col = s->col;
     /* probability / count / eob tables, indexed by tx size, plane and
      * intra-vs-inter; re-pointed to the UV tables further down */
2336     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2337     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2338     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
     /* block size in units of 4x4 sub-blocks, clipped to the frame edge */
2339     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2340     int end_x = FFMIN(2 * (s->cols - col), w4);
2341     int end_y = FFMIN(2 * (s->rows - row), h4);
2342     int n, pl, x, y, res;
2343     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
     /* lossless uses a separate scan-table bank (offset by 4) */
2344     int tx = 4 * s->lossless + b->tx;
2345     const int16_t * const *yscans = vp9_scans[tx];
2346     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2347     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2348     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
     /* above/left non-zero context for the luma plane */
2349     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2350     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
     /* coefficients per probability band, one row per tx size
      * (4x4, 8x8, 16x16, 32x32); last entry is the remainder */
2351     static const int16_t band_counts[4][8] = {
2352         { 1, 2, 3, 4,  3,   16 - 13 },
2353         { 1, 2, 3, 4, 11,   64 - 21 },
2354         { 1, 2, 3, 4, 11,  256 - 21 },
2355         { 1, 2, 3, 4, 11, 1024 - 21 },
2357     const int16_t *y_band_counts = band_counts[b->tx];
2358     const int16_t *uv_band_counts = band_counts[b->uvtx];
2359     int bytesperpixel = is8bitsperpixel ? 1 : 2;
2360     int total_coeff = 0;
 
 /* collapse <step> context entries into one flag before decoding a larger tx */
2362 #define MERGE(la, end, step, rd) \
2363     for (n = 0; n < end; n += step) \
2364         la[n] = !!rd(&la[n])
2365 #define MERGE_CTX(step, rd) \
2367         MERGE(l, end_y, step, rd); \
2368         MERGE(a, end_x, step, rd); \
 
 /* decode one luma sub-block per iteration; <v> picks the b32 variant */
2371 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2372     for (n = 0, y = 0; y < end_y; y += step) { \
2373         for (x = 0; x < end_x; x += step, n += step * step) { \
2374             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2375             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2376                                     (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2377                                      c, e, p, a[x] + l[y], yscans[txtp], \
2378                                      ynbs[txtp], y_band_counts, qmul[0]); \
2379             a[x] = l[y] = !!res; \
2380             total_coeff |= !!res; \
2382                 AV_WN16A(&s->eob[n], res); \
 
 /* propagate a decoded context flag back out to all <step> 4x4 entries;
  * <cond> distinguishes the un-clipped (vector store) and frame-edge
  * (memset) cases */
2389 #define SPLAT(la, end, step, cond) \
2391         for (n = 1; n < end; n += step) \
2392             la[n] = la[n - 1]; \
2393     } else if (step == 4) { \
2395         for (n = 0; n < end; n += step) \
2396             AV_WN32A(&la[n], la[n] * 0x01010101); \
2398         for (n = 0; n < end; n += step) \
2399             memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2401     } else /* step == 8 */ { \
2403         if (HAVE_FAST_64BIT) { \
2404             for (n = 0; n < end; n += step) \
2405                 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2407             for (n = 0; n < end; n += step) { \
2408                 uint32_t v32 = la[n] * 0x01010101; \
2409                 AV_WN32A(&la[n], v32); \
2410                 AV_WN32A(&la[n + 4], v32); \
2414         for (n = 0; n < end; n += step) \
2415             memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2418 #define SPLAT_CTX(step) \
2420         SPLAT(a, end_x, step, end_x == w4); \
2421         SPLAT(l, end_y, step, end_y == h4); \
 
     /* luma: one case per tx size (the switch itself is elided from this
      * view); for TX_4X4 sub-8x8 blocks each sub-block has its own mode */
2427         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2430         MERGE_CTX(2, AV_RN16A);
2431         DECODE_Y_COEF_LOOP(2, 0,);
2435         MERGE_CTX(4, AV_RN32A);
2436         DECODE_Y_COEF_LOOP(4, 0,);
2440         MERGE_CTX(8, AV_RN64A);
2441         DECODE_Y_COEF_LOOP(8, 0, 32);
 
 /* same as DECODE_Y_COEF_LOOP but for a chroma plane: fixed DCT_DCT scan
  * and the UV quantizer pair */
2446 #define DECODE_UV_COEF_LOOP(step, v) \
2447     for (n = 0, y = 0; y < end_y; y += step) { \
2448         for (x = 0; x < end_x; x += step, n += step * step) { \
2449             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2450                                     (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2451                                      16 * step * step, c, e, p, a[x] + l[y], \
2452                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
2453             a[x] = l[y] = !!res; \
2454             total_coeff |= !!res; \
2456                 AV_WN16A(&s->uveob[pl][n], res); \
2458                 s->uveob[pl][n] = res; \
 
     /* switch to the chroma probability/count tables, then decode both
      * chroma planes (end_x/end_y are rescaled for subsampling in elided
      * lines) */
2463     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2464     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2465     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2470     for (pl = 0; pl < 2; pl++) {
2471         a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2472         l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2475             DECODE_UV_COEF_LOOP(1,);
2478             MERGE_CTX(2, AV_RN16A);
2479             DECODE_UV_COEF_LOOP(2,);
2483             MERGE_CTX(4, AV_RN32A);
2484             DECODE_UV_COEF_LOOP(4,);
2488             MERGE_CTX(8, AV_RN64A);
2489             DECODE_UV_COEF_LOOP(8, 32);
/* Non-inline entry point: 8bpp specialization of decode_coeffs(). */
2498 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2500     return decode_coeffs(ctx, 1);
/* Non-inline entry point: high-bit-depth specialization of decode_coeffs(). */
2503 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2505     return decode_coeffs(ctx, 0);
/* Prepare the top (*a) and left (l) edge pixel arrays for intra prediction
 * of one transform block, substituting fallback prediction modes when the
 * needed neighbours are unavailable (frame/tile edges).  Returns the
 * (possibly remapped) prediction mode to use.  Handles both 8bpp and
 * high-bit-depth via bytesperpixel. */
2508 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2509                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2510                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2511                                              uint8_t *l, int col, int x, int w,
2512                                              int row, int y, enum TxfmMode tx,
2513                                              int p, int ss_h, int ss_v, int bytesperpixel)
     /* neighbour availability; "left" respects the tile boundary */
2515     int have_top = row > 0 || y > 0;
2516     int have_left = col > s->tiling.tile_col_start || x > 0;
2517     int have_right = x < w - 1;
     /* remap each mode to a DC fallback when top/left pixels are missing */
2519     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2520         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2521                                    { DC_127_PRED,          VERT_PRED } },
2522         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2523                                    { HOR_PRED,             HOR_PRED } },
2524         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2525                                    { LEFT_DC_PRED,         DC_PRED } },
2526         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2527                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2528         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2529                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2530         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2531                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2532         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2533                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2534         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2535                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2536         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2537                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2538         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2539                                    { HOR_PRED,             TM_VP8_PRED } },
     /* which edges each (already remapped) mode actually reads */
2541     static const struct {
2542         uint8_t needs_left:1;
2543         uint8_t needs_top:1;
2544         uint8_t needs_topleft:1;
2545         uint8_t needs_topright:1;
2546         uint8_t invert_left:1;
2547     } edges[N_INTRA_PRED_MODES] = {
2548         [VERT_PRED]            = { .needs_top  = 1 },
2549         [HOR_PRED]             = { .needs_left = 1 },
2550         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2551         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2552         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2553         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2554         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2555         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2556         [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
2557         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2558         [LEFT_DC_PRED]         = { .needs_left = 1 },
2559         [TOP_DC_PRED]          = { .needs_top  = 1 },
2560         [DC_128_PRED]          = { 0 },
2561         [DC_127_PRED]          = { 0 },
2562         [DC_129_PRED]          = { 0 }
 
2565     av_assert2(mode >= 0 && mode < 10);
2566     mode = mode_conv[mode][have_left][have_top];
2567     if (edges[mode].needs_top) {
2568         uint8_t *top, *topleft;
         /* pixels needed vs pixels actually available left of the frame edge */
2569         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2570         int n_px_need_tr = 0;
2572         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2575         // if top of sb64-row, use s->intra_pred_data[] instead of
2576         // dst[-stride] for intra prediction (it contains pre- instead of
2577         // post-loopfilter data)
2579         top = !(row & 7) && !y ?
2580             s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2581             y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2583             topleft = !(row & 7) && !y ?
2584                 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2585                 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2586                 &dst_inner[-stride_inner];
 
         /* fast path (condition partially elided): the edge row can be used
          * directly / copied without any padding or replication */
2590             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2591             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2592             n_px_need + n_px_need_tr <= n_px_have) {
2596             if (n_px_need <= n_px_have) {
2597                 memcpy(*a, top, n_px_need * bytesperpixel);
 
 /* replicate pixel (v)[(i2)] into (num) entries of (c) starting at (i1),
  * handling 1- and 2-byte pixels */
2599 #define memset_bpp(c, i1, v, i2, num) do { \
2600     if (bytesperpixel == 1) { \
2601         memset(&(c)[(i1)], (v)[(i2)], (num)); \
2603         int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2604         for (n = 0; n < (num); n++) { \
2605             AV_WN16A(&(c)[((i1) + n) * 2], val); \
 
             /* partial edge: copy what exists, replicate the last pixel */
2609                 memcpy(*a, top, n_px_have * bytesperpixel);
2610                 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
 
 /* fill (num) pixels of (c) with the scalar (val), 1- or 2-byte wide */
2613 #define memset_val(c, val, num) do { \
2614     if (bytesperpixel == 1) { \
2615         memset((c), (val), (num)); \
2618         for (n = 0; n < (num); n++) { \
2619             AV_WN16A(&(c)[n * 2], (val)); \
 
             /* no top available at all: synthesize the DC_127 constant,
              * scaled for the stream bit depth */
2623             memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2625         if (edges[mode].needs_topleft) {
2626             if (have_left && have_top) {
 /* copy a single 1- or 2-byte pixel from (v)[(i2)] to (c)[(i1)] */
2627 #define assign_bpp(c, i1, v, i2) do { \
2628     if (bytesperpixel == 1) { \
2629         (c)[(i1)] = (v)[(i2)]; \
2631         AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2634                 assign_bpp(*a, -1, topleft, -1);
 
2636 #define assign_val(c, i, v) do { \
2637     if (bytesperpixel == 1) { \
2640         AV_WN16A(&(c)[(i) * 2], (v)); \
 
                 /* missing topleft: 129 when top exists, 127 otherwise */
2643                 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2646         if (tx == TX_4X4 && edges[mode].needs_topright) {
2647             if (have_top && have_right &&
2648                 n_px_need + n_px_need_tr <= n_px_have) {
2649                 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
                 /* topright missing: replicate the last top pixel */
2651                 memset_bpp(*a, 4, *a, 3, 4);
 
2656     if (edges[mode].needs_left) {
2658             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2659             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2660             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
 
             /* gather the left column; HOR_UP stores it top-to-bottom
              * (invert_left), everything else bottom-to-top */
2662             if (edges[mode].invert_left) {
2663                 if (n_px_need <= n_px_have) {
2664                     for (i = 0; i < n_px_need; i++)
2665                         assign_bpp(l, i, &dst[i * stride], -1);
2667                     for (i = 0; i < n_px_have; i++)
2668                         assign_bpp(l, i, &dst[i * stride], -1);
2669                     memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2672                 if (n_px_need <= n_px_have) {
2673                     for (i = 0; i < n_px_need; i++)
2674                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2676                     for (i = 0; i < n_px_have; i++)
2677                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2678                     memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
             /* no left neighbour: DC_129 constant for this bit depth */
2682             memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/* Reconstruct an intra-coded block: run intra prediction for every transform
 * sub-block of the luma plane and both chroma planes, then add the inverse
 * transform of the decoded residual where the block is not skipped.
 * y_off/uv_off are byte offsets into the current frame's planes; templated
 * on bytesperpixel (1 = 8bpp, 2 = high bit depth). */
2689 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2690                                          ptrdiff_t uv_off, int bytesperpixel)
2692     VP9Context *s = ctx->priv_data;
2694     int row = s->row, col = s->col;
     /* step1d = tx width in 4px units; step = coeff entries per tx block */
2695     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2696     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2697     int end_x = FFMIN(2 * (s->cols - col), w4);
2698     int end_y = FFMIN(2 * (s->rows - row), h4);
2699     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2700     int uvstep1d = 1 << b->uvtx, p;
     /* dst walks the (possibly emulated) working buffer, dst_r the real frame */
2701     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2702     LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2703     LOCAL_ALIGNED_32(uint8_t, l, [64]);
 
2705     for (n = 0, y = 0; y < end_y; y += step1d) {
2706         uint8_t *ptr = dst, *ptr_r = dst_r;
2707         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2708                                ptr_r += 4 * step1d * bytesperpixel, n += step) {
             /* sub-8x8 4x4 blocks carry per-sub-block modes */
2709             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2711             uint8_t *a = &a_buf[32];
2712             enum TxfmType txtp = vp9_intra_txfm_type[mode];
             /* eob per tx block; >8x8 packs it as 16-bit */
2713             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
 
2715             mode = check_intra_mode(s, mode, &a, ptr_r,
2716                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2717                                     ptr, s->y_stride, l,
2718                                     col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2719             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
             /* add residual only when there are coefficients (guard elided) */
2721                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2722                                            s->block + 16 * n * bytesperpixel, eob);
2724         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2725         dst   += 4 * step1d * s->y_stride;
 
     /* same pass for both chroma planes, always DCT_DCT */
2732     step = 1 << (b->uvtx * 2);
2733     for (p = 0; p < 2; p++) {
2734         dst   = s->dst[1 + p];
2735         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2736         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2737             uint8_t *ptr = dst, *ptr_r = dst_r;
2738             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2739                                    ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2740                 int mode = b->uvmode;
2741                 uint8_t *a = &a_buf[32];
2742                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
 
2744                 mode = check_intra_mode(s, mode, &a, ptr_r,
2745                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2746                                         ptr, s->uv_stride, l, col, x, w4, row, y,
2747                                         b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2748                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2750                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2751                                                     s->uvblock[p] + 16 * n * bytesperpixel, eob);
2753             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2754             dst   += 4 * uvstep1d * s->uv_stride;
/* 8bpp specialization of intra_recon(). */
2759 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2761     intra_recon(ctx, y_off, uv_off, 1);
/* High-bit-depth (2 bytes/pixel) specialization of intra_recon(). */
2764 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2766     intra_recon(ctx, y_off, uv_off, 2);
/* Luma motion compensation when the reference frame has the same size as the
 * current frame (no scaling).  Waits on the reference's decode progress
 * (frame-threading), falls back to emulated_edge_mc when the motion vector
 * reaches outside the reference, then runs the subpel MC function. */
2769 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2770                                               uint8_t *dst, ptrdiff_t dst_stride,
2771                                               const uint8_t *ref, ptrdiff_t ref_stride,
2772                                               ThreadFrame *ref_frame,
2773                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2774                                               int bw, int bh, int w, int h, int bytesperpixel)
2776     int mx = mv->x, my = mv->y, th;
 
     /* integer-pel portion of the MV is folded into the ref pointer
      * (the mx/my >>= adjustments are elided from this view) */
2780     ref += y * ref_stride + x * bytesperpixel;
2783     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2784     // we use +7 because the last 7 pixels of each sbrow can be changed in
2785     // the longest loopfilter of the next sbrow
2786     th = (y + bh + 4 * !!my + 7) >> 6;
2787     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
     /* !!mx / !!my: subpel filtering needs 3 pixels before and 4 after */
2788     if (x < !!mx * 3 || y < !!my * 3 ||
2789         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2790         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2791                                  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2793                                  bw + !!mx * 7, bh + !!my * 7,
2794                                  x - !!mx * 3, y - !!my * 3, w, h);
         /* 160 = stride of the edge emu buffer in this layout */
2795         ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2798     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/* Chroma counterpart of mc_luma_unscaled(): handles both chroma planes in
 * one call, scaling the MV up for subsampled planes (<< !ss_h / << !ss_v)
 * and using separate strides for the U and V sources. */
2801 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2802                                                 uint8_t *dst_u, uint8_t *dst_v,
2803                                                 ptrdiff_t dst_stride,
2804                                                 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2805                                                 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2806                                                 ThreadFrame *ref_frame,
2807                                                 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2808                                                 int bw, int bh, int w, int h, int bytesperpixel)
2810     int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
 
2814     ref_u += y * src_stride_u + x * bytesperpixel;
2815     ref_v += y * src_stride_v + x * bytesperpixel;
2818     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2819     // we use +7 because the last 7 pixels of each sbrow can be changed in
2820     // the longest loopfilter of the next sbrow
2821     th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2822     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2823     if (x < !!mx * 3 || y < !!my * 3 ||
2824         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
         /* out-of-frame MV: pad each plane into the shared edge buffer and
          * run MC immediately so U and V can reuse the same scratch area */
2825         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2826                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2828                                  bw + !!mx * 7, bh + !!my * 7,
2829                                  x - !!mx * 3, y - !!my * 3, w, h);
2830         ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2831         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
 
2833         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2834                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2836                                  bw + !!mx * 7, bh + !!my * 7,
2837                                  x - !!mx * 3, y - !!my * 3, w, h);
2838         ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2839         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
         /* in-frame fast path (else branch, opening line elided) */
2841         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2842         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
/* Instantiate the shared inter-prediction template twice (8bpp and 16bpp)
 * with the mc_luma_dir/mc_chroma_dir macros bound to the UNSCALED MC
 * helpers above.  The px/py/pw/ph/i template parameters are unused in the
 * unscaled case and deliberately dropped by these macros. */
2846 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2847                     px, py, pw, ph, bw, bh, w, h, i) \
2848     mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2849                      mv, bw, bh, w, h, bytesperpixel)
2850 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2851                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2852     mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2853                        row, col, mv, bw, bh, w, h, bytesperpixel)
 
2855 #define FN(x) x##_8bpp
2856 #define BYTES_PER_PIXEL 1
2857 #include "vp9_mc_template.c"
2859 #undef BYTES_PER_PIXEL
2860 #define FN(x) x##_16bpp
2861 #define BYTES_PER_PIXEL 2
2862 #include "vp9_mc_template.c"
2864 #undef mc_chroma_dir
2866 #undef BYTES_PER_PIXEL
/* Luma MC with reference-frame scaling (reference resolution differs from
 * the current frame).  Delegates to the unscaled path when sizes match;
 * otherwise clips the MV, maps block/MV coordinates through the per-ref
 * scale factors (14-bit fixed point) and uses the scaled-MC dsp function. */
2869 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2870                                             vp9_mc_func (*mc)[2],
2871                                             uint8_t *dst, ptrdiff_t dst_stride,
2872                                             const uint8_t *ref, ptrdiff_t ref_stride,
2873                                             ThreadFrame *ref_frame,
2874                                             ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2875                                             int px, int py, int pw, int ph,
2876                                             int bw, int bh, int w, int h, int bytesperpixel,
2877                                             const uint16_t *scale, const uint8_t *step)
2879     if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2880         s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2881         mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2882                          y, x, in_mv, bw, bh, w, h, bytesperpixel);
 
 /* map a coordinate through the 14-bit fixed-point ref scale factor */
2884 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2886         int refbw_m1, refbh_m1;
 
         /* clip the MV so the scaled access stays near the visible area */
2890         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2891         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2892         // BUG libvpx seems to scale the two components separately. This introduces
2893         // rounding errors but we have to reproduce them to be exactly compatible
2894         // with the output from libvpx...
2895         mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2896         my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
 
2900         ref += y * ref_stride + x * bytesperpixel;
         /* last reference sample touched by the scaled filter run */
2903         refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2904         refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2905         // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2906         // we use +7 because the last 7 pixels of each sbrow can be changed in
2907         // the longest loopfilter of the next sbrow
2908         th = (y + refbh_m1 + 4 + 7) >> 6;
2909         ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2910         if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2911             s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2912                                      ref - 3 * ref_stride - 3 * bytesperpixel,
2914                                      refbw_m1 + 8, refbh_m1 + 8,
2915                                      x - 3, y - 3, w, h);
             /* 288 = wider edge-emu stride used by the scaled path */
2916             ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2919         smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/* Chroma MC with reference-frame scaling: same structure as mc_luma_scaled()
 * but for both chroma planes, with per-axis handling that depends on the
 * subsampling flags (the ss_h/ss_v branch lines themselves are elided). */
2923 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2924                                               vp9_mc_func (*mc)[2],
2925                                               uint8_t *dst_u, uint8_t *dst_v,
2926                                               ptrdiff_t dst_stride,
2927                                               const uint8_t *ref_u, ptrdiff_t src_stride_u,
2928                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
2929                                               ThreadFrame *ref_frame,
2930                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2931                                               int px, int py, int pw, int ph,
2932                                               int bw, int bh, int w, int h, int bytesperpixel,
2933                                               const uint16_t *scale, const uint8_t *step)
2935     if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2936         s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
         /* same-size reference: reuse the unscaled chroma path */
2937         mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2938                            ref_v, src_stride_v, ref_frame,
2939                            y, x, in_mv, bw, bh, w, h, bytesperpixel);
2942         int refbw_m1, refbh_m1;
 
         /* subsampled x: replicate libvpx's odd rounding (see webm issue 820) */
2947             // BUG https://code.google.com/p/webm/issues/detail?id=820
2948             mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2949             mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2951             mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2952             mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
 
         /* same scheme for the vertical axis */
2955             // BUG https://code.google.com/p/webm/issues/detail?id=820
2956             mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2957             my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2959             mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2960             my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
 
2965         ref_u += y * src_stride_u + x * bytesperpixel;
2966         ref_v += y * src_stride_v + x * bytesperpixel;
2969         refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2970         refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2971         // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2972         // we use +7 because the last 7 pixels of each sbrow can be changed in
2973         // the longest loopfilter of the next sbrow
2974         th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2975         ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2976         if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
             /* edge-emulate and filter each plane from the shared buffer */
2977             s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2978                                      ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2980                                      refbw_m1 + 8, refbh_m1 + 8,
2981                                      x - 3, y - 3, w, h);
2982             ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2983             smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
 
2985             s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2986                                      ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2988                                      refbw_m1 + 8, refbh_m1 + 8,
2989                                      x - 3, y - 3, w, h);
2990             ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2991             smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
             /* in-frame fast path (else branch, opening line elided) */
2993             smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2994             smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
/* Second pair of template instantiations: the same inter-prediction template
 * bound to the SCALED MC helpers, producing the *_scaled_8bpp and
 * *_scaled_16bpp entry points; forwards the per-reference mvscale/mvstep
 * tables selected by the reference index i. */
2999 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
3000                     px, py, pw, ph, bw, bh, w, h, i) \
3001     mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
3002                    mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
3003                    s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
3004 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
3005                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
3006     mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
3007                      row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
3008                      s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
 
3010 #define FN(x) x##_scaled_8bpp
3011 #define BYTES_PER_PIXEL 1
3012 #include "vp9_mc_template.c"
3014 #undef BYTES_PER_PIXEL
3015 #define FN(x) x##_scaled_16bpp
3016 #define BYTES_PER_PIXEL 2
3017 #include "vp9_mc_template.c"
3019 #undef mc_chroma_dir
3021 #undef BYTES_PER_PIXEL
/* Reconstruct an inter-coded block: dispatch to the scaled or unscaled
 * inter-prediction instantiation (scaled when any used reference has a
 * non-zero mvscale), then add the inverse-transformed residual for luma and
 * both chroma planes. */
3024 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3026     VP9Context *s = ctx->priv_data;
3028     int row = s->row, col = s->col;
 
3030     if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3031         if (bytesperpixel == 1) {
3032             inter_pred_scaled_8bpp(ctx);
3034             inter_pred_scaled_16bpp(ctx);
3037         if (bytesperpixel == 1) {
3038             inter_pred_8bpp(ctx);
3040             inter_pred_16bpp(ctx);
 
     /* residual add path, mostly copied from intra_recon() (skip guard
      * around this section is elided from view) */
3044         /* mostly copied intra_recon() */
3046         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3047         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3048         int end_x = FFMIN(2 * (s->cols - col), w4);
3049         int end_y = FFMIN(2 * (s->rows - row), h4);
3050         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
3051         int uvstep1d = 1 << b->uvtx, p;
3052         uint8_t *dst = s->dst[0];
 
         /* luma residual: inter blocks always use DCT_DCT */
3055         for (n = 0, y = 0; y < end_y; y += step1d) {
3057             for (x = 0; x < end_x; x += step1d,
3058                  ptr += 4 * step1d * bytesperpixel, n += step) {
3059                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
 
3062                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3063                                                   s->block + 16 * n * bytesperpixel, eob);
3065             dst += 4 * s->y_stride * step1d;
 
         /* chroma residual for both planes */
3071         step = 1 << (b->uvtx * 2);
3072         for (p = 0; p < 2; p++) {
3073             dst = s->dst[p + 1];
3074             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3076                 for (x = 0; x < end_x; x += uvstep1d,
3077                      ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3078                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
 
3081                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3082                                                         s->uvblock[p] + 16 * n * bytesperpixel, eob);
3084                 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp specialization of inter_recon(). */
3090 static void inter_recon_8bpp(AVCodecContext *ctx)
3092     inter_recon(ctx, 1);
/* High-bit-depth (2 bytes/pixel) specialization of inter_recon(). */
3095 static void inter_recon_16bpp(AVCodecContext *ctx)
3097     inter_recon(ctx, 2);
/* Build the loopfilter edge bitmasks for one block into
 * mask[plane][col-vs-row][row][filter-width-bucket].  Each bit represents an
 * 8px column position within the superblock; buckets 0..3 select the 16-,
 * 8-, 4-px-wide and inner-4x4 filters. */
3100 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3101                                         int row_and_7, int col_and_7,
3102                                         int w, int h, int col_end, int row_end,
3103                                         enum TxfmMode tx, int skip_inter)
     /* which column/row bit positions get the wide (8px) filter */
3105     static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3106     static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
 
3108     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3109     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3110     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3111     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
 
3113     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3114     // edges. This means that for UV, we work on two subsampled blocks at
3115     // a time, and we only use the topleft block's mode information to set
3116     // things like block strength. Thus, for any block size smaller than
3117     // 16x16, ignore the odd portion of the block.
3118     if (tx == TX_4X4 && (ss_v | ss_h)) {
 
     /* non-skipped 4x4: every internal edge is filtered */
3133     if (tx == TX_4X4 && !skip_inter) {
3134         int t = 1 << col_and_7, m_col = (t << w) - t, y;
3135         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3136         int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
 
3138         for (y = row_and_7; y < h + row_and_7; y++) {
3139             int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
 
3141             mask[0][y][1] |= m_row_8;
3142             mask[0][y][2] |= m_row_4;
3143             // for odd lines, if the odd col is not being filtered,
3144             // skip odd row also:
3151             // if a/c are even row/col and b/d are odd, and d is skipped,
3152             // e.g. right edge of size-66x66.webm, then skip b also (bug)
3153             if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3154                 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3156                 mask[1][y][col_mask_id] |= m_col;
             /* inner 4x4 edges (bucket 3); guard lines elided */
3159                 mask[0][y][3] |= m_col;
3161                 if (ss_h && (col_end & 1))
3162                     mask[1][y][3] |= (t << (w - 1)) - t;
3164                     mask[1][y][3] |= m_col;
     /* skipped or larger-tx blocks: only the block's outer edges */
3168         int y, t = 1 << col_and_7, m_col = (t << w) - t;
 
         /* skip_inter branch (guard elided): mark tx-sized internal edges */
3171             int mask_id = (tx == TX_8X8);
3172             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3173             int l2 = tx + ss_h - 1, step1d;
3174             int m_row = m_col & masks[l2];
 
3176             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3177             // 8wd loopfilter to prevent going off the visible edge.
3178             if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3179                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3180                 int m_row_8 = m_row - m_row_16;
 
3182                 for (y = row_and_7; y < h + row_and_7; y++) {
3183                     mask[0][y][0] |= m_row_16;
3184                     mask[0][y][1] |= m_row_8;
3187                 for (y = row_and_7; y < h + row_and_7; y++)
3188                     mask[0][y][mask_id] |= m_row;
 
             /* same trick for the vertical (row) edges */
3193             if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3194                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3195                     mask[1][y][0] |= m_col;
3196                 if (y - row_and_7 == h - 1)
3197                     mask[1][y][1] |= m_col;
3199                 for (y = row_and_7; y < h + row_and_7; y += step1d)
3200                     mask[1][y][mask_id] |= m_col;
3202         } else if (tx != TX_4X4) {
 
             /* block boundary only; clamp the filter width to the block size */
3205             mask_id = (tx == TX_8X8) || (h == ss_v);
3206             mask[1][row_and_7][mask_id] |= m_col;
3207             mask_id = (tx == TX_8X8) || (w == ss_h);
3208             for (y = row_and_7; y < h + row_and_7; y++)
3209                 mask[0][y][mask_id] |= t;
             /* skipped TX_4X4: only the left/top outer edge of the block */
3211             int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
 
3213             for (y = row_and_7; y < h + row_and_7; y++) {
3214                 mask[0][y][2] |= t4;
3215                 mask[0][y][1] |= t8;
3217             mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
// Decode and reconstruct one coding block at (row, col): read its modes and
// coefficients, run intra or inter reconstruction, copy back edge-emulated
// pixels, and record loopfilter levels/masks for the covered area.
// NOTE(review): this is an elided listing — interior lines are missing, so
// comments describe only the statements visible here.
3222 static void decode_b(AVCodecContext *ctx, int row, int col,
3223 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3224 enum BlockLevel bl, enum BlockPartition bp)
3226 VP9Context *s = ctx->priv_data;
// Block size is derived from the partition level and partition type.
3228 enum BlockSize bs = bl * 3 + bp;
3229 int bytesperpixel = s->bytesperpixel;
// w4/h4: block width/height in units of 4 pixels (from the size lookup table).
3230 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3232 AVFrame *f = s->frames[CUR_FRAME].tf.f;
// Clamp motion vectors so prediction stays within 128px (1/8-pel scaled by 64
// per 4x4 unit) of the frame edge relative to this block's position.
3238 s->min_mv.x = -(128 + col * 64);
3239 s->min_mv.y = -(128 + row * 64);
3240 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3241 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// Chroma transform size drops one step when chroma subsampling makes the
// (half-sized) chroma block smaller than the luma transform.
3247 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3248 (s->ss_v && h4 * 2 == (1 << b->tx)));
// Coefficient decoding is split per bit depth (8bpp vs. 10/12bpp paths).
3253 if (bytesperpixel == 1) {
3254 has_coeffs = decode_coeffs_8bpp(ctx);
3256 has_coeffs = decode_coeffs_16bpp(ctx);
// No coefficients on a small inter block: mark it as skipped in the
// above/left skip contexts so neighbouring blocks see it as a skip.
3258 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3260 memset(&s->above_skip_ctx[col], 1, w4);
3261 memset(&s->left_skip_ctx[s->row7], 1, h4);
// Helper macros: zero n bytes of non-zero-coefficient context with the
// widest store available for that size.
3266 #define SPLAT_ZERO_CTX(v, n) \
3268 case 1: v = 0; break; \
3269 case 2: AV_ZERO16(&v); break; \
3270 case 4: AV_ZERO32(&v); break; \
3271 case 8: AV_ZERO64(&v); break; \
3272 case 16: AV_ZERO128(&v); break; \
// Zero the luma and (subsampling-dependent) chroma nnz contexts in one
// direction (above or left).
3274 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3276 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3277 if (s->ss_##dir2) { \
3278 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3279 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3281 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3282 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
// Clear nnz contexts for the block's width (above) and height (left),
// dispatched on w4/h4 — presumably via a switch whose header is elided here.
3287 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3288 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3289 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3290 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3293 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3294 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3295 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3296 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
// Advance the coefficient/eob cursors past this block (chroma shares the
// shifted-down sizes when subsampled).
3301 s->block += w4 * h4 * 64 * bytesperpixel;
3302 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3303 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3304 s->eob += 4 * w4 * h4;
3305 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3306 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3312 // emulated overhangs if the stride of the target buffer can't hold. This
3313 // makes it possible to support emu-edge and so on even if we have large block
// Decide per plane whether the block overhangs the frame (bottom) or the
// buffer stride (right); if so, reconstruct into temporary buffers.
3315 emu[0] = (col + w4) * 8 > f->linesize[0] ||
3316 (row + h4) > s->rows;
3317 emu[1] = (col + w4) * 4 > f->linesize[1] ||
3318 (row + h4) > s->rows;
3320 s->dst[0] = s->tmp_y;
3323 s->dst[0] = f->data[0] + yoff;
3324 s->y_stride = f->linesize[0];
3327 s->dst[1] = s->tmp_uv[0];
3328 s->dst[2] = s->tmp_uv[1];
3331 s->dst[1] = f->data[1] + uvoff;
3332 s->dst[2] = f->data[2] + uvoff;
3333 s->uv_stride = f->linesize[1];
// Reconstruction proper, split by intra/inter and by bit depth.
3337 intra_recon_16bpp(ctx, yoff, uvoff);
3339 intra_recon_8bpp(ctx, yoff, uvoff);
3343 inter_recon_16bpp(ctx);
3345 inter_recon_8bpp(ctx);
// Copy the visible part of an emulated luma block back into the frame,
// using progressively smaller mc copy functions (stride 128 = tmp buffer).
3349 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3351 for (n = 0; o < w; n++) {
3356 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3357 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
// Same copy-back for the two chroma planes, sizes shifted by subsampling.
3363 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3364 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3366 for (n = s->ss_h; o < w; n++) {
3371 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3372 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3373 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3374 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3380 // pick filter level and find edges to apply filter to
// Filter level is segment-dependent and indexed by intra/ref and ZEROMV-ness.
3381 if (s->filter.level &&
3382 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3383 [b->mode[3] != ZEROMV]) > 0) {
3384 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3385 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
// Record the level for each covered 8x8 cell, then build edge masks for
// luma (mask[0]) and, if subsampled, chroma (mask[1]).
3387 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3388 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3389 if (s->ss_h || s->ss_v)
3390 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3391 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3392 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3393 b->uvtx, skip_inter);
// Lazily fill the sharpness-adjusted filter limit LUTs for this level.
3395 if (!s->filter.lim_lut[lvl]) {
3396 int sharp = s->filter.sharpness;
3400 limit >>= (sharp + 3) >> 2;
3401 limit = FFMIN(limit, 9 - sharp);
3403 limit = FFMAX(limit, 1);
3405 s->filter.lim_lut[lvl] = limit;
3406 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// Second cursor-advance path — presumably for the pass that skipped the
// advance above (elided control flow); same arithmetic as lines 3301-3306.
3412 s->block += w4 * h4 * 64 * bytesperpixel;
3413 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3414 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3415 s->eob += 4 * w4 * h4;
3416 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3417 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
// Recursively decode one superblock quadrant: read the partition symbol for
// this level from the range coder (context built from above/left partition
// state), then either decode a block directly or recurse into the four
// sub-quadrants. Edge quadrants that don't fit in the frame use the reduced
// probability reads (p[1]/p[2]) instead of the full partition tree.
// NOTE(review): elided listing — some interior lines (braces, switch header,
// case labels) are missing.
3421 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3422 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3424 VP9Context *s = ctx->priv_data;
// Partition probability context from the above/left partition bitmaps.
3425 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3426 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
// Keyframes/intra-only frames use the fixed default partition probabilities.
3427 const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
3428 s->prob.p.partition[bl][c];
3429 enum BlockPartition bp;
// hbs: half block size at this level, in 8x8 units (4, 2, 1 for 64/32/16).
3430 ptrdiff_t hbs = 4 >> bl;
3431 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3432 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3433 int bytesperpixel = s->bytesperpixel;
// (elided branch) smallest level: read partition and decode one block.
3436 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3437 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3438 } else if (col + hbs < s->cols) { // FIXME why not <=?
3439 if (row + hbs < s->rows) { // FIXME why not <=?
// Whole quadrant fits: read full partition symbol and dispatch.
3440 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3442 case PARTITION_NONE:
3443 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// (PARTITION_H, label elided): top half, then bottom half one hbs down.
3446 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3447 yoff += hbs * 8 * y_stride;
3448 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3449 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
// (PARTITION_V, label elided): left half, then right half one hbs across.
3452 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3453 yoff += hbs * 8 * bytesperpixel;
3454 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3455 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3457 case PARTITION_SPLIT:
// Recurse into the four sub-quadrants (TL, TR, BL, BR).
3458 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3459 decode_sb(ctx, row, col + hbs, lflvl,
3460 yoff + 8 * hbs * bytesperpixel,
3461 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3462 yoff += hbs * 8 * y_stride;
3463 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3464 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3465 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3466 yoff + 8 * hbs * bytesperpixel,
3467 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only split-vs-not is coded (single branch on p[1]).
3472 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3473 bp = PARTITION_SPLIT;
3474 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3475 decode_sb(ctx, row, col + hbs, lflvl,
3476 yoff + 8 * hbs * bytesperpixel,
3477 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3480 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Right edge: only split-vs-not on p[2], splitting vertically.
3482 } else if (row + hbs < s->rows) { // FIXME why not <=?
3483 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3484 bp = PARTITION_SPLIT;
3485 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3486 yoff += hbs * 8 * y_stride;
3487 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3488 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3491 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// Bottom-right corner: forced split, only the top-left quadrant exists.
3494 bp = PARTITION_SPLIT;
3495 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// Count the chosen partition for backward probability adaptation.
3497 s->counts.partition[bl][c][bp]++;
// Second-pass variant of decode_sb: instead of reading partition symbols from
// the bitstream, it replays the partition decisions stored in s->b during the
// first pass and re-runs block reconstruction with the same layout.
// NOTE(review): elided listing — opening braces and some branch headers are
// missing between visible lines.
3500 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3501 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3503 VP9Context *s = ctx->priv_data;
3505 ptrdiff_t hbs = 4 >> bl;
3506 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3507 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3508 int bytesperpixel = s->bytesperpixel;
// (elided condition) deepest level: the stored block must be 8x8.
3511 av_assert2(b->bl == BL_8X8);
3512 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3513 } else if (s->b->bl == bl) {
// Stored block matches this level: decode it, plus its H/V partner when the
// partner still lies inside the frame.
3514 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3515 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3516 yoff += hbs * 8 * y_stride;
3517 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3518 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3519 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3520 yoff += hbs * 8 * bytesperpixel;
3521 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3522 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// Otherwise the block was split: recurse into the quadrants that exist.
3525 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3526 if (col + hbs < s->cols) { // FIXME why not <=?
3527 if (row + hbs < s->rows) {
3528 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3529 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3530 yoff += hbs * 8 * y_stride;
3531 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3532 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3533 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3534 yoff + 8 * hbs * bytesperpixel,
3535 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
// Bottom edge: only the top-right quadrant follows the top-left.
3537 yoff += hbs * 8 * bytesperpixel;
3538 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3539 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
// Right edge: only the bottom-left quadrant follows the top-left.
3541 } else if (row + hbs < s->rows) {
3542 yoff += hbs * 8 * y_stride;
3543 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3544 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Apply the loop filter to vertical edges (edges *between columns*) of one
// plane within a 64x64 superblock, driven by the per-edge bitmasks built in
// mask_edges. mask[..][0..2] select 16/8/4-wide filters, [3] the inner-4
// edges; lvl holds the per-8x8-cell filter level, ls is the plane stride.
// NOTE(review): elided listing — several else/brace lines are missing.
3549 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3550 uint8_t *lvl, uint8_t (*mask)[4],
3551 uint8_t *dst, ptrdiff_t ls)
3553 int y, x, bytesperpixel = s->bytesperpixel;
3555 // filter edges between columns (e.g. block1 | block2)
// Walk two 8px rows at a time (adjusted for vertical subsampling) so an
// upper/lower row pair can be filtered with one paired call.
3556 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3557 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
// hm1/hm2: any-edge summaries for the two rows; hm13/hm23: inner-4 edges.
3558 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3559 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3560 unsigned hm = hm1 | hm2 | hm13 | hm23;
// x walks one bit per 8px column; loop stops once no mask bit >= x remains.
3562 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
// L: packed filter level for this cell; H = high nibble (hev threshold).
3565 int L = *l, H = L >> 4;
3566 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3568 if (hmask1[0] & x) {
// 16-wide edge in both rows: use the single 16px filter, else 8px wide.
3569 if (hmask2[0] & x) {
3570 av_assert2(l[8 << ss_v] == L);
3571 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3573 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3575 } else if (hm2 & x) {
// Both rows have an edge here: pack the second row's E/I into the upper
// byte and use the mix2 (paired) filter; otherwise filter one row.
3578 E |= s->filter.mblim_lut[L] << 8;
3579 I |= s->filter.lim_lut[L] << 8;
3580 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3582 [0](ptr, ls, E, I, H);
3584 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3585 [0](ptr, ls, E, I, H);
3587 } else if (hm2 & x) {
// Edge only in the second row: reload level from the lower cell.
3588 int L = l[8 << ss_v], H = L >> 4;
3589 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3591 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3592 [0](ptr + 8 * ls, ls, E, I, H);
// Inner-4 (transform-internal) edges, offset by 4px into the cell.
3600 int L = *l, H = L >> 4;
3601 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3606 E |= s->filter.mblim_lut[L] << 8;
3607 I |= s->filter.lim_lut[L] << 8;
3608 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3610 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3612 } else if (hm23 & x) {
3613 int L = l[8 << ss_v], H = L >> 4;
3614 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3616 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
// Apply the loop filter to horizontal edges (edges *between rows*) of one
// plane within a 64x64 superblock — the row-direction counterpart of
// filter_plane_cols; here adjacent-column pairs are merged into one paired
// (mix2) filter call instead of adjacent rows.
// NOTE(review): elided listing — several else/brace lines are missing.
3624 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3625 uint8_t *lvl, uint8_t (*mask)[4],
3626 uint8_t *dst, ptrdiff_t ls)
3628 int y, x, bytesperpixel = s->bytesperpixel;
3631 // filter edges between rows (e.g. ------)
// One 8px row of cells per iteration (subsampled stride step for chroma).
3633 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3634 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
// vm: any 16/8/4 edge in this row; vm3: inner-4 edges.
3635 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// x advances two mask bits (four when subsampled) so each iteration covers
// a horizontally adjacent pair of 8px columns.
3637 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3640 int L = *l, H = L >> 4;
3641 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
// (elided condition) edge present in the first column of the pair:
3644 if (vmask[0] & (x << (1 + ss_h))) {
// 16-wide edge spanning both columns → single 16px call, else 8px.
3645 av_assert2(l[1 + ss_h] == L);
3646 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3648 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3650 } else if (vm & (x << (1 + ss_h))) {
// Both columns have an edge: pack the neighbour's E/I into the upper
// byte and use the paired mix2 filter; otherwise a single 8px call.
3653 E |= s->filter.mblim_lut[L] << 8;
3654 I |= s->filter.lim_lut[L] << 8;
3655 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3656 [!!(vmask[1] & (x << (1 + ss_h)))]
3657 [1](ptr, ls, E, I, H);
3659 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3660 [1](ptr, ls, E, I, H);
3662 } else if (vm & (x << (1 + ss_h))) {
// Edge only in the second column: reload its level and filter at +8px.
3663 int L = l[1 + ss_h], H = L >> 4;
3664 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3666 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3667 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
// Inner-4 edges, 4 lines down into the cell (ptr + ls * 4).
3672 int L = *l, H = L >> 4;
3673 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3675 if (vm3 & (x << (1 + ss_h))) {
3678 E |= s->filter.mblim_lut[L] << 8;
3679 I |= s->filter.lim_lut[L] << 8;
3680 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3682 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3684 } else if (vm3 & (x << (1 + ss_h))) {
3685 int L = l[1 + ss_h], H = L >> 4;
3686 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3688 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
// Run the loop filter over one 64x64 superblock: columns then rows for luma,
// then the same for each of the two chroma planes. Chroma uses mask[1] when
// either direction is subsampled, otherwise shares mask[0].
3701 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3702 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3704 VP9Context *s = ctx->priv_data;
3705 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3706 uint8_t *dst = f->data[0] + yoff;
3707 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
// Chroma mask selection: subsampled planes have their own mask set.
3708 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3711 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3712 // if you think of them as acting on a 8x8 block max, we can interleave
3713 // each v/h within the single x loop, but that only works if we work on
3714 // 8 pixel blocks, and we won't always do that (we want at least 16px
3715 // to use SSE2 optimizations, perhaps 32 for AVX2)
// Luma: vertical edges first, then horizontal (order matters in VP9).
3717 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3718 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
// Chroma planes (U then V) share levels and the selected uv mask set.
3720 for (p = 0; p < 2; p++) {
3721 dst = f->data[1 + p] + uvoff;
3722 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3723 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
// Compute the [start, end) pixel-block range of tile `idx` out of 2^log2_n
// tiles over n superblocks; results are in 8-pixel (mi) units (<< 3).
3727 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3729 int sb_start = ( idx * n) >> log2_n;
3730 int sb_end = ((idx + 1) * n) >> log2_n;
// Clamp to n so the last tile never reads past the frame edge.
3731 *start = FFMIN(sb_start, n) << 3;
3732 *end = FFMIN(sb_end, n) << 3;
// Backward-adapt a single binary probability *p towards the observed counts
// (ct0 = "bit 0" events, ct1 = "bit 1" events). The empirical probability p2
// is blended into *p with a weight proportional to how many events were seen
// (capped at max_count), scaled by update_factor/256.
// NOTE(review): elided listing — the early-out for ct==0 and the read of the
// current probability into p1 are not visible here.
3735 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3736 int max_count, int update_factor)
3738 unsigned ct = ct0 + ct1, p2, p1;
// Empirical probability of "0", rounded, clipped to the legal 1..255 range.
3744 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3745 p2 = av_clip(p2, 1, 255);
// Scale the update weight by the (capped) event count.
3746 ct = FFMIN(ct, max_count);
3747 update_factor = FASTDIV(update_factor * ct, max_count);
3749 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3750 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
// End-of-frame backward probability adaptation: fold the symbol counts
// gathered while decoding (s->counts.*) into the frame context's probability
// tables (s->prob_ctx[s->framectxid]) via adapt_prob. Coefficient probs use
// a faster update (uf) after keyframes; everything else uses 20/128.
// NOTE(review): elided listing — loop variable declarations, braces and some
// intermediate lines are missing between visible statements.
3753 static void adapt_probs(VP9Context *s)
3756 prob_context *p = &s->prob_ctx[s->framectxid].p;
// Stronger update factor right after a keyframe/intra-only frame.
3757 int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
// Coefficient probabilities: [tx size][plane type][ref][band][coef context].
3760 for (i = 0; i < 4; i++)
3761 for (j = 0; j < 2; j++)
3762 for (k = 0; k < 2; k++)
3763 for (l = 0; l < 6; l++)
3764 for (m = 0; m < 6; m++) {
3765 uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3766 unsigned *e = s->counts.eob[i][j][k][l][m];
3767 unsigned *c = s->counts.coef[i][j][k][l][m];
3769 if (l == 0 && m >= 3) // dc only has 3 pt
3772 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3773 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3774 adapt_prob(&pp[2], c[1], c[2], 24, uf);
// On intra frames only coefficients adapt; copy the remaining tables over
// unchanged and stop (inter symbols were never coded).
3777 if (s->keyframe || s->intraonly) {
3778 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3779 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3780 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3781 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
// Skip flag probabilities.
3786 for (i = 0; i < 3; i++)
3787 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
// Intra-vs-inter probabilities.
3790 for (i = 0; i < 4; i++)
3791 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
// Compound prediction mode selection.
3794 if (s->comppredmode == PRED_SWITCHABLE) {
3795 for (i = 0; i < 5; i++)
3796 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
// Compound reference selection.
3800 if (s->comppredmode != PRED_SINGLEREF) {
3801 for (i = 0; i < 5; i++)
3802 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3803 s->counts.comp_ref[i][1], 20, 128);
// Single reference selection (two-bit tree).
3806 if (s->comppredmode != PRED_COMPREF) {
3807 for (i = 0; i < 5; i++) {
3808 uint8_t *pp = p->single_ref[i];
3809 unsigned (*c)[2] = s->counts.single_ref[i];
3811 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3812 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3816 // block partitioning
3817 for (i = 0; i < 4; i++)
3818 for (j = 0; j < 4; j++) {
3819 uint8_t *pp = p->partition[i][j];
3820 unsigned *c = s->counts.partition[i][j];
3822 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3823 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3824 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// Transform-size trees (only adapted when the frame used switchable tx).
3828 if (s->txfmmode == TX_SWITCHABLE) {
3829 for (i = 0; i < 2; i++) {
3830 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3832 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3833 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3834 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3835 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3836 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3837 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3841 // interpolation filter
3842 if (s->filtermode == FILTER_SWITCHABLE) {
3843 for (i = 0; i < 4; i++) {
3844 uint8_t *pp = p->filter[i];
3845 unsigned *c = s->counts.filter[i];
3847 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3848 adapt_prob(&pp[1], c[1], c[2], 20, 128);
// Inter mode tree (ZEROMV/NEARESTMV/NEARMV/NEWMV) per context.
3853 for (i = 0; i < 7; i++) {
3854 uint8_t *pp = p->mv_mode[i];
3855 unsigned *c = s->counts.mv_mode[i];
3857 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3858 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3859 adapt_prob(&pp[2], c[1], c[3], 20, 128);
// Motion vector joint (which components are nonzero).
3864 uint8_t *pp = p->mv_joint;
3865 unsigned *c = s->counts.mv_joint;
3867 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3868 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3869 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// MV components (i = 0: vertical, 1: horizontal per struct layout order —
// presumably; verify against the mv decode code).
3873 for (i = 0; i < 2; i++) {
3875 unsigned *c, (*c2)[2], sum;
3877 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3878 s->counts.mv_comp[i].sign[1], 20, 128);
// Magnitude class tree (11 classes, tree-structured binary probs).
3880 pp = p->mv_comp[i].classes;
3881 c = s->counts.mv_comp[i].classes;
3882 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3883 adapt_prob(&pp[0], c[0], sum, 20, 128);
3885 adapt_prob(&pp[1], c[1], sum, 20, 128);
3887 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3888 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3890 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3891 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3893 adapt_prob(&pp[6], c[6], sum, 20, 128);
3894 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3895 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3896 adapt_prob(&pp[9], c[9], c[10], 20, 128);
// Class-0 bit, then the 10 per-bit magnitude probabilities.
3898 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3899 s->counts.mv_comp[i].class0[1], 20, 128);
3900 pp = p->mv_comp[i].bits;
3901 c2 = s->counts.mv_comp[i].bits;
3902 for (j = 0; j < 10; j++)
3903 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
// Fractional-pel trees for class0 and the general case.
3905 for (j = 0; j < 2; j++) {
3906 pp = p->mv_comp[i].class0_fp[j];
3907 c = s->counts.mv_comp[i].class0_fp[j];
3908 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3909 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3910 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3912 pp = p->mv_comp[i].fp;
3913 c = s->counts.mv_comp[i].fp;
3914 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3915 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3916 adapt_prob(&pp[2], c[2], c[3], 20, 128);
// 1/8-pel bits are only coded when high-precision MVs were enabled.
3918 if (s->highprecisionmvs) {
3919 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3920 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3921 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3922 s->counts.mv_comp[i].hp[1], 20, 128);
// Luma intra mode tree: `sum` tracks the not-yet-consumed remainder as the
// tree is walked top-down.
3927 for (i = 0; i < 4; i++) {
3928 uint8_t *pp = p->y_mode[i];
3929 unsigned *c = s->counts.y_mode[i], sum, s2;
3931 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3932 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3933 sum -= c[TM_VP8_PRED];
3934 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3935 sum -= c[VERT_PRED];
3936 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3937 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3939 adapt_prob(&pp[3], s2, sum, 20, 128);
3941 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3942 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3943 sum -= c[DIAG_DOWN_LEFT_PRED];
3944 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3945 sum -= c[VERT_LEFT_PRED];
3946 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3947 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Chroma intra mode tree — same structure, conditioned on the luma mode.
3951 for (i = 0; i < 10; i++) {
3952 uint8_t *pp = p->uv_mode[i];
3953 unsigned *c = s->counts.uv_mode[i], sum, s2;
3955 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3956 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3957 sum -= c[TM_VP8_PRED];
3958 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3959 sum -= c[VERT_PRED];
3960 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3961 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3963 adapt_prob(&pp[3], s2, sum, 20, 128);
3965 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3966 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3967 sum -= c[DIAG_DOWN_LEFT_PRED];
3968 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3969 sum -= c[VERT_LEFT_PRED];
3970 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3971 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
// Release the per-frame-size scratch allocations (intra prediction backup,
// block struct array, coefficient buffers); safe to call repeatedly since
// av_freep NULLs the pointers.
3975 static void free_buffers(VP9Context *s)
3977 av_freep(&s->intra_pred_data[0]);
3978 av_freep(&s->b_base);
3979 av_freep(&s->block_base);
// Codec close callback: unreference and free the three internal frames and
// all current/next reference ThreadFrames, then (in elided code, presumably)
// the remaining scratch buffers.
3982 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3984 VP9Context *s = ctx->priv_data;
// Internal frames: CUR_FRAME plus the mvpair/segmap reference copies.
3987 for (i = 0; i < 3; i++) {
3988 if (s->frames[i].tf.f->data[0])
3989 vp9_unref_frame(ctx, &s->frames[i]);
3990 av_frame_free(&s->frames[i].tf.f);
// The 8 reference slots, both the active and the in-construction sets.
3992 for (i = 0; i < 8; i++) {
3993 if (s->refs[i].f->data[0])
3994 ff_thread_release_buffer(ctx, &s->refs[i]);
3995 av_frame_free(&s->refs[i].f);
3996 if (s->next_refs[i].f->data[0])
3997 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3998 av_frame_free(&s->next_refs[i].f);
// Main decode entry point: parse the frame header, manage the frame/reference
// buffers, decode all tiles (optionally in two passes for frame threading),
// run the loop filter per superblock row, adapt probabilities, and rotate the
// reference set. Returns the consumed size on success, <0 on error.
// NOTE(review): elided listing — error-goto labels, some braces and loop
// headers are missing between visible lines.
4008 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
4009 int *got_frame, AVPacket *pkt)
4011 const uint8_t *data = pkt->data;
4012 int size = pkt->size;
4013 VP9Context *s = ctx->priv_data;
4014 int res, tile_row, tile_col, i, ref, row, col;
// Keep the previous segmentation map only when this frame won't rewrite it.
4015 int retain_segmap_ref = s->frames[REF_FRAME_SEGMAP].segmentation_map &&
4016 (!s->segmentation.enabled || !s->segmentation.update_map);
4017 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
4021 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
// res == 0: "show existing frame" — output reference `ref` directly without
// decoding any residual data.
4023 } else if (res == 0) {
4024 if (!s->refs[ref].f->data[0]) {
4025 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4026 return AVERROR_INVALIDDATA;
4028 if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
4030 ((AVFrame *)frame)->pkt_pts = pkt->pts;
4031 ((AVFrame *)frame)->pkt_dts = pkt->dts;
// References are carried over unchanged for a show-existing frame.
4032 for (i = 0; i < 8; i++) {
4033 if (s->next_refs[i].f->data[0])
4034 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4035 if (s->refs[i].f->data[0] &&
4036 (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
// Rotate internal frames: the previous CUR_FRAME becomes the source for
// segmentation-map and mv prediction of this frame (unless retained/reset).
4045 if (!retain_segmap_ref || s->keyframe || s->intraonly) {
4046 if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
4047 vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4048 if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4049 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
4052 if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
4053 vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
4054 if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4055 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
4057 if (s->frames[CUR_FRAME].tf.f->data[0])
4058 vp9_unref_frame(ctx, &s->frames[CUR_FRAME])
4059 if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
4061 f = s->frames[CUR_FRAME].tf.f;
4062 f->key_frame = s->keyframe;
4063 f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4064 ls_y = f->linesize[0];
4065 ls_uv =f->linesize[1];
// Build the next reference set: refresh-masked slots point at the new
// frame, others carry over the existing reference.
4068 for (i = 0; i < 8; i++) {
4069 if (s->next_refs[i].f->data[0])
4070 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4071 if (s->refreshrefmask & (1 << i)) {
4072 res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
4073 } else if (s->refs[i].f->data[0]) {
4074 res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
4080 // main tile decode loop
// Reset the above-row contexts for the whole frame width.
4081 bytesperpixel = s->bytesperpixel;
4082 memset(s->above_partition_ctx, 0, s->cols);
4083 memset(s->above_skip_ctx, 0, s->cols);
4084 if (s->keyframe || s->intraonly) {
4085 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4087 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4089 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4090 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4091 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4092 memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass mode is used for frame threading when this frame updates the
// probability context and isn't in parallel (error-resilient) mode.
4093 s->pass = s->frames[CUR_FRAME].uses_2pass =
4094 ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
4095 if ((res = update_block_buffers(ctx)) < 0) {
4096 av_log(ctx, AV_LOG_ERROR,
4097 "Failed to allocate block buffers\n");
// In parallel mode the context update is forward-only: commit the frame
// probabilities now and unblock consumer threads early.
4100 if (s->refreshctx && s->parallelmode) {
4103 for (i = 0; i < 4; i++) {
4104 for (j = 0; j < 2; j++)
4105 for (k = 0; k < 2; k++)
4106 for (l = 0; l < 6; l++)
4107 for (m = 0; m < 6; m++)
4108 memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
4109 s->prob.coef[i][j][k][l][m], 3);
4110 if (s->txfmmode == i)
4113 s->prob_ctx[s->framectxid].p = s->prob.p;
4114 ff_thread_finish_setup(ctx);
4115 } else if (!s->refreshctx) {
4116 ff_thread_finish_setup(ctx);
// Per-pass init: rewind the coefficient buffers to their bases.
4122 s->block = s->block_base;
4123 s->uvblock[0] = s->uvblock_base[0];
4124 s->uvblock[1] = s->uvblock_base[1];
4125 s->eob = s->eob_base;
4126 s->uveob[0] = s->uveob_base[0];
4127 s->uveob[1] = s->uveob_base[1];
// Set up one range decoder per tile column; each tile is length-prefixed
// (32-bit size) except the last one, which uses the remaining bytes.
4129 for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
4130 set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
4131 tile_row, s->tiling.log2_tile_rows, s->sb_rows);
4133 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4136 if (tile_col == s->tiling.tile_cols - 1 &&
4137 tile_row == s->tiling.tile_rows - 1) {
4140 tile_size = AV_RB32(data);
4144 if (tile_size > size) {
4145 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4146 return AVERROR_INVALIDDATA;
4148 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4149 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4150 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4151 return AVERROR_INVALIDDATA;
// Decode superblock rows; tile columns are interleaved within each row so
// loopfilter/progress can proceed a full frame-width row at a time.
4158 for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
4159 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4160 struct VP9Filter *lflvl_ptr = s->lflvl;
4161 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4163 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4164 set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
4165 tile_col, s->tiling.log2_tile_cols, s->sb_cols);
// Reset the left-edge contexts at the start of each tile row segment.
4168 memset(s->left_partition_ctx, 0, 8);
4169 memset(s->left_skip_ctx, 0, 8);
4170 if (s->keyframe || s->intraonly) {
4171 memset(s->left_mode_ctx, DC_PRED, 16);
4173 memset(s->left_mode_ctx, NEARESTMV, 8);
4175 memset(s->left_y_nnz_ctx, 0, 16);
4176 memset(s->left_uv_nnz_ctx, 0, 32);
4177 memset(s->left_segpred_ctx, 0, 8);
// Swap in this tile's range decoder state (and back out below).
4179 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4182 for (col = s->tiling.tile_col_start;
4183 col < s->tiling.tile_col_end;
4184 col += 8, yoff2 += 64 * bytesperpixel,
4185 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4186 // FIXME integrate with lf code (i.e. zero after each
4187 // use, similar to invtxfm coefficients, or similar)
4189 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// Pass 2 replays stored partitions; pass 0/1 parses the bitstream.
4193 decode_sb_mem(ctx, row, col, lflvl_ptr,
4194 yoff2, uvoff2, BL_64X64);
4196 decode_sb(ctx, row, col, lflvl_ptr,
4197 yoff2, uvoff2, BL_64X64);
4201 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4209 // backup pre-loopfilter reconstruction data for intra
4210 // prediction of next row of sb64s
4211 if (row + 8 < s->rows) {
4212 memcpy(s->intra_pred_data[0],
4213 f->data[0] + yoff + 63 * ls_y,
4214 8 * s->cols * bytesperpixel);
4215 memcpy(s->intra_pred_data[1],
4216 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4217 8 * s->cols * bytesperpixel >> s->ss_h);
4218 memcpy(s->intra_pred_data[2],
4219 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4220 8 * s->cols * bytesperpixel >> s->ss_h);
4223 // loopfilter one row
4224 if (s->filter.level) {
4227 lflvl_ptr = s->lflvl;
4228 for (col = 0; col < s->cols;
4229 col += 8, yoff2 += 64 * bytesperpixel,
4230 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4231 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4235 // FIXME maybe we can make this more finegrained by running the
4236 // loopfilter per-block instead of after each sbrow
4237 // In fact that would also make intra pred left preparation easier?
4238 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
// After the parse pass, adapt probabilities and release waiting threads.
4242 if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
4244 ff_thread_finish_setup(ctx);
4246 } while (s->pass++ == 1);
4247 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
// Promote next_refs to refs for the following frame.
4250 for (i = 0; i < 8; i++) {
4251 if (s->refs[i].f->data[0])
4252 ff_thread_release_buffer(ctx, &s->refs[i]);
4253 if (s->next_refs[i].f->data[0] &&
4254 (res = ff_thread_ref_frame(&s->refs[i], &s->next_refs[i])) < 0)
// Output the decoded frame unless it is marked invisible.
4258 if (!s->invisible) {
4259 if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
// Flush callback (e.g. on seek): drop all internal frames and reference
// buffers so no stale prediction data survives.
4267 static void vp9_decode_flush(AVCodecContext *ctx)
4269 VP9Context *s = ctx->priv_data;
4272 for (i = 0; i < 3; i++)
4273 vp9_unref_frame(ctx, &s->frames[i]);
4274 for (i = 0; i < 8; i++)
4275 ff_thread_release_buffer(ctx, &s->refs[i]);
// Allocate the AVFrame shells for the 3 internal frames and the 8+8
// reference slots. On any allocation failure everything already allocated
// is torn down via vp9_decode_free before returning ENOMEM.
4278 static int init_frames(AVCodecContext *ctx)
4280 VP9Context *s = ctx->priv_data;
4283 for (i = 0; i < 3; i++) {
4284 s->frames[i].tf.f = av_frame_alloc();
4285 if (!s->frames[i].tf.f) {
4286 vp9_decode_free(ctx);
4287 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4288 return AVERROR(ENOMEM);
4291 for (i = 0; i < 8; i++) {
4292 s->refs[i].f = av_frame_alloc();
4293 s->next_refs[i].f = av_frame_alloc();
4294 if (!s->refs[i].f || !s->next_refs[i].f) {
4295 vp9_decode_free(ctx);
4296 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4297 return AVERROR(ENOMEM);
// Codec init callback: enable per-frame progress allocation for frame
// threading, mark sharpness as "unset" (-1 forces LUT rebuild on first
// header), and allocate the frame shells.
4304 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4306 VP9Context *s = ctx->priv_data;
4308 ctx->internal->allocate_progress = 1;
4310 s->filter.sharpness = -1;
4312 return init_frames(ctx);
// Frame-threading worker init: each thread copy only needs its own frame
// shells; all other state is synced via update_thread_context.
4315 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4317 return init_frames(avctx);
// Frame-threading sync: copy the decoding state a future frame depends on
// from the just-finished thread context (src) into this one (dst) — frame
// references, probability contexts, and the header fields that carry across
// frames.
4320 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4323 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4325 // detect size changes in other threads
4326 if (s->intra_pred_data[0] &&
4327 (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
// Re-reference the internal frames from the source thread.
4331 for (i = 0; i < 3; i++) {
4332 if (s->frames[i].tf.f->data[0])
4333 vp9_unref_frame(dst, &s->frames[i]);
4334 if (ssrc->frames[i].tf.f->data[0]) {
4335 if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
// Our refs become the source's next_refs (its post-frame reference set).
4339 for (i = 0; i < 8; i++) {
4340 if (s->refs[i].f->data[0])
4341 ff_thread_release_buffer(dst, &s->refs[i]);
4342 if (ssrc->next_refs[i].f->data[0]) {
4343 if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
// Scalar state carried between frames.
4348 s->invisible = ssrc->invisible;
4349 s->keyframe = ssrc->keyframe;
4350 s->intraonly = ssrc->intraonly;
4351 s->ss_v = ssrc->ss_v;
4352 s->ss_h = ssrc->ss_h;
4353 s->segmentation.enabled = ssrc->segmentation.enabled;
4354 s->segmentation.update_map = ssrc->segmentation.update_map;
4355 s->segmentation.absolute_vals = ssrc->segmentation.absolute_vals;
4356 s->bytesperpixel = ssrc->bytesperpixel;
4358 s->bpp_index = ssrc->bpp_index;
// Probability contexts and filter/segmentation tables.
4359 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4360 memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4361 memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4362 sizeof(s->segmentation.feat));
// Supported VP9 profiles (0/1: 8-bit, 2/3: 10/12-bit), terminated by
// FF_PROFILE_UNKNOWN as required by the AVProfile list convention.
4367 static const AVProfile profiles[] = {
4368 { FF_PROFILE_VP9_0, "Profile 0" },
4369 { FF_PROFILE_VP9_1, "Profile 1" },
4370 { FF_PROFILE_VP9_2, "Profile 2" },
4371 { FF_PROFILE_VP9_3, "Profile 3" },
4372 { FF_PROFILE_UNKNOWN },
4375 AVCodec ff_vp9_decoder = {
4377 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4378 .type = AVMEDIA_TYPE_VIDEO,
4379 .id = AV_CODEC_ID_VP9,
4380 .priv_data_size = sizeof(VP9Context),
4381 .init = vp9_decode_init,
4382 .close = vp9_decode_free,
4383 .decode = vp9_decode_frame,
4384 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4385 .flush = vp9_decode_flush,
4386 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4387 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4388 .profiles = NULL_IF_CONFIG_SMALL(profiles),