/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
33 #include "libavutil/avassert.h"
34 #include "libavutil/pixdesc.h"
36 #define VP9_SYNCCODE 0x498342
// Per-frame state: decoded picture (tf) plus side data kept alive in one
// AVBufferRef so it can be shared across frame threads.
// NOTE(review): this listing is incomplete — several members (e.g. the
// ThreadFrame and uses_2pass fields referenced below) and the closing brace
// are missing from the visible excerpt.
73 typedef struct VP9Frame {
75 AVBufferRef *extradata; // backing storage for segmentation_map and mv below
76 uint8_t *segmentation_map; // one segment id per 8x8 block (64 per superblock, see vp9_alloc_frame)
77 struct VP9mvrefPair *mv; // per-8x8-block mv/ref pair, placed after the map in extradata
// Loop-filter edge mask layout — belongs to a separate filter struct whose
// opening line is not visible in this excerpt.
83 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
84 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block decode state: mode decisions and motion vectors for the block
// currently being reconstructed.
// NOTE(review): incomplete excerpt — intermediate members and the closing
// "} VP9Block;" line are missing.
87 typedef struct VP9Block {
88 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
89 enum FilterMode filter;
90 VP56mv mv[4 /* b_idx */][2 /* ref */]; // up to 4 sub-block mvs, up to 2 references each
92 enum TxfmMode tx, uvtx; // transform sizes for luma and chroma
94 enum BlockPartition bp;
// Main decoder context. Holds bitstream readers, frame-header fields,
// probability models, symbol counts, and per-row prediction caches.
// NOTE(review): incomplete excerpt — many members (gb/c range coders, frames[],
// filter/lf_delta/segmentation sub-structs, prob/prob_ctx, tiling, counts
// wrapper braces) and the closing "} VP9Context;" are missing; several of the
// arrays below belong to nested sub-structs whose enclosing lines were dropped.
97 typedef struct VP9Context {
104 VP9Block *b_base, *b;
// current position, in 8x8-block units; *7 variants are the &7 remainders
106 int row, row7, col, col7;
108 ptrdiff_t y_stride, uv_stride;
// frame-header flags
111 uint8_t keyframe, last_keyframe;
112 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
114 uint8_t use_last_frame_mvs;
119 uint8_t refreshrefmask;
120 uint8_t highprecisionmvs;
121 enum FilterMode filtermode;
122 uint8_t allowcompinter;
125 uint8_t parallelmode;
129 uint8_t varcompref[2];
130 ThreadFrame refs[8], next_refs[8]; // VP9 keeps 8 reference slots
132 #define REF_FRAME_MVPAIR 1
133 #define REF_FRAME_SEGMAP 2
140 uint8_t mblim_lut[64];
148 int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
150 #define MAX_SEGMENT 8
154 uint8_t absolute_vals;
156 uint8_t ignore_refmap;
161 uint8_t skip_enabled;
// tiling layout (sub-struct in the full source)
170 unsigned log2_tile_cols, log2_tile_rows;
171 unsigned tile_cols, tile_rows;
172 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
174 unsigned sb_cols, sb_rows, rows, cols;
// probability tables: model (3-entry) vs. full (11-entry) coefficient forms
177 uint8_t coef[4][2][2][6][6][3];
181 uint8_t coef[4][2][2][6][6][11];
// symbol counts used for backward probability adaptation
186 unsigned y_mode[4][10];
187 unsigned uv_mode[10][10];
188 unsigned filter[4][3];
189 unsigned mv_mode[7][4];
190 unsigned intra[4][2];
192 unsigned single_ref[5][2][2];
193 unsigned comp_ref[5][2];
194 unsigned tx32p[2][4];
195 unsigned tx16p[2][3];
198 unsigned mv_joint[4];
201 unsigned classes[11];
203 unsigned bits[10][2];
204 unsigned class0_fp[2][4];
206 unsigned class0_hp[2];
209 unsigned partition[4][4][4];
210 unsigned coef[4][2][2][6][6][3];
211 unsigned eob[4][2][2][6][6][2];
213 enum TxfmMode txfmmode;
214 enum CompPredMode comppredmode;
216 // contextual (left/above) cache
217 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
218 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
219 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
220 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
221 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
222 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
223 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
224 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
225 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
226 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
227 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
228 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// the above_* pointers all alias one allocation made in update_size()
229 uint8_t *above_partition_ctx;
230 uint8_t *above_mode_ctx;
231 // FIXME maybe merge some of the below in a flags field?
232 uint8_t *above_y_nnz_ctx;
233 uint8_t *above_uv_nnz_ctx[2];
234 uint8_t *above_skip_ctx; // 1bit
235 uint8_t *above_txfm_ctx; // 2bit
236 uint8_t *above_segpred_ctx; // 1bit
237 uint8_t *above_intra_ctx; // 1bit
238 uint8_t *above_comp_ctx; // 1bit
239 uint8_t *above_ref_ctx; // 2bit
240 uint8_t *above_filter_ctx;
241 VP56mv (*above_mv_ctx)[2];
244 uint8_t *intra_pred_data[3];
245 struct VP9Filter *lflvl;
246 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
248 // block reconstruction intermediates
249 int block_alloc_using_2pass;
250 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
251 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
252 struct { int x, y; } min_mv, max_mv; // mv clamping bounds for the current block
253 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
254 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
255 uint16_t mvscale[3][2]; // 14-bit fixed-point ref->cur scaling, 0 = unscaled
256 uint8_t mvstep[3][2];
// Block width/height per block size, in two granularities (presumably
// 8x8-block units and 4x4-sub-block units — confirm against N_BS_SIZES users).
// NOTE(review): incomplete excerpt — the inner "{" row openers and the closing
// "};" are missing from the visible lines.
259 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
261 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
262 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
264 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
265 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
269 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
271 VP9Context *s = ctx->priv_data;
274 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
276 sz = 64 * s->sb_cols * s->sb_rows;
277 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
278 ff_thread_release_buffer(ctx, &f->tf);
279 return AVERROR(ENOMEM);
282 f->segmentation_map = f->extradata->data;
283 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
290 ff_thread_release_buffer(ctx, &f->tf);
291 av_buffer_unref(&f->extradata);
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
298 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
300 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301 vp9_unref_frame(ctx, dst);
302 return AVERROR(ENOMEM);
305 dst->segmentation_map = src->segmentation_map;
307 dst->uses_2pass = src->uses_2pass;
312 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
314 VP9Context *s = ctx->priv_data;
316 int bytesperpixel = s->bytesperpixel;
318 av_assert0(w > 0 && h > 0);
320 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
326 s->sb_cols = (w + 63) >> 6;
327 s->sb_rows = (h + 63) >> 6;
328 s->cols = (w + 7) >> 3;
329 s->rows = (h + 7) >> 3;
331 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
332 av_freep(&s->intra_pred_data[0]);
333 // FIXME we slightly over-allocate here for subsampled chroma, but a little
334 // bit of padding shouldn't affect performance...
335 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
336 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
338 return AVERROR(ENOMEM);
339 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
340 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
341 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
342 assign(s->above_y_nnz_ctx, uint8_t *, 16);
343 assign(s->above_mode_ctx, uint8_t *, 16);
344 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
345 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
346 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
347 assign(s->above_partition_ctx, uint8_t *, 8);
348 assign(s->above_skip_ctx, uint8_t *, 8);
349 assign(s->above_txfm_ctx, uint8_t *, 8);
350 assign(s->above_segpred_ctx, uint8_t *, 8);
351 assign(s->above_intra_ctx, uint8_t *, 8);
352 assign(s->above_comp_ctx, uint8_t *, 8);
353 assign(s->above_ref_ctx, uint8_t *, 8);
354 assign(s->above_filter_ctx, uint8_t *, 8);
355 assign(s->lflvl, struct VP9Filter *, 1);
358 // these will be re-allocated a little later
359 av_freep(&s->b_base);
360 av_freep(&s->block_base);
362 if (s->bpp != s->last_bpp) {
363 ff_vp9dsp_init(&s->dsp, s->bpp);
364 ff_videodsp_init(&s->vdsp, s->bpp);
365 s->last_bpp = s->bpp;
371 static int update_block_buffers(AVCodecContext *ctx)
373 VP9Context *s = ctx->priv_data;
374 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
376 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
380 av_free(s->block_base);
381 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
382 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
383 if (s->frames[CUR_FRAME].uses_2pass) {
384 int sbs = s->sb_cols * s->sb_rows;
386 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
387 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
388 16 * 16 + 2 * chroma_eobs) * sbs);
389 if (!s->b_base || !s->block_base)
390 return AVERROR(ENOMEM);
391 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
392 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
393 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
394 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
395 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
397 s->b_base = av_malloc(sizeof(VP9Block));
398 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
399 16 * 16 + 2 * chroma_eobs);
400 if (!s->b_base || !s->block_base)
401 return AVERROR(ENOMEM);
402 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
403 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
404 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
405 s->uveob_base[0] = s->eob_base + 16 * 16;
406 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
408 s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
413 // for some reason the sign bit is at the end, not the start, of a bit sequence
414 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
416 int v = get_bits(gb, n);
417 return get_bits1(gb) ? -v : v;
420 static av_always_inline int inv_recenter_nonneg(int v, int m)
422 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
425 // differential forward probability updates
426 static int update_prob(VP56RangeCoder *c, int p)
428 static const int inv_map_table[254] = {
429 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
430 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
431 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
432 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
433 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
434 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
435 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
436 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
437 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
438 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
439 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
440 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
441 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
442 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
443 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
444 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
445 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
446 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
451 /* This code is trying to do a differential probability update. For a
452 * current probability A in the range [1, 255], the difference to a new
453 * probability of any value can be expressed differentially as 1-A,255-A
454 * where some part of this (absolute range) exists both in positive as
455 * well as the negative part, whereas another part only exists in one
456 * half. We're trying to code this shared part differentially, i.e.
457 * times two where the value of the lowest bit specifies the sign, and
458 * the single part is then coded on top of this. This absolute difference
459 * then again has a value of [0,254], but a bigger value in this range
460 * indicates that we're further away from the original value A, so we
461 * can code this as a VLC code, since higher values are increasingly
462 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
463 * updates vs. the 'fine, exact' updates further down the range, which
464 * adds one extra dimension to this differential update model. */
466 if (!vp8_rac_get(c)) {
467 d = vp8_rac_get_uint(c, 4) + 0;
468 } else if (!vp8_rac_get(c)) {
469 d = vp8_rac_get_uint(c, 4) + 16;
470 } else if (!vp8_rac_get(c)) {
471 d = vp8_rac_get_uint(c, 5) + 32;
473 d = vp8_rac_get_uint(c, 7);
475 d = (d << 1) - 65 + vp8_rac_get(c);
479 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
480 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
483 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
485 static const enum AVColorSpace colorspaces[8] = {
486 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
487 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
489 VP9Context *s = ctx->priv_data;
490 enum AVPixelFormat res;
491 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
494 s->bpp = 8 + bits * 2;
495 s->bytesperpixel = (7 + s->bpp) >> 3;
496 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
497 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
498 static const enum AVPixelFormat pix_fmt_rgb[3] = {
499 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
501 if (ctx->profile & 1) {
502 s->ss_h = s->ss_v = 1;
503 res = pix_fmt_rgb[bits];
504 ctx->color_range = AVCOL_RANGE_JPEG;
506 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
508 return AVERROR_INVALIDDATA;
511 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
512 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
513 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
514 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
515 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
516 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
517 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
519 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
520 if (ctx->profile & 1) {
521 s->ss_h = get_bits1(&s->gb);
522 s->ss_v = get_bits1(&s->gb);
523 if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
524 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
526 return AVERROR_INVALIDDATA;
527 } else if (get_bits1(&s->gb)) {
528 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
530 return AVERROR_INVALIDDATA;
533 s->ss_h = s->ss_v = 1;
534 res = pix_fmt_for_ss[bits][1][1];
// Parse the VP9 uncompressed frame header (plain bits) followed by the
// compressed header (range-coded probability updates), setting up all
// per-frame decoder state.
// Returns the total header size in bytes on success, negative AVERROR on
// bitstream errors.
// NOTE(review): this listing is heavily incomplete — roughly 150 original
// lines (opening/closing braces, else branches, several statements) are
// missing between the visible lines below; do not treat the excerpt as
// compilable or as the full control flow.
541 static int decode_frame_header(AVCodecContext *ctx,
542 const uint8_t *data, int size, int *ref)
544 VP9Context *s = ctx->priv_data;
545 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
546 enum AVPixelFormat fmt = ctx->pix_fmt;
548 const uint8_t *data2;
// --- general frame header: marker, profile, show-existing-frame ---
551 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
552 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
555 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
556 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
557 return AVERROR_INVALIDDATA;
559 ctx->profile = get_bits1(&s->gb);
560 ctx->profile |= get_bits1(&s->gb) << 1;
561 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
562 if (ctx->profile > 3) {
563 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
564 return AVERROR_INVALIDDATA;
566 if (get_bits1(&s->gb)) {
567 *ref = get_bits(&s->gb, 3);
570 s->last_keyframe = s->keyframe;
571 s->keyframe = !get_bits1(&s->gb);
572 last_invisible = s->invisible;
573 s->invisible = !get_bits1(&s->gb);
574 s->errorres = get_bits1(&s->gb);
575 s->use_last_frame_mvs = !s->errorres && !last_invisible;
// --- keyframe path: sync code, colorspace, frame size ---
577 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
578 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
579 return AVERROR_INVALIDDATA;
581 if ((fmt = read_colorspace_details(ctx)) < 0)
583 // for profile 1, here follows the subsampling bits
584 s->refreshrefmask = 0xff;
585 w = get_bits(&s->gb, 16) + 1;
586 h = get_bits(&s->gb, 16) + 1;
587 if (get_bits1(&s->gb)) // display size
588 skip_bits(&s->gb, 32);
// --- non-keyframe path: intraonly or inter ---
590 s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
591 s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
// intraonly sub-path (visible lines below include its sync code and the
// profile-0 colorspace defaults):
593 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
594 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
595 return AVERROR_INVALIDDATA;
597 if (ctx->profile == 1) {
598 if ((fmt = read_colorspace_details(ctx)) < 0)
601 s->ss_h = s->ss_v = 1;
604 s->bytesperpixel = 1;
605 fmt = AV_PIX_FMT_YUV420P;
606 ctx->colorspace = AVCOL_SPC_BT470BG;
607 ctx->color_range = AVCOL_RANGE_JPEG;
609 s->refreshrefmask = get_bits(&s->gb, 8);
610 w = get_bits(&s->gb, 16) + 1;
611 h = get_bits(&s->gb, 16) + 1;
612 if (get_bits1(&s->gb)) // display size
613 skip_bits(&s->gb, 32);
// inter sub-path: reference indices, sign biases, size-from-ref
615 s->refreshrefmask = get_bits(&s->gb, 8);
616 s->refidx[0] = get_bits(&s->gb, 3);
617 s->signbias[0] = get_bits1(&s->gb);
618 s->refidx[1] = get_bits(&s->gb, 3);
619 s->signbias[1] = get_bits1(&s->gb);
620 s->refidx[2] = get_bits(&s->gb, 3);
621 s->signbias[2] = get_bits1(&s->gb);
622 if (!s->refs[s->refidx[0]].f->data[0] ||
623 !s->refs[s->refidx[1]].f->data[0] ||
624 !s->refs[s->refidx[2]].f->data[0]) {
625 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
626 return AVERROR_INVALIDDATA;
628 if (get_bits1(&s->gb)) {
629 w = s->refs[s->refidx[0]].f->width;
630 h = s->refs[s->refidx[0]].f->height;
631 } else if (get_bits1(&s->gb)) {
632 w = s->refs[s->refidx[1]].f->width;
633 h = s->refs[s->refidx[1]].f->height;
634 } else if (get_bits1(&s->gb)) {
635 w = s->refs[s->refidx[2]].f->width;
636 h = s->refs[s->refidx[2]].f->height;
638 w = get_bits(&s->gb, 16) + 1;
639 h = get_bits(&s->gb, 16) + 1;
641 // Note that in this code, "CUR_FRAME" is actually before we
642 // have formally allocated a frame, and thus actually represents
644 s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
645 s->frames[CUR_FRAME].tf.f->height == h;
646 if (get_bits1(&s->gb)) // display size
647 skip_bits(&s->gb, 32);
648 s->highprecisionmvs = get_bits1(&s->gb);
649 s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction only possible when the refs disagree in sign bias
651 s->allowcompinter = s->signbias[0] != s->signbias[1] ||
652 s->signbias[0] != s->signbias[2];
653 if (s->allowcompinter) {
654 if (s->signbias[0] == s->signbias[1]) {
656 s->varcompref[0] = 0;
657 s->varcompref[1] = 1;
658 } else if (s->signbias[0] == s->signbias[2]) {
660 s->varcompref[0] = 0;
661 s->varcompref[1] = 2;
664 s->varcompref[0] = 1;
665 s->varcompref[1] = 2;
// per-reference mv scaling factors (14-bit fixed point; 0 = same size)
669 for (i = 0; i < 3; i++) {
670 AVFrame *ref = s->refs[s->refidx[i]].f;
671 int refw = ref->width, refh = ref->height;
673 if (ref->format != fmt) {
674 av_log(ctx, AV_LOG_ERROR,
675 "Ref pixfmt (%s) did not match current frame (%s)",
676 av_get_pix_fmt_name(ref->format),
677 av_get_pix_fmt_name(fmt));
678 return AVERROR_INVALIDDATA;
679 } else if (refw == w && refh == h) {
680 s->mvscale[i][0] = s->mvscale[i][1] = 0;
682 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
683 av_log(ctx, AV_LOG_ERROR,
684 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
686 return AVERROR_INVALIDDATA;
688 s->mvscale[i][0] = (refw << 14) / w;
689 s->mvscale[i][1] = (refh << 14) / h;
690 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
691 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
696 s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
697 s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
698 s->framectxid = c = get_bits(&s->gb, 2);
700 /* loopfilter header data */
701 if (s->keyframe || s->errorres || s->intraonly) {
702 // reset loopfilter defaults
703 s->lf_delta.ref[0] = 1;
704 s->lf_delta.ref[1] = 0;
705 s->lf_delta.ref[2] = -1;
706 s->lf_delta.ref[3] = -1;
707 s->lf_delta.mode[0] = 0;
708 s->lf_delta.mode[1] = 0;
710 s->filter.level = get_bits(&s->gb, 6);
711 sharp = get_bits(&s->gb, 3);
712 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
713 // the old cache values since they are still valid
714 if (s->filter.sharpness != sharp)
715 memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
716 s->filter.sharpness = sharp;
717 if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
718 if (get_bits1(&s->gb)) {
719 for (i = 0; i < 4; i++)
720 if (get_bits1(&s->gb))
721 s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
722 for (i = 0; i < 2; i++)
723 if (get_bits1(&s->gb))
724 s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
728 /* quantization header data */
729 s->yac_qi = get_bits(&s->gb, 8);
730 s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
731 s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
732 s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
733 s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
734 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
736 /* segmentation header info */
737 s->segmentation.ignore_refmap = 0;
738 if ((s->segmentation.enabled = get_bits1(&s->gb))) {
739 if ((s->segmentation.update_map = get_bits1(&s->gb))) {
740 for (i = 0; i < 7; i++)
741 s->prob.seg[i] = get_bits1(&s->gb) ?
742 get_bits(&s->gb, 8) : 255;
743 if ((s->segmentation.temporal = get_bits1(&s->gb))) {
744 for (i = 0; i < 3; i++)
745 s->prob.segpred[i] = get_bits1(&s->gb) ?
746 get_bits(&s->gb, 8) : 255;
// a segmap carried over across a size change cannot be trusted; flag it
// instead of erroring out (see ignore_refmap)
749 if ((!s->segmentation.update_map || s->segmentation.temporal) &&
750 (w != s->frames[CUR_FRAME].tf.f->width ||
751 h != s->frames[CUR_FRAME].tf.f->height)) {
752 av_log(ctx, AV_LOG_WARNING,
753 "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
754 s->segmentation.temporal, s->segmentation.update_map);
755 s->segmentation.ignore_refmap = 1;
756 //return AVERROR_INVALIDDATA;
759 if (get_bits1(&s->gb)) {
760 s->segmentation.absolute_vals = get_bits1(&s->gb);
761 for (i = 0; i < 8; i++) {
762 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
763 s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
764 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
765 s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
766 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
767 s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
768 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
772 s->segmentation.feat[0].q_enabled = 0;
773 s->segmentation.feat[0].lf_enabled = 0;
774 s->segmentation.feat[0].skip_enabled = 0;
775 s->segmentation.feat[0].ref_enabled = 0;
778 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
779 for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
780 int qyac, qydc, quvac, quvdc, lflvl, sh;
782 if (s->segmentation.feat[i].q_enabled) {
783 if (s->segmentation.absolute_vals)
784 qyac = s->segmentation.feat[i].q_val;
786 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
790 qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
791 quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
792 quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
793 qyac = av_clip_uintp2(qyac, 8);
795 s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
796 s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
797 s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
798 s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
// per-segment loop-filter levels, with optional ref/mode deltas
800 sh = s->filter.level >= 32;
801 if (s->segmentation.feat[i].lf_enabled) {
802 if (s->segmentation.absolute_vals)
803 lflvl = s->segmentation.feat[i].lf_val;
805 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
807 lflvl = s->filter.level;
809 if (s->lf_delta.enabled) {
810 s->segmentation.feat[i].lflvl[0][0] =
811 s->segmentation.feat[i].lflvl[0][1] =
812 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
813 for (j = 1; j < 4; j++) {
814 s->segmentation.feat[i].lflvl[j][0] =
815 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
816 s->lf_delta.mode[0]) * (1 << sh)), 6);
817 s->segmentation.feat[i].lflvl[j][1] =
818 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
819 s->lf_delta.mode[1]) * (1 << sh)), 6);
822 memset(s->segmentation.feat[i].lflvl, lflvl,
823 sizeof(s->segmentation.feat[i].lflvl));
// --- size-dependent (re)allocation + tiling setup ---
828 if ((res = update_size(ctx, w, h, fmt)) < 0) {
829 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
832 for (s->tiling.log2_tile_cols = 0;
833 (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
834 s->tiling.log2_tile_cols++) ;
835 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
836 max = FFMAX(0, max - 1);
837 while (max > s->tiling.log2_tile_cols) {
838 if (get_bits1(&s->gb))
839 s->tiling.log2_tile_cols++;
843 s->tiling.log2_tile_rows = decode012(&s->gb);
844 s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
845 if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
846 s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
847 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
848 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
850 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
851 return AVERROR(ENOMEM);
// reset all four probability contexts to the defaults on key/error frames
855 if (s->keyframe || s->errorres || s->intraonly) {
856 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
857 s->prob_ctx[3].p = vp9_default_probs;
858 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
859 sizeof(vp9_default_coef_probs));
860 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
861 sizeof(vp9_default_coef_probs));
862 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
863 sizeof(vp9_default_coef_probs));
864 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
865 sizeof(vp9_default_coef_probs));
// --- compressed header (range-coded) starts here ---
868 // next 16 bits is size of the rest of the header (arith-coded)
869 size2 = get_bits(&s->gb, 16);
870 data2 = align_get_bits(&s->gb);
871 if (size2 > size - (data2 - data)) {
872 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
873 return AVERROR_INVALIDDATA;
875 ff_vp56_init_range_decoder(&s->c, data2, size2);
876 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
877 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
878 return AVERROR_INVALIDDATA;
881 if (s->keyframe || s->intraonly) {
882 memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
884 memset(&s->counts, 0, sizeof(s->counts));
886 // FIXME is it faster to not copy here, but do it down in the fw updates
887 // as explicit copies if the fw update is missing (and skip the copy upon
889 s->prob.p = s->prob_ctx[c].p;
// transform mode + forward updates of tx split probabilities
893 s->txfmmode = TX_4X4;
895 s->txfmmode = vp8_rac_get_uint(&s->c, 2);
896 if (s->txfmmode == 3)
897 s->txfmmode += vp8_rac_get(&s->c);
899 if (s->txfmmode == TX_SWITCHABLE) {
900 for (i = 0; i < 2; i++)
901 if (vp56_rac_get_prob_branchy(&s->c, 252))
902 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
903 for (i = 0; i < 2; i++)
904 for (j = 0; j < 2; j++)
905 if (vp56_rac_get_prob_branchy(&s->c, 252))
906 s->prob.p.tx16p[i][j] =
907 update_prob(&s->c, s->prob.p.tx16p[i][j]);
908 for (i = 0; i < 2; i++)
909 for (j = 0; j < 3; j++)
910 if (vp56_rac_get_prob_branchy(&s->c, 252))
911 s->prob.p.tx32p[i][j] =
912 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// coefficient probability updates, per tx size
917 for (i = 0; i < 4; i++) {
918 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
919 if (vp8_rac_get(&s->c)) {
920 for (j = 0; j < 2; j++)
921 for (k = 0; k < 2; k++)
922 for (l = 0; l < 6; l++)
923 for (m = 0; m < 6; m++) {
924 uint8_t *p = s->prob.coef[i][j][k][l][m];
925 uint8_t *r = ref[j][k][l][m];
926 if (m >= 3 && l == 0) // dc only has 3 pt
928 for (n = 0; n < 3; n++) {
929 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
930 p[n] = update_prob(&s->c, r[n]);
938 for (j = 0; j < 2; j++)
939 for (k = 0; k < 2; k++)
940 for (l = 0; l < 6; l++)
941 for (m = 0; m < 6; m++) {
942 uint8_t *p = s->prob.coef[i][j][k][l][m];
943 uint8_t *r = ref[j][k][l][m];
944 if (m > 3 && l == 0) // dc only has 3 pt
950 if (s->txfmmode == i)
// skip/inter-mode/filter/intra/comp probability updates
955 for (i = 0; i < 3; i++)
956 if (vp56_rac_get_prob_branchy(&s->c, 252))
957 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
958 if (!s->keyframe && !s->intraonly) {
959 for (i = 0; i < 7; i++)
960 for (j = 0; j < 3; j++)
961 if (vp56_rac_get_prob_branchy(&s->c, 252))
962 s->prob.p.mv_mode[i][j] =
963 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
965 if (s->filtermode == FILTER_SWITCHABLE)
966 for (i = 0; i < 4; i++)
967 for (j = 0; j < 2; j++)
968 if (vp56_rac_get_prob_branchy(&s->c, 252))
969 s->prob.p.filter[i][j] =
970 update_prob(&s->c, s->prob.p.filter[i][j]);
972 for (i = 0; i < 4; i++)
973 if (vp56_rac_get_prob_branchy(&s->c, 252))
974 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
976 if (s->allowcompinter) {
977 s->comppredmode = vp8_rac_get(&s->c);
979 s->comppredmode += vp8_rac_get(&s->c);
980 if (s->comppredmode == PRED_SWITCHABLE)
981 for (i = 0; i < 5; i++)
982 if (vp56_rac_get_prob_branchy(&s->c, 252))
984 update_prob(&s->c, s->prob.p.comp[i]);
986 s->comppredmode = PRED_SINGLEREF;
989 if (s->comppredmode != PRED_COMPREF) {
990 for (i = 0; i < 5; i++) {
991 if (vp56_rac_get_prob_branchy(&s->c, 252))
992 s->prob.p.single_ref[i][0] =
993 update_prob(&s->c, s->prob.p.single_ref[i][0]);
994 if (vp56_rac_get_prob_branchy(&s->c, 252))
995 s->prob.p.single_ref[i][1] =
996 update_prob(&s->c, s->prob.p.single_ref[i][1]);
1000 if (s->comppredmode != PRED_SINGLEREF) {
1001 for (i = 0; i < 5; i++)
1002 if (vp56_rac_get_prob_branchy(&s->c, 252))
1003 s->prob.p.comp_ref[i] =
1004 update_prob(&s->c, s->prob.p.comp_ref[i]);
1007 for (i = 0; i < 4; i++)
1008 for (j = 0; j < 9; j++)
1009 if (vp56_rac_get_prob_branchy(&s->c, 252))
1010 s->prob.p.y_mode[i][j] =
1011 update_prob(&s->c, s->prob.p.y_mode[i][j]);
1013 for (i = 0; i < 4; i++)
1014 for (j = 0; j < 4; j++)
1015 for (k = 0; k < 3; k++)
1016 if (vp56_rac_get_prob_branchy(&s->c, 252))
1017 s->prob.p.partition[3 - i][j][k] =
1018 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1020 // mv fields don't use the update_prob subexp model for some reason
1021 for (i = 0; i < 3; i++)
1022 if (vp56_rac_get_prob_branchy(&s->c, 252))
1023 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1025 for (i = 0; i < 2; i++) {
1026 if (vp56_rac_get_prob_branchy(&s->c, 252))
1027 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1029 for (j = 0; j < 10; j++)
1030 if (vp56_rac_get_prob_branchy(&s->c, 252))
1031 s->prob.p.mv_comp[i].classes[j] =
1032 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1034 if (vp56_rac_get_prob_branchy(&s->c, 252))
1035 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1037 for (j = 0; j < 10; j++)
1038 if (vp56_rac_get_prob_branchy(&s->c, 252))
1039 s->prob.p.mv_comp[i].bits[j] =
1040 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1043 for (i = 0; i < 2; i++) {
1044 for (j = 0; j < 2; j++)
1045 for (k = 0; k < 3; k++)
1046 if (vp56_rac_get_prob_branchy(&s->c, 252))
1047 s->prob.p.mv_comp[i].class0_fp[j][k] =
1048 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1050 for (j = 0; j < 3; j++)
1051 if (vp56_rac_get_prob_branchy(&s->c, 252))
1052 s->prob.p.mv_comp[i].fp[j] =
1053 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1056 if (s->highprecisionmvs) {
1057 for (i = 0; i < 2; i++) {
1058 if (vp56_rac_get_prob_branchy(&s->c, 252))
1059 s->prob.p.mv_comp[i].class0_hp =
1060 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1062 if (vp56_rac_get_prob_branchy(&s->c, 252))
1063 s->prob.p.mv_comp[i].hp =
1064 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total header length: uncompressed part + compressed part
1069 return (data2 - data) + size2;
1072 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1075 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1076 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Derive the motion-vector predictor *pmv for reference list slot 'ref'.
// z selects which MV of a compound pair is predicted, idx selects the
// probability/counter set, and sb is the sub-block index (-1 for NEWMV
// whole-block prediction). Candidates are scanned in the spec-mandated
// order: current-block sub-MVs, above/left edge contexts, spatial
// neighbours with the same reference, the co-located MV in the previous
// frame, then the same sources again allowing a different reference
// (sign-flipped when the reference sign biases differ).
1079 static void find_ref_mvs(VP9Context *s,
1080 VP56mv *pmv, int ref, int z, int idx, int sb)
// Per-block-size list of up to 8 (col,row) neighbour offsets to probe,
// in decreasing priority order (straight from the VP9 specification).
1082 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1083 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1084 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1085 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1086 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1087 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1088 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1089 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1090 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1091 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1092 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1093 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1094 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1095 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1096 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1097 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1098 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1099 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1100 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1101 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1102 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1103 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1104 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1105 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1106 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1107 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1108 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1111 int row = s->row, col = s->col, row7 = s->row7;
1112 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
// Sentinel packed-MV value: 0x8000 in both components marks "no first
// candidate stored yet" in 'mem'.
1113 #define INVALID_MV 0x80008000U
1114 uint32_t mem = INVALID_MV;
// Return an already-decoded sub-block MV verbatim (no clamping) when it
// differs from the first remembered candidate; otherwise remember it.
1117 #define RETURN_DIRECT_MV(mv) \
1119 uint32_t m = AV_RN32A(&mv); \
1123 } else if (mem == INVALID_MV) { \
1125 } else if (m != mem) { \
// For sub-blocks past the first one, earlier sub-MVs of this block are
// the highest-priority candidates.
1132 if (sb == 2 || sb == 1) {
1133 RETURN_DIRECT_MV(b->mv[0][z]);
1134 } else if (sb == 3) {
1135 RETURN_DIRECT_MV(b->mv[2][z]);
1136 RETURN_DIRECT_MV(b->mv[1][z]);
1137 RETURN_DIRECT_MV(b->mv[0][z]);
// Same two-candidate logic as RETURN_DIRECT_MV, but clamps the MV to the
// valid range first (and clamps once more on the second-candidate path).
1140 #define RETURN_MV(mv) \
1145 clamp_mv(&tmp, &mv, s); \
1146 m = AV_RN32A(&tmp); \
1150 } else if (mem == INVALID_MV) { \
1152 } else if (m != mem) { \
1157 uint32_t m = AV_RN32A(&mv); \
1159 clamp_mv(pmv, &mv, s); \
1161 } else if (mem == INVALID_MV) { \
1163 } else if (m != mem) { \
1164 clamp_mv(pmv, &mv, s); \
// Above neighbour (only if a row above exists): use the cached edge MV
// context rather than the frame-wide MV array.
1171 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1172 if (mv->ref[0] == ref) {
1173 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1174 } else if (mv->ref[1] == ref) {
1175 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
// Left neighbour, bounded by the tile edge (no prediction across tiles).
1178 if (col > s->tiling.tile_col_start) {
1179 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1180 if (mv->ref[0] == ref) {
1181 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1182 } else if (mv->ref[1] == ref) {
1183 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1191 // previously coded MVs in this neighbourhood, using same reference frame
1192 for (; i < 8; i++) {
1193 int c = p[i][0] + col, r = p[i][1] + row;
1195 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1196 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1198 if (mv->ref[0] == ref) {
1199 RETURN_MV(mv->mv[0]);
1200 } else if (mv->ref[1] == ref) {
1201 RETURN_MV(mv->mv[1]);
1206 // MV at this position in previous frame, using same reference frame
1207 if (s->use_last_frame_mvs) {
1208 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
// Frame-threading: wait until the reference thread has decoded this
// superblock row before reading its MVs (skipped in 2-pass mode).
1210 if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
1211 ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1212 if (mv->ref[0] == ref) {
1213 RETURN_MV(mv->mv[0]);
1214 } else if (mv->ref[1] == ref) {
1215 RETURN_MV(mv->mv[1]);
// Like RETURN_MV, but negates the MV first when the two references have
// opposite temporal sign bias ('scale' nonzero).
1219 #define RETURN_SCALE_MV(mv, scale) \
1222 VP56mv mv_temp = { -mv.x, -mv.y }; \
1223 RETURN_MV(mv_temp); \
1229 // previously coded MVs in this neighbourhood, using different reference frame
1230 for (i = 0; i < 8; i++) {
1231 int c = p[i][0] + col, r = p[i][1] + row;
1233 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1234 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1236 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1237 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1239 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1240 // BUG - libvpx has this condition regardless of whether
1241 // we used the first ref MV and pre-scaling
1242 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1243 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1248 // MV at this position in previous frame, using different reference frame
1249 if (s->use_last_frame_mvs) {
1250 struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1252 // no need to await_progress, because we already did that above
1253 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1254 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1256 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1257 // BUG - libvpx has this condition regardless of whether
1258 // we used the first ref MV and pre-scaling
1259 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1260 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1267 #undef RETURN_SCALE_MV
// Decode one MV component (x or y) residual from the range coder.
// idx selects the per-component probability/counter set (0 = y, 1 = x);
// hp enables the high-precision (1/8-pel) bit. Returns the signed
// magnitude, biased by +1 as required by the bitstream (zero residuals
// are coded through the joint, not here). All decoded symbols are also
// accumulated in s->counts for backward probability adaptation.
1270 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1272 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
// 'c' is the MV class: class 0 covers small magnitudes with its own
// dedicated fp/hp trees; classes >= 1 code (c) magnitude bits + fp + hp.
1273 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1274 s->prob.p.mv_comp[idx].classes);
1276 s->counts.mv_comp[idx].sign[sign]++;
1277 s->counts.mv_comp[idx].classes[c]++;
// Non-class-0 path: read 'c' integer magnitude bits, LSB first.
1281 for (n = 0, m = 0; m < c; m++) {
1282 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1284 s->counts.mv_comp[idx].bits[m][bit]++;
// Fractional (1/4-pel) part via the fp tree.
1287 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1289 s->counts.mv_comp[idx].fp[bit]++;
// High-precision (1/8-pel) bit, read only when hp allows it.
1291 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1292 s->counts.mv_comp[idx].hp[bit]++;
1296 // bug in libvpx - we count for bw entropy purposes even if the
1298 s->counts.mv_comp[idx].hp[1]++;
// Class-0 path: one magnitude bit, then its own fp tree and hp bit.
1302 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1303 s->counts.mv_comp[idx].class0[n]++;
1304 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1305 s->prob.p.mv_comp[idx].class0_fp[n]);
1306 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1307 n = (n << 3) | (bit << 1);
1309 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1310 s->counts.mv_comp[idx].class0_hp[bit]++;
1314 // bug in libvpx - we count for bw entropy purposes even if the
1316 s->counts.mv_comp[idx].class0_hp[1]++;
// Apply sign; magnitude is stored biased by 1 in the bitstream.
1320 return sign ? -(n + 1) : (n + 1);
// Fill mv[0] (and mv[1] for compound prediction) for the current block:
// run MV prediction via find_ref_mvs(), then, for NEWMV, add the decoded
// residual on top of the predictor. sb is the sub-block index (-1 for the
// whole block). mode is one of the inter modes (ZEROMV/NEARESTMV/NEARMV/
// NEWMV).
1323 static void fill_mv(VP9Context *s,
1324 VP56mv *mv, int mode, int sb)
1328 if (mode == ZEROMV) {
1333 // FIXME cache this value and reuse for other subblocks
1334 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1335 mode == NEWMV ? -1 : sb);
1336 // FIXME maybe move this code into find_ref_mvs()
// Note: assignment inside the condition — hp is 1 only when
// high-precision MVs are enabled AND the predictor is small enough
// (|x|,|y| < 64); otherwise the branch body rounds the predictor.
1337 if ((mode == NEWMV || sb == -1) &&
1338 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1352 if (mode == NEWMV) {
// The joint symbol says which components carry a nonzero residual.
1353 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1354 s->prob.p.mv_joint);
1356 s->counts.mv_joint[j]++;
1357 if (j >= MV_JOINT_V)
1358 mv[0].y += read_mv_component(s, 0, hp);
1360 mv[0].x += read_mv_component(s, 1, hp);
// Second reference of a compound pair: same procedure with ref[1].
1364 // FIXME cache this value and reuse for other subblocks
1365 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1366 mode == NEWMV ? -1 : sb);
1367 if ((mode == NEWMV || sb == -1) &&
1368 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1382 if (mode == NEWMV) {
1383 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1384 s->prob.p.mv_joint);
1386 s->counts.mv_joint[j]++;
1387 if (j >= MV_JOINT_V)
1388 mv[1].y += read_mv_component(s, 0, hp);
1390 mv[1].x += read_mv_component(s, 1, hp);
// Fill a w x h byte rectangle at 'ptr' (row pitch 'stride') with value v.
// Dispatches on w so each row is written with the widest aligned store
// available (16/32/64-bit replicated patterns of v).
1396 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1397 ptrdiff_t stride, int v)
1407 int v16 = v * 0x0101;
1415 uint32_t v32 = v * 0x01010101;
1424 uint64_t v64 = v * 0x0101010101010101ULL;
// Fallback path for targets without fast 64-bit writes: two 32-bit
// stores per 8-byte row.
1430 uint32_t v32 = v * 0x01010101;
1433 AV_WN32A(ptr + 4, v32);
// Decode all per-block mode information for the current block (s->row,
// s->col): segment id, skip flag, intra/inter decision, transform size,
// intra prediction modes OR (for inter) reference frames, interpolation
// filter, inter modes and motion vectors. Afterwards, propagate the
// decoded values into the above/left context arrays and into the
// frame-wide segmentation/MV storage used by later blocks and frames.
1442 static void decode_mode(AVCodecContext *ctx)
// Partition contexts written into above/left ctx bytes per block size.
1444 static const uint8_t left_ctx[N_BS_SIZES] = {
1445 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1447 static const uint8_t above_ctx[N_BS_SIZES] = {
1448 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
// Largest transform size allowed for each block size.
1450 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1451 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1452 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1454 VP9Context *s = ctx->priv_data;
1456 int row = s->row, col = s->col, row7 = s->row7;
1457 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
// w4/h4 clip the block's 8x8-unit extent to the frame boundary.
1458 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1459 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1460 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1461 int vref, filter_id;
// --- segment id ---
1463 if (!s->segmentation.enabled) {
1465 } else if (s->keyframe || s->intraonly) {
1466 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1467 } else if (!s->segmentation.update_map ||
1468 (s->segmentation.temporal &&
1469 vp56_rac_get_prob_branchy(&s->c,
1470 s->prob.segpred[s->above_segpred_ctx[col] +
1471 s->left_segpred_ctx[row7]]))) {
// Predict the segment id from the reference frame's segmentation map
// (minimum over the covered area), unless error-resilience disables it.
1472 if (!s->errorres && !s->segmentation.ignore_refmap) {
1474 uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
// Frame-threading: make sure the reference's segmap rows are decoded.
1476 if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
1477 ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1478 for (y = 0; y < h4; y++) {
1479 int idx_base = (y + row) * 8 * s->sb_cols + col;
1480 for (x = 0; x < w4; x++)
1481 pred = FFMIN(pred, refsegmap[idx_base + x]);
1483 av_assert1(pred < 8);
1489 memset(&s->above_segpred_ctx[col], 1, w4);
1490 memset(&s->left_segpred_ctx[row7], 1, h4);
1492 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1495 memset(&s->above_segpred_ctx[col], 0, w4);
1496 memset(&s->left_segpred_ctx[row7], 0, h4);
// Store the decoded segment id into the current frame's map.
1498 if (s->segmentation.enabled &&
1499 (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1500 setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1501 bw4, bh4, 8 * s->sb_cols, b->seg_id);
// --- skip flag (segment feature may force it) ---
1504 b->skip = s->segmentation.enabled &&
1505 s->segmentation.feat[b->seg_id].skip_enabled;
1507 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1508 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1509 s->counts.skip[c][b->skip]++;
// --- intra/inter decision ---
1512 if (s->keyframe || s->intraonly) {
1514 } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1515 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1519 if (have_a && have_l) {
1520 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1523 c = have_a ? 2 * s->above_intra_ctx[col] :
1524 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1526 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1527 s->counts.intra[c][bit]++;
// --- transform size (only coded when TX_SWITCHABLE and not skipped) ---
1531 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1535 c = (s->above_skip_ctx[col] ? max_tx :
1536 s->above_txfm_ctx[col]) +
1537 (s->left_skip_ctx[row7] ? max_tx :
1538 s->left_txfm_ctx[row7]) > max_tx;
1540 c = s->above_skip_ctx[col] ? 1 :
1541 (s->above_txfm_ctx[col] * 2 > max_tx);
1543 } else if (have_l) {
1544 c = s->left_skip_ctx[row7] ? 1 :
1545 (s->left_txfm_ctx[row7] * 2 > max_tx);
// Unary-coded tx size, capped by max_tx for this block size.
1551 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1553 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1555 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1557 s->counts.tx32p[c][b->tx]++;
1560 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1562 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1563 s->counts.tx16p[c][b->tx]++;
1566 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1567 s->counts.tx8p[c][b->tx]++;
1574 b->tx = FFMIN(max_tx, s->txfmmode);
// --- keyframe/intraonly: intra modes from the default KF probabilities,
// contexted on the above/left mode bytes (a/l) ---
1577 if (s->keyframe || s->intraonly) {
1578 uint8_t *a = &s->above_mode_ctx[col * 2];
1579 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1582 if (b->bs > BS_8x8) {
1583 // FIXME the memory storage intermediates here aren't really
1584 // necessary, they're just there to make the code slightly
1586 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1587 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1588 if (b->bs != BS_8x4) {
1589 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1590 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1591 l[0] = a[1] = b->mode[1];
1593 l[0] = a[1] = b->mode[1] = b->mode[0];
1595 if (b->bs != BS_4x8) {
1596 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1597 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1598 if (b->bs != BS_8x4) {
1599 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1600 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1601 l[1] = a[1] = b->mode[3];
1603 l[1] = a[1] = b->mode[3] = b->mode[2];
1606 b->mode[2] = b->mode[0];
1607 l[1] = a[1] = b->mode[3] = b->mode[1];
1610 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1611 vp9_default_kf_ymode_probs[*a][*l]);
1612 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1613 // FIXME this can probably be optimized
1614 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1615 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1617 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1618 vp9_default_kf_uvmode_probs[b->mode[3]]);
// --- inter-frame intra block: adaptive y/uv mode probabilities ---
1619 } else if (b->intra) {
1621 if (b->bs > BS_8x8) {
1622 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 s->prob.p.y_mode[0]);
1624 s->counts.y_mode[0][b->mode[0]]++;
1625 if (b->bs != BS_8x4) {
1626 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1627 s->prob.p.y_mode[0]);
1628 s->counts.y_mode[0][b->mode[1]]++;
1630 b->mode[1] = b->mode[0];
1632 if (b->bs != BS_4x8) {
1633 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1634 s->prob.p.y_mode[0]);
1635 s->counts.y_mode[0][b->mode[2]]++;
1636 if (b->bs != BS_8x4) {
1637 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1638 s->prob.p.y_mode[0]);
1639 s->counts.y_mode[0][b->mode[3]]++;
1641 b->mode[3] = b->mode[2];
1644 b->mode[2] = b->mode[0];
1645 b->mode[3] = b->mode[1];
1648 static const uint8_t size_group[10] = {
1649 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1651 int sz = size_group[b->bs];
1653 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1654 s->prob.p.y_mode[sz]);
1655 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1656 s->counts.y_mode[sz][b->mode[3]]++;
1658 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1659 s->prob.p.uv_mode[b->mode[3]]);
1660 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
// --- inter block: references, filter, modes, MVs ---
// Context LUT mapping (above_mode, left_mode) to an inter-mode context.
1662 static const uint8_t inter_mode_ctx_lut[14][14] = {
1663 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1664 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1665 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1666 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1667 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1668 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1669 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1670 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1671 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1672 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1673 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1674 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1675 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1676 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
// Segment feature may pin the reference frame directly.
1679 if (s->segmentation.feat[b->seg_id].ref_enabled) {
1680 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1682 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1684 // read comp_pred flag
1685 if (s->comppredmode != PRED_SWITCHABLE) {
1686 b->comp = s->comppredmode == PRED_COMPREF;
1690 // FIXME add intra as ref=0xff (or -1) to make these easier?
1693 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1695 } else if (s->above_comp_ctx[col]) {
1696 c = 2 + (s->left_intra_ctx[row7] ||
1697 s->left_ref_ctx[row7] == s->fixcompref);
1698 } else if (s->left_comp_ctx[row7]) {
1699 c = 2 + (s->above_intra_ctx[col] ||
1700 s->above_ref_ctx[col] == s->fixcompref);
1702 c = (!s->above_intra_ctx[col] &&
1703 s->above_ref_ctx[col] == s->fixcompref) ^
1704 (!s->left_intra_ctx[row7] &&
// NOTE(review): 'row & 7' equals row7 here (row7 = s->row7), but is
// inconsistent with the surrounding code's use of row7 — style only.
1705 s->left_ref_ctx[row & 7] == s->fixcompref);
1708 c = s->above_comp_ctx[col] ? 3 :
1709 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1711 } else if (have_l) {
1712 c = s->left_comp_ctx[row7] ? 3 :
1713 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1717 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1718 s->counts.comp[c][b->comp]++;
1721 // read actual references
1722 // FIXME probably cache a few variables here to prevent repetitive
1723 // memory accesses below
1724 if (b->comp) /* two references */ {
1725 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1727 b->ref[fix_idx] = s->fixcompref;
1728 // FIXME can this codeblob be replaced by some sort of LUT?
1731 if (s->above_intra_ctx[col]) {
1732 if (s->left_intra_ctx[row7]) {
1735 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1737 } else if (s->left_intra_ctx[row7]) {
1738 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1740 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1742 if (refl == refa && refa == s->varcompref[1]) {
1744 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1745 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1746 (refl == s->fixcompref && refa == s->varcompref[0])) {
1749 c = (refa == refl) ? 3 : 1;
1751 } else if (!s->left_comp_ctx[row7]) {
1752 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1755 c = (refl == s->varcompref[1] &&
1756 refa != s->varcompref[1]) ? 2 : 4;
1758 } else if (!s->above_comp_ctx[col]) {
1759 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1762 c = (refa == s->varcompref[1] &&
1763 refl != s->varcompref[1]) ? 2 : 4;
1766 c = (refl == refa) ? 4 : 2;
1770 if (s->above_intra_ctx[col]) {
1772 } else if (s->above_comp_ctx[col]) {
1773 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1775 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1778 } else if (have_l) {
1779 if (s->left_intra_ctx[row7]) {
1781 } else if (s->left_comp_ctx[row7]) {
1782 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1784 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1789 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1790 b->ref[var_idx] = s->varcompref[bit];
1791 s->counts.comp_ref[c][bit]++;
1792 } else /* single reference */ {
// First single-ref bit: LAST vs (GOLDEN/ALTREF).
1795 if (have_a && !s->above_intra_ctx[col]) {
1796 if (have_l && !s->left_intra_ctx[row7]) {
1797 if (s->left_comp_ctx[row7]) {
1798 if (s->above_comp_ctx[col]) {
1799 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1800 !s->above_ref_ctx[col]);
1802 c = (3 * !s->above_ref_ctx[col]) +
1803 (!s->fixcompref || !s->left_ref_ctx[row7]);
1805 } else if (s->above_comp_ctx[col]) {
1806 c = (3 * !s->left_ref_ctx[row7]) +
1807 (!s->fixcompref || !s->above_ref_ctx[col]);
1809 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1811 } else if (s->above_intra_ctx[col]) {
1813 } else if (s->above_comp_ctx[col]) {
1814 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1816 c = 4 * (!s->above_ref_ctx[col]);
1818 } else if (have_l && !s->left_intra_ctx[row7]) {
// NOTE(review): this inner branch is unreachable — the condition above
// already requires !s->left_intra_ctx[row7]. Appears to mirror the
// parallel have_a block / libvpx; kept verbatim — TODO confirm intent.
1819 if (s->left_intra_ctx[row7]) {
1821 } else if (s->left_comp_ctx[row7]) {
1822 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1824 c = 4 * (!s->left_ref_ctx[row7]);
1829 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1830 s->counts.single_ref[c][0][bit]++;
// Second single-ref bit: GOLDEN vs ALTREF.
1834 // FIXME can this codeblob be replaced by some sort of LUT?
1837 if (s->left_intra_ctx[row7]) {
1838 if (s->above_intra_ctx[col]) {
1840 } else if (s->above_comp_ctx[col]) {
1841 c = 1 + 2 * (s->fixcompref == 1 ||
1842 s->above_ref_ctx[col] == 1);
1843 } else if (!s->above_ref_ctx[col]) {
1846 c = 4 * (s->above_ref_ctx[col] == 1);
1848 } else if (s->above_intra_ctx[col]) {
1849 if (s->left_intra_ctx[row7]) {
1851 } else if (s->left_comp_ctx[row7]) {
1852 c = 1 + 2 * (s->fixcompref == 1 ||
1853 s->left_ref_ctx[row7] == 1);
1854 } else if (!s->left_ref_ctx[row7]) {
1857 c = 4 * (s->left_ref_ctx[row7] == 1);
1859 } else if (s->above_comp_ctx[col]) {
1860 if (s->left_comp_ctx[row7]) {
1861 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1862 c = 3 * (s->fixcompref == 1 ||
1863 s->left_ref_ctx[row7] == 1);
1867 } else if (!s->left_ref_ctx[row7]) {
1868 c = 1 + 2 * (s->fixcompref == 1 ||
1869 s->above_ref_ctx[col] == 1);
1871 c = 3 * (s->left_ref_ctx[row7] == 1) +
1872 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1874 } else if (s->left_comp_ctx[row7]) {
1875 if (!s->above_ref_ctx[col]) {
1876 c = 1 + 2 * (s->fixcompref == 1 ||
1877 s->left_ref_ctx[row7] == 1);
1879 c = 3 * (s->above_ref_ctx[col] == 1) +
1880 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1882 } else if (!s->above_ref_ctx[col]) {
1883 if (!s->left_ref_ctx[row7]) {
1886 c = 4 * (s->left_ref_ctx[row7] == 1);
1888 } else if (!s->left_ref_ctx[row7]) {
1889 c = 4 * (s->above_ref_ctx[col] == 1);
1891 c = 2 * (s->left_ref_ctx[row7] == 1) +
1892 2 * (s->above_ref_ctx[col] == 1);
1895 if (s->above_intra_ctx[col] ||
1896 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1898 } else if (s->above_comp_ctx[col]) {
1899 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1901 c = 4 * (s->above_ref_ctx[col] == 1);
1904 } else if (have_l) {
1905 if (s->left_intra_ctx[row7] ||
1906 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1908 } else if (s->left_comp_ctx[row7]) {
1909 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1911 c = 4 * (s->left_ref_ctx[row7] == 1);
1916 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1917 s->counts.single_ref[c][1][bit]++;
1918 b->ref[0] = 1 + bit;
// --- inter modes (whole-block path for bs <= 8x8) ---
1923 if (b->bs <= BS_8x8) {
1924 if (s->segmentation.feat[b->seg_id].skip_enabled) {
1925 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1927 static const uint8_t off[10] = {
1928 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1931 // FIXME this needs to use the LUT tables from find_ref_mvs
1932 // because not all are -1,0/0,-1
1933 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1934 [s->left_mode_ctx[row7 + off[b->bs]]];
1936 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1937 s->prob.p.mv_mode[c]);
1938 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
// modes are numbered from 10 (NEARESTMV) in the enum, hence -10.
1939 s->counts.mv_mode[c][b->mode[0] - 10]++;
// --- interpolation filter ---
1943 if (s->filtermode == FILTER_SWITCHABLE) {
1946 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1947 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1948 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1949 s->left_filter_ctx[row7] : 3;
1951 c = s->above_filter_ctx[col];
1953 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1954 c = s->left_filter_ctx[row7];
1959 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1960 s->prob.p.filter[c]);
1961 s->counts.filter[c][filter_id]++;
1962 b->filter = vp9_filter_lut[filter_id];
1964 b->filter = s->filtermode;
// --- sub-8x8 blocks: one mode+MV per sub-block ---
1967 if (b->bs > BS_8x8) {
1968 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1970 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1971 s->prob.p.mv_mode[c]);
1972 s->counts.mv_mode[c][b->mode[0] - 10]++;
1973 fill_mv(s, b->mv[0], b->mode[0], 0);
1975 if (b->bs != BS_8x4) {
1976 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1977 s->prob.p.mv_mode[c]);
1978 s->counts.mv_mode[c][b->mode[1] - 10]++;
1979 fill_mv(s, b->mv[1], b->mode[1], 1);
1981 b->mode[1] = b->mode[0];
1982 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1983 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1986 if (b->bs != BS_4x8) {
1987 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1988 s->prob.p.mv_mode[c]);
1989 s->counts.mv_mode[c][b->mode[2] - 10]++;
1990 fill_mv(s, b->mv[2], b->mode[2], 2);
1992 if (b->bs != BS_8x4) {
1993 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1994 s->prob.p.mv_mode[c]);
1995 s->counts.mv_mode[c][b->mode[3] - 10]++;
1996 fill_mv(s, b->mv[3], b->mode[3], 3);
1998 b->mode[3] = b->mode[2];
1999 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
2000 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
2003 b->mode[2] = b->mode[0];
2004 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2005 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2006 b->mode[3] = b->mode[1];
2007 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2008 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
// Whole-block MV decode, replicated to all four sub-slots.
2011 fill_mv(s, b->mv[0], b->mode[0], -1);
2012 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2013 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2014 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2015 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2016 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2017 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
// Reference used for context propagation: the variable ref of a
// compound pair, or ref[0] for single prediction.
2020 vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
// SPLAT_CTX writes 'val' into n consecutive context bytes with one
// aligned store; the two variants differ only in 64-bit store support.
2024 #define SPLAT_CTX(var, val, n) \
2026 case 1: var = val; break; \
2027 case 2: AV_WN16A(&var, val * 0x0101); break; \
2028 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2029 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2031 uint64_t v64 = val * 0x0101010101010101ULL; \
2032 AV_WN64A( &var, v64); \
2033 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2038 #define SPLAT_CTX(var, val, n) \
2040 case 1: var = val; break; \
2041 case 2: AV_WN16A(&var, val * 0x0101); break; \
2042 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2044 uint32_t v32 = val * 0x01010101; \
2045 AV_WN32A( &var, v32); \
2046 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2050 uint32_t v32 = val * 0x01010101; \
2051 AV_WN32A( &var, v32); \
2052 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2053 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2054 AV_WN32A(&((uint8_t *) &var)[12], v32); \
// Propagate this block's decoded state into the above (per-column) and
// left (per-row-within-sb) context arrays for later blocks.
2060 switch (bwh_tab[1][b->bs][0]) {
2061 #define SET_CTXS(dir, off, n) \
2063 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2064 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2065 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2066 if (!s->keyframe && !s->intraonly) { \
2067 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2068 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2069 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2071 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2072 if (s->filtermode == FILTER_SWITCHABLE) { \
2073 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2078 case 1: SET_CTXS(above, col, 1); break;
2079 case 2: SET_CTXS(above, col, 2); break;
2080 case 4: SET_CTXS(above, col, 4); break;
2081 case 8: SET_CTXS(above, col, 8); break;
2083 switch (bwh_tab[1][b->bs][1]) {
2084 case 1: SET_CTXS(left, row7, 1); break;
2085 case 2: SET_CTXS(left, row7, 2); break;
2086 case 4: SET_CTXS(left, row7, 4); break;
2087 case 8: SET_CTXS(left, row7, 8); break;
// Update the above/left MV edge caches used by find_ref_mvs().
2092 if (!s->keyframe && !s->intraonly) {
2093 if (b->bs > BS_8x8) {
2094 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2096 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2097 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2098 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2099 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2100 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2101 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2102 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2103 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2105 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2107 for (n = 0; n < w4 * 2; n++) {
2108 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2109 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2111 for (n = 0; n < h4 * 2; n++) {
2112 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2113 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
// Store refs + final MVs into the frame-wide MV array (consumed by
// later blocks of this frame and by the next frame's prediction).
2119 for (y = 0; y < h4; y++) {
2120 int x, o = (row + y) * s->sb_cols * 8 + col;
2121 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2124 for (x = 0; x < w4; x++) {
2128 } else if (b->comp) {
2129 for (x = 0; x < w4; x++) {
2130 mv[x].ref[0] = b->ref[0];
2131 mv[x].ref[1] = b->ref[1];
2132 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2133 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2136 for (x = 0; x < w4; x++) {
2137 mv[x].ref[0] = b->ref[0];
2139 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2145 // FIXME merge cnt/eob arguments?
// Core coefficient decoder for one transform block. Walks up to n_coeffs
// positions in 'scan' order, decoding eob / zero / one-or-more tokens
// with context 'nnz' derived from the two already-decoded neighbours in
// 'nb', updating probability-adaptation counters (cnt/eob) as it goes.
// is_tx32x32 selects the /2 dequant of the 32x32 transform; bpp widens
// the token range and coefficient storage beyond 8 bits per pixel.
// Returns the number of decoded coefficients (the tail of the function
// is outside this excerpt). av_always_inline + constant flags lets the
// four wrappers below compile into specialized versions.
2146 static av_always_inline int
2147 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2148 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2149 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2150 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2151 const int16_t *band_counts, const int16_t *qmul)
2153 int i = 0, band = 0, band_left = band_counts[band];
2154 uint8_t *tp = p[0][nnz];
// cache[] keeps a small token magnitude per scan position for neighbour
// context derivation (see the nnz recomputation below).
2155 uint8_t cache[1024];
2160 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2161 eob[band][nnz][val]++;
2166 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2167 cnt[band][nnz][0]++;
2169 band_left = band_counts[++band];
// New context = rounded average of the two neighbour token magnitudes.
2171 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2173 if (++i == n_coeffs)
2174 break; //invalid input; blocks should end with EOB
2179 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2180 cnt[band][nnz][1]++;
2184 // fill in p[3-10] (model fill) - only once per frame for each pos
2186 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2188 cnt[band][nnz][2]++;
// Token magnitude classes: 2-4, then cat1/cat2 (5-10), then the larger
// categories below, each coded with fixed per-bit probabilities.
2189 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2190 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2191 cache[rc] = val = 2;
2193 val = 3 + vp56_rac_get_prob(c, tp[5]);
2196 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2198 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2199 val = 5 + vp56_rac_get_prob(c, 159);
2201 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2202 val += vp56_rac_get_prob(c, 145);
2206 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2207 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2208 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2209 val += (vp56_rac_get_prob(c, 148) << 1);
2210 val += vp56_rac_get_prob(c, 140);
2212 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2213 val += (vp56_rac_get_prob(c, 155) << 2);
2214 val += (vp56_rac_get_prob(c, 140) << 1);
2215 val += vp56_rac_get_prob(c, 135);
2217 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2218 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2219 val += (vp56_rac_get_prob(c, 157) << 3);
2220 val += (vp56_rac_get_prob(c, 141) << 2);
2221 val += (vp56_rac_get_prob(c, 134) << 1);
2222 val += vp56_rac_get_prob(c, 130);
// cat6: widest magnitude; high-bit-depth streams get 2 extra raw bits.
2225 if (!is8bitsperpixel) {
2227 val += vp56_rac_get_prob(c, 255) << 17;
2228 val += vp56_rac_get_prob(c, 255) << 16;
2230 val += (vp56_rac_get_prob(c, 255) << 15);
2231 val += (vp56_rac_get_prob(c, 255) << 14);
2233 val += (vp56_rac_get_prob(c, 254) << 13);
2234 val += (vp56_rac_get_prob(c, 254) << 12);
2235 val += (vp56_rac_get_prob(c, 254) << 11);
2236 val += (vp56_rac_get_prob(c, 252) << 10);
2237 val += (vp56_rac_get_prob(c, 249) << 9);
2238 val += (vp56_rac_get_prob(c, 243) << 8);
2239 val += (vp56_rac_get_prob(c, 230) << 7);
2240 val += (vp56_rac_get_prob(c, 196) << 6);
2241 val += (vp56_rac_get_prob(c, 177) << 5);
2242 val += (vp56_rac_get_prob(c, 153) << 4);
2243 val += (vp56_rac_get_prob(c, 140) << 3);
2244 val += (vp56_rac_get_prob(c, 133) << 2);
2245 val += (vp56_rac_get_prob(c, 130) << 1);
2246 val += vp56_rac_get_prob(c, 129);
// 8bpp coefficients fit int16; >8bpp streams store 32-bit values in the
// same buffer (two int16 slots per coefficient).
2250 #define STORE_COEF(c, i, v) do { \
2251 if (is8bitsperpixel) { \
2254 AV_WN32A(&c[i * 2], v); \
2258 band_left = band_counts[++band];
// 32x32 transform dequantizes with an extra /2; qmul[0] is the DC
// quantizer (i == 0), qmul[1] the AC quantizer.
2260 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2262 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2263 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2265 } while (++i < n_coeffs);
// 8bpp, non-32x32 specialization of decode_coeffs_b_generic (the
// constant flags let the always-inlined core fold away dead branches).
2270 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2271 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2272 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2273 const int16_t (*nb)[2], const int16_t *band_counts,
2274 const int16_t *qmul)
2276 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2277 nnz, scan, nb, band_counts, qmul);
// 8bpp, 32x32-transform specialization (enables the /2 dequant path).
2280 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2281 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2282 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2283 const int16_t (*nb)[2], const int16_t *band_counts,
2284 const int16_t *qmul)
2286 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2287 nnz, scan, nb, band_counts, qmul);
2290 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2291 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2292 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2293 const int16_t (*nb)[2], const int16_t *band_counts,
2294 const int16_t *qmul)
2296 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2297 nnz, scan, nb, band_counts, qmul);
2300 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2301 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2302 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2303 const int16_t (*nb)[2], const int16_t *band_counts,
2304 const int16_t *qmul)
2306 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2307 nnz, scan, nb, band_counts, qmul);
/* Decode the residual coefficients of the current block: luma first, then
 * both chroma planes.  Template-specialized on bit depth via
 * is8bitsperpixel (see the decode_coeffs_8bpp/decode_coeffs_16bpp
 * wrappers).  Returns nonzero iff any coefficient was coded (total_coeff). */
2310 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2312 VP9Context *s = ctx->priv_data;
2314 int row = s->row, col = s->col;
/* probability, count and EOB tables, indexed by tx size, plane (0 = luma
 * here; switched to the uv set further down) and intra/inter */
2315 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2316 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2317 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
/* block size in 4x4 units; end_x/end_y are clipped so we never decode
 * coefficients for area outside the visible frame */
2318 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2319 int end_x = FFMIN(2 * (s->cols - col), w4);
2320 int end_y = FFMIN(2 * (s->rows - row), h4);
2321 int n, pl, x, y, res;
2322 int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
/* lossless blocks select the scan tables at offset 4 */
2323 int tx = 4 * s->lossless + b->tx;
2324 const int16_t * const *yscans = vp9_scans[tx];
2325 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2326 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2327 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2328 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2329 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
/* number of coefficients per probability band, one row per tx size
 * (4x4, 8x8, 16x16, 32x32) */
2330 static const int16_t band_counts[4][8] = {
2331 { 1, 2, 3, 4, 3, 16 - 13 },
2332 { 1, 2, 3, 4, 11, 64 - 21 },
2333 { 1, 2, 3, 4, 11, 256 - 21 },
2334 { 1, 2, 3, 4, 11, 1024 - 21 },
2336 const int16_t *y_band_counts = band_counts[b->tx];
2337 const int16_t *uv_band_counts = band_counts[b->uvtx];
2338 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2339 int total_coeff = 0;
/* MERGE/MERGE_CTX fold the per-4x4 nonzero-context entries in a[]/l[]
 * down to one value per tx-sized unit before decoding; SPLAT/SPLAT_CTX
 * (below) expand the updated values back out afterwards.
 * DECODE_Y_COEF_LOOP decodes one luma transform block per iteration,
 * feeding a[x]+l[y] as the nnz context and recording whether the block
 * produced any coefficients. */
2341 #define MERGE(la, end, step, rd) \
2342 for (n = 0; n < end; n += step) \
2343 la[n] = !!rd(&la[n])
2344 #define MERGE_CTX(step, rd) \
2346 MERGE(l, end_y, step, rd); \
2347 MERGE(a, end_x, step, rd); \
2350 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2351 for (n = 0, y = 0; y < end_y; y += step) { \
2352 for (x = 0; x < end_x; x += step, n += step * step) { \
2353 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2354 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2355 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2356 c, e, p, a[x] + l[y], yscans[txtp], \
2357 ynbs[txtp], y_band_counts, qmul[0]); \
2358 a[x] = l[y] = !!res; \
2359 total_coeff |= !!res; \
2361 AV_WN16A(&s->eob[n], res); \
2368 #define SPLAT(la, end, step, cond) \
2370 for (n = 1; n < end; n += step) \
2371 la[n] = la[n - 1]; \
2372 } else if (step == 4) { \
2374 for (n = 0; n < end; n += step) \
2375 AV_WN32A(&la[n], la[n] * 0x01010101); \
2377 for (n = 0; n < end; n += step) \
2378 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2380 } else /* step == 8 */ { \
2382 if (HAVE_FAST_64BIT) { \
2383 for (n = 0; n < end; n += step) \
2384 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2386 for (n = 0; n < end; n += step) { \
2387 uint32_t v32 = la[n] * 0x01010101; \
2388 AV_WN32A(&la[n], v32); \
2389 AV_WN32A(&la[n + 4], v32); \
2393 for (n = 0; n < end; n += step) \
2394 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2397 #define SPLAT_CTX(step) \
2399 SPLAT(a, end_x, step, end_x == w4); \
2400 SPLAT(l, end_y, step, end_y == h4); \
2406 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2409 MERGE_CTX(2, AV_RN16A);
2410 DECODE_Y_COEF_LOOP(2, 0,);
2414 MERGE_CTX(4, AV_RN32A);
2415 DECODE_Y_COEF_LOOP(4, 0,);
2419 MERGE_CTX(8, AV_RN64A);
2420 DECODE_Y_COEF_LOOP(8, 0, 32);
/* Same decoding loop for a chroma plane (pl): chroma always uses the
 * fixed DCT_DCT scan and the chroma quantizer (qmul[1]). */
2425 #define DECODE_UV_COEF_LOOP(step, v) \
2426 for (n = 0, y = 0; y < end_y; y += step) { \
2427 for (x = 0; x < end_x; x += step, n += step * step) { \
2428 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2429 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2430 16 * step * step, c, e, p, a[x] + l[y], \
2431 uvscan, uvnb, uv_band_counts, qmul[1]); \
2432 a[x] = l[y] = !!res; \
2433 total_coeff |= !!res; \
2435 AV_WN16A(&s->uveob[pl][n], res); \
2437 s->uveob[pl][n] = res; \
2442 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2443 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2444 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
/* decode both chroma planes with their own above/left nnz contexts */
2449 for (pl = 0; pl < 2; pl++) {
2450 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2451 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2454 DECODE_UV_COEF_LOOP(1,);
2457 MERGE_CTX(2, AV_RN16A);
2458 DECODE_UV_COEF_LOOP(2,);
2462 MERGE_CTX(4, AV_RN32A);
2463 DECODE_UV_COEF_LOOP(4,);
2467 MERGE_CTX(8, AV_RN64A);
2468 DECODE_UV_COEF_LOOP(8, 32);
2477 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2479 return decode_coeffs(ctx, 1);
2482 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2484 return decode_coeffs(ctx, 0);
/* Prepare the top (*a) and left (l) edge pixel arrays for intra prediction
 * of one transform block, substituting fallback prediction modes when the
 * required neighboring pixels are unavailable (frame/tile border).
 * Returns the (possibly remapped) prediction mode to actually use. */
2487 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2488 uint8_t *dst_edge, ptrdiff_t stride_edge,
2489 uint8_t *dst_inner, ptrdiff_t stride_inner,
2490 uint8_t *l, int col, int x, int w,
2491 int row, int y, enum TxfmMode tx,
2492 int p, int ss_h, int ss_v, int bytesperpixel)
2494 int have_top = row > 0 || y > 0;
2495 int have_left = col > s->tiling.tile_col_start || x > 0;
2496 int have_right = x < w - 1;
/* remap the requested mode to a DC-style fallback when the left and/or
 * top neighbor pixels do not exist */
2498 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2499 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2500 { DC_127_PRED, VERT_PRED } },
2501 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2502 { HOR_PRED, HOR_PRED } },
2503 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2504 { LEFT_DC_PRED, DC_PRED } },
2505 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2506 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2507 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2508 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2509 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2510 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2511 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2512 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2513 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2514 { DC_127_PRED, VERT_LEFT_PRED } },
2515 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2516 { HOR_UP_PRED, HOR_UP_PRED } },
2517 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2518 { HOR_PRED, TM_VP8_PRED } },
/* which edge pixels each (already remapped) mode actually consumes */
2520 static const struct {
2521 uint8_t needs_left:1;
2522 uint8_t needs_top:1;
2523 uint8_t needs_topleft:1;
2524 uint8_t needs_topright:1;
2525 uint8_t invert_left:1;
2526 } edges[N_INTRA_PRED_MODES] = {
2527 [VERT_PRED] = { .needs_top = 1 },
2528 [HOR_PRED] = { .needs_left = 1 },
2529 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2530 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2531 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2532 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2533 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2534 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2535 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2536 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2537 [LEFT_DC_PRED] = { .needs_left = 1 },
2538 [TOP_DC_PRED] = { .needs_top = 1 },
2539 [DC_128_PRED] = { 0 },
2540 [DC_127_PRED] = { 0 },
2541 [DC_129_PRED] = { 0 }
2544 av_assert2(mode >= 0 && mode < 10);
2545 mode = mode_conv[mode][have_left][have_top];
/* gather the top (and, if needed, top-left / top-right) edge pixels into
 * the caller-provided buffer, extending the last available pixel or
 * filling a constant when pixels are missing */
2546 if (edges[mode].needs_top) {
2547 uint8_t *top, *topleft;
2548 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2549 int n_px_need_tr = 0;
2551 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2554 // if top of sb64-row, use s->intra_pred_data[] instead of
2555 // dst[-stride] for intra prediction (it contains pre- instead of
2556 // post-loopfilter data)
2558 top = !(row & 7) && !y ?
2559 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2560 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2562 topleft = !(row & 7) && !y ?
2563 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2564 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2565 &dst_inner[-stride_inner];
2569 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2570 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2571 n_px_need + n_px_need_tr <= n_px_have) {
2575 if (n_px_need <= n_px_have) {
2576 memcpy(*a, top, n_px_need * bytesperpixel);
/* memset_bpp / memset_val / assign_bpp / assign_val: fill or copy edge
 * samples correctly for both 1- and 2-byte pixels (bytesperpixel) */
2578 #define memset_bpp(c, i1, v, i2, num) do { \
2579 if (bytesperpixel == 1) { \
2580 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2582 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2583 for (n = 0; n < (num); n++) { \
2584 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2588 memcpy(*a, top, n_px_have * bytesperpixel);
2589 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2592 #define memset_val(c, val, num) do { \
2593 if (bytesperpixel == 1) { \
2594 memset((c), (val), (num)); \
2597 for (n = 0; n < (num); n++) { \
2598 AV_WN16A(&(c)[n * 2], (val)); \
2602 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2604 if (edges[mode].needs_topleft) {
2605 if (have_left && have_top) {
2606 #define assign_bpp(c, i1, v, i2) do { \
2607 if (bytesperpixel == 1) { \
2608 (c)[(i1)] = (v)[(i2)]; \
2610 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2613 assign_bpp(*a, -1, topleft, -1);
2615 #define assign_val(c, i, v) do { \
2616 if (bytesperpixel == 1) { \
2619 AV_WN16A(&(c)[(i) * 2], (v)); \
2622 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2625 if (tx == TX_4X4 && edges[mode].needs_topright) {
2626 if (have_top && have_right &&
2627 n_px_need + n_px_need_tr <= n_px_have) {
2628 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2630 memset_bpp(*a, 4, *a, 3, 4);
/* gather the left edge pixels into l[]; HOR_UP_PRED stores them in
 * inverted order (invert_left), others in bottom-up order */
2635 if (edges[mode].needs_left) {
2637 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2638 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2639 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2641 if (edges[mode].invert_left) {
2642 if (n_px_need <= n_px_have) {
2643 for (i = 0; i < n_px_need; i++)
2644 assign_bpp(l, i, &dst[i * stride], -1);
2646 for (i = 0; i < n_px_have; i++)
2647 assign_bpp(l, i, &dst[i * stride], -1);
2648 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2651 if (n_px_need <= n_px_have) {
2652 for (i = 0; i < n_px_need; i++)
2653 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2655 for (i = 0; i < n_px_have; i++)
2656 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2657 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2661 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/* Intra reconstruction of the current block: for each transform-sized
 * sub-block, build the edge pixel arrays (check_intra_mode), run the
 * spatial predictor, then add the decoded residual via the inverse
 * transform.  Luma first, then both chroma planes. */
2668 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2669 ptrdiff_t uv_off, int bytesperpixel)
2671 VP9Context *s = ctx->priv_data;
2673 int row = s->row, col = s->col;
2674 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2675 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2676 int end_x = FFMIN(2 * (s->cols - col), w4);
2677 int end_y = FFMIN(2 * (s->rows - row), h4);
2678 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2679 int uvstep1d = 1 << b->uvtx, p;
/* dst is the (possibly temporary) working surface; dst_r addresses the
 * actual reference frame, used as the edge-pixel source */
2680 uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2681 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2682 LOCAL_ALIGNED_32(uint8_t, l, [64]);
/* luma: one iteration per b->tx-sized sub-block */
2684 for (n = 0, y = 0; y < end_y; y += step1d) {
2685 uint8_t *ptr = dst, *ptr_r = dst_r;
2686 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2687 ptr_r += 4 * step1d * bytesperpixel, n += step) {
2688 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2690 uint8_t *a = &a_buf[32];
2691 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2692 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2694 mode = check_intra_mode(s, mode, &a, ptr_r,
2695 s->frames[CUR_FRAME].tf.f->linesize[0],
2696 ptr, s->y_stride, l,
2697 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2698 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2700 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2701 s->block + 16 * n * bytesperpixel, eob);
2703 dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2704 dst += 4 * step1d * s->y_stride;
/* chroma: same structure, per plane, always DCT_DCT */
2711 step = 1 << (b->uvtx * 2);
2712 for (p = 0; p < 2; p++) {
2713 dst = s->dst[1 + p];
2714 dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2715 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2716 uint8_t *ptr = dst, *ptr_r = dst_r;
2717 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2718 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2719 int mode = b->uvmode;
2720 uint8_t *a = &a_buf[32];
2721 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2723 mode = check_intra_mode(s, mode, &a, ptr_r,
2724 s->frames[CUR_FRAME].tf.f->linesize[1],
2725 ptr, s->uv_stride, l, col, x, w4, row, y,
2726 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2727 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2729 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2730 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2732 dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2733 dst += 4 * uvstep1d * s->uv_stride;
2738 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2740 intra_recon(ctx, y_off, uv_off, 1);
2743 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2745 intra_recon(ctx, y_off, uv_off, 2);
/* Luma motion compensation with reference-frame scaling (reference has a
 * different resolution than the current frame).  Waits for the reference
 * row to be decoded (frame threading), falls back to edge emulation when
 * the source window overhangs the frame, then calls the scaled-MC DSP
 * function smc. */
2748 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2749 uint8_t *dst, ptrdiff_t dst_stride,
2750 const uint8_t *ref, ptrdiff_t ref_stride,
2751 ThreadFrame *ref_frame,
2752 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2753 int bw, int bh, int w, int h, int bytesperpixel,
2754 const uint16_t *scale, const uint8_t *step)
2756 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2758 int refbw_m1, refbh_m1;
/* clamp the motion vector so the fetched area stays near the block */
2762 mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
2763 mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
2764 // BUG libvpx seems to scale the two components separately. This introduces
2765 // rounding errors but we have to reproduce them to be exactly compatible
2766 // with the output from libvpx...
2767 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2768 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2772 ref += y * ref_stride + x * bytesperpixel;
2775 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2776 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2777 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2778 // we use +7 because the last 7 pixels of each sbrow can be changed in
2779 // the longest loopfilter of the next sbrow
2780 th = (y + refbh_m1 + 4 + 7) >> 6;
2781 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* source window touches a frame border: copy through edge_emu_buffer */
2782 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2783 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2784 ref - 3 * ref_stride - 3 * bytesperpixel,
2786 refbw_m1 + 8, refbh_m1 + 8,
2787 x - 3, y - 3, w, h);
2788 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2791 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/* Chroma motion compensation with reference-frame scaling; like
 * mc_luma_scaled() but handles both chroma planes and the subsampled
 * (ss_h/ss_v) coordinate space, including a libvpx rounding bug that must
 * be reproduced for bit-exact output (see the BUG links below). */
2794 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2795 uint8_t *dst_u, uint8_t *dst_v,
2796 ptrdiff_t dst_stride,
2797 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2798 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2799 ThreadFrame *ref_frame,
2800 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2801 int bw, int bh, int w, int h, int bytesperpixel,
2802 const uint16_t *scale, const uint8_t *step)
2805 int refbw_m1, refbh_m1;
/* horizontal component: subsampled (first branch) vs. full-res path */
2810 // BUG https://code.google.com/p/webm/issues/detail?id=820
2811 mv.x = av_clip(in_mv->x, -(x + bw + 4) << 4, (s->cols * 4 - x + 3) << 4);
2812 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2814 mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
2815 mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
/* vertical component, same structure */
2818 // BUG https://code.google.com/p/webm/issues/detail?id=820
2819 mv.y = av_clip(in_mv->y, -(y + bh + 4) << 4, (s->rows * 4 - y + 3) << 4);
2820 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2822 mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
2823 my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2828 ref_u += y * src_stride_u + x * bytesperpixel;
2829 ref_v += y * src_stride_v + x * bytesperpixel;
2832 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2833 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2834 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2835 // we use +7 because the last 7 pixels of each sbrow can be changed in
2836 // the longest loopfilter of the next sbrow
2837 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2838 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* border overhang: emulate edges for both planes separately */
2839 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2840 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2841 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2843 refbw_m1 + 8, refbh_m1 + 8,
2844 x - 3, y - 3, w, h);
2845 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2846 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2848 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2849 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2851 refbw_m1 + 8, refbh_m1 + 8,
2852 x - 3, y - 3, w, h);
2853 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2854 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2856 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2857 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
/* Bind mc_luma_dir/mc_chroma_dir to the *scaled* MC helpers and expand the
 * shared inter-prediction template once per bit depth; FN() mangles the
 * generated function names and BYTES_PER_PIXEL selects the sample size. */
2861 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
2862 mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
2863 mv, bw, bh, w, h, bytesperpixel, \
2864 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2865 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2866 row, col, mv, bw, bh, w, h, i) \
2867 mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2868 row, col, mv, bw, bh, w, h, bytesperpixel, \
2869 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
/* generates inter_pred_scaled_8bpp() and inter_pred_scaled_16bpp() */
2871 #define FN(x) x##_scaled_8bpp
2872 #define BYTES_PER_PIXEL 1
2873 #include "vp9_mc_template.c"
2875 #undef BYTES_PER_PIXEL
2876 #define FN(x) x##_scaled_16bpp
2877 #define BYTES_PER_PIXEL 2
2878 #include "vp9_mc_template.c"
2880 #undef mc_chroma_dir
2882 #undef BYTES_PER_PIXEL
/* Luma motion compensation without reference scaling (1:1 resolution).
 * Waits for the reference row to be decoded, uses edge emulation when the
 * filter taps would read outside the frame, then calls the MC function
 * selected by whether mx/my have a fractional part. */
2885 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2886 uint8_t *dst, ptrdiff_t dst_stride,
2887 const uint8_t *ref, ptrdiff_t ref_stride,
2888 ThreadFrame *ref_frame,
2889 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2890 int bw, int bh, int w, int h, int bytesperpixel)
2892 int mx = mv->x, my = mv->y, th;
2896 ref += y * ref_stride + x * bytesperpixel;
2899 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2900 // we use +7 because the last 7 pixels of each sbrow can be changed in
2901 // the longest loopfilter of the next sbrow
2902 th = (y + bh + 4 * !!my + 7) >> 6;
2903 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* subpel filtering needs 3 pixels before / 4 after in each filtered
 * dimension (!!mx / !!my); emulate edges if that area leaves the frame */
2904 if (x < !!mx * 3 || y < !!my * 3 ||
2905 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2906 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2907 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2909 bw + !!mx * 7, bh + !!my * 7,
2910 x - !!mx * 3, y - !!my * 3, w, h);
2911 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2914 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/* Chroma motion compensation without reference scaling; like
 * mc_luma_unscaled() but for both chroma planes, with the MV shifted into
 * the subsampled (ss_h/ss_v) coordinate space. */
2917 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2918 uint8_t *dst_u, uint8_t *dst_v,
2919 ptrdiff_t dst_stride,
2920 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2921 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2922 ThreadFrame *ref_frame,
2923 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2924 int bw, int bh, int w, int h, int bytesperpixel)
2926 int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2930 ref_u += y * src_stride_u + x * bytesperpixel;
2931 ref_v += y * src_stride_v + x * bytesperpixel;
2934 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2935 // we use +7 because the last 7 pixels of each sbrow can be changed in
2936 // the longest loopfilter of the next sbrow
2937 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2938 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* border overhang: emulate edges for each chroma plane separately */
2939 if (x < !!mx * 3 || y < !!my * 3 ||
2940 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2941 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2942 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2944 bw + !!mx * 7, bh + !!my * 7,
2945 x - !!mx * 3, y - !!my * 3, w, h);
2946 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2947 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2949 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2950 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2952 bw + !!mx * 7, bh + !!my * 7,
2953 x - !!mx * 3, y - !!my * 3, w, h);
2954 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2955 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2957 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2958 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2962 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
2963 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2964 mv, bw, bh, w, h, bytesperpixel)
2965 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2966 row, col, mv, bw, bh, w, h, i) \
2967 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2968 row, col, mv, bw, bh, w, h, bytesperpixel)
2970 #define FN(x) x##_8bpp
2971 #define BYTES_PER_PIXEL 1
2972 #include "vp9_mc_template.c"
2974 #undef BYTES_PER_PIXEL
2975 #define FN(x) x##_16bpp
2976 #define BYTES_PER_PIXEL 2
2977 #include "vp9_mc_template.c"
2978 #undef mc_luma_dir_dir
2979 #undef mc_chroma_dir_dir
2981 #undef BYTES_PER_PIXEL
/* Inter reconstruction of the current block: run motion-compensated
 * prediction (scaled or unscaled variant depending on whether any used
 * reference needs MV scaling), then add the decoded residual via the
 * inverse transform for luma and both chroma planes. */
2984 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2986 VP9Context *s = ctx->priv_data;
2988 int row = s->row, col = s->col;
/* scaled path if reference 0 (or, for compound, reference 1) has a
 * nonzero mvscale factor */
2990 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2991 if (bytesperpixel == 1) {
2992 inter_pred_scaled_8bpp(ctx);
2994 inter_pred_scaled_16bpp(ctx);
2997 if (bytesperpixel == 1) {
2998 inter_pred_8bpp(ctx);
3000 inter_pred_16bpp(ctx);
3004 /* mostly copied intra_recon() */
3006 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3007 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3008 int end_x = FFMIN(2 * (s->cols - col), w4);
3009 int end_y = FFMIN(2 * (s->rows - row), h4);
3010 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
3011 int uvstep1d = 1 << b->uvtx, p;
3012 uint8_t *dst = s->dst[0];
/* luma residual: inverse transform per tx-sized sub-block */
3015 for (n = 0, y = 0; y < end_y; y += step1d) {
3017 for (x = 0; x < end_x; x += step1d,
3018 ptr += 4 * step1d * bytesperpixel, n += step) {
3019 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3022 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3023 s->block + 16 * n * bytesperpixel, eob);
3025 dst += 4 * s->y_stride * step1d;
/* chroma residual, per plane */
3031 step = 1 << (b->uvtx * 2);
3032 for (p = 0; p < 2; p++) {
3033 dst = s->dst[p + 1];
3034 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3036 for (x = 0; x < end_x; x += uvstep1d,
3037 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3038 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3041 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3042 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3044 dst += 4 * uvstep1d * s->uv_stride;
3050 static void inter_recon_8bpp(AVCodecContext *ctx)
3052 inter_recon(ctx, 1);
3055 static void inter_recon_16bpp(AVCodecContext *ctx)
3057 inter_recon(ctx, 2);
/* Build the loop-filter edge masks for one block: for every 8-pixel row,
 * record (as column bitmasks) which edges need 16-, 8- or 4-pixel-wide
 * filtering — mask[0] holds column edges, mask[1] row edges (see the
 * VP9Filter.mask layout).  ss_h/ss_v select the chroma-subsampled case. */
3060 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3061 int row_and_7, int col_and_7,
3062 int w, int h, int col_end, int row_end,
3063 enum TxfmMode tx, int skip_inter)
3065 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3066 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3068 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3069 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3070 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3071 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3073 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3074 // edges. This means that for UV, we work on two subsampled blocks at
3075 // a time, and we only use the topleft block's mode information to set
3076 // things like block strength. Thus, for any block size smaller than
3077 // 16x16, ignore the odd portion of the block.
3078 if (tx == TX_4X4 && (ss_v | ss_h)) {
/* 4x4 transforms in a non-skipped inter block: every 4-pixel edge is a
 * potential filter edge */
3093 if (tx == TX_4X4 && !skip_inter) {
3094 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3095 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3096 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3098 for (y = row_and_7; y < h + row_and_7; y++) {
3099 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3101 mask[0][y][1] |= m_row_8;
3102 mask[0][y][2] |= m_row_4;
3103 // for odd lines, if the odd col is not being filtered,
3104 // skip odd row also:
3111 // if a/c are even row/col and b/d are odd, and d is skipped,
3112 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3113 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3114 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3116 mask[1][y][col_mask_id] |= m_col;
3119 mask[0][y][3] |= m_col;
3121 if (ss_h && (col_end & 1))
3122 mask[1][y][3] |= (t << (w - 1)) - t;
3124 mask[1][y][3] |= m_col;
/* larger transforms (or skipped blocks): only the block's outer edges
 * are filtered */
3128 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3131 int mask_id = (tx == TX_8X8);
3132 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3133 int l2 = tx + ss_h - 1, step1d;
3134 int m_row = m_col & masks[l2];
3136 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3137 // 8wd loopfilter to prevent going off the visible edge.
3138 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3139 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3140 int m_row_8 = m_row - m_row_16;
3142 for (y = row_and_7; y < h + row_and_7; y++) {
3143 mask[0][y][0] |= m_row_16;
3144 mask[0][y][1] |= m_row_8;
3147 for (y = row_and_7; y < h + row_and_7; y++)
3148 mask[0][y][mask_id] |= m_row;
/* horizontal (row) edges, same odd-edge handling for ss_v */
3153 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3154 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3155 mask[1][y][0] |= m_col;
3156 if (y - row_and_7 == h - 1)
3157 mask[1][y][1] |= m_col;
3159 for (y = row_and_7; y < h + row_and_7; y += step1d)
3160 mask[1][y][mask_id] |= m_col;
3162 } else if (tx != TX_4X4) {
3165 mask_id = (tx == TX_8X8) || (h == ss_v);
3166 mask[1][row_and_7][mask_id] |= m_col;
3167 mask_id = (tx == TX_8X8) || (w == ss_h);
3168 for (y = row_and_7; y < h + row_and_7; y++)
3169 mask[0][y][mask_id] |= t;
/* remaining TX_4X4 case: left/top block edges only */
3171 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3173 for (y = row_and_7; y < h + row_and_7; y++) {
3174 mask[0][y][2] |= t4;
3175 mask[0][y][1] |= t8;
3177 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/* Decode and reconstruct one block at (row, col): parse its modes and
 * coefficients, run intra or inter reconstruction (possibly into
 * temporary buffers when the block overhangs the frame edge), and record
 * the loop-filter level and edge masks for the enclosing superblock. */
3182 static void decode_b(AVCodecContext *ctx, int row, int col,
3183 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3184 enum BlockLevel bl, enum BlockPartition bp)
3186 VP9Context *s = ctx->priv_data;
3188 enum BlockSize bs = bl * 3 + bp;
3189 int bytesperpixel = s->bytesperpixel;
3190 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3192 AVFrame *f = s->frames[CUR_FRAME].tf.f;
/* motion-vector range allowed for this block position */
3198 s->min_mv.x = -(128 + col * 64);
3199 s->min_mv.y = -(128 + row * 64);
3200 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3201 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
/* chroma tx size is one step smaller when the block is too narrow/short
 * for the luma tx size in a subsampled dimension */
3207 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3208 (s->ss_v && h4 * 2 == (1 << b->tx)));
/* parse residuals with the decoder matching the stream bit depth */
3213 if (bytesperpixel == 1) {
3214 has_coeffs = decode_coeffs_8bpp(ctx);
3216 has_coeffs = decode_coeffs_16bpp(ctx);
3218 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3220 memset(&s->above_skip_ctx[col], 1, w4);
3221 memset(&s->left_skip_ctx[s->row7], 1, h4);
/* helpers to zero spans of the above/left nonzero-coefficient context
 * (luma at 2x the unit count, chroma honoring ss_h/ss_v) */
3226 #define SPLAT_ZERO_CTX(v, n) \
3228 case 1: v = 0; break; \
3229 case 2: AV_ZERO16(&v); break; \
3230 case 4: AV_ZERO32(&v); break; \
3231 case 8: AV_ZERO64(&v); break; \
3232 case 16: AV_ZERO128(&v); break; \
3234 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3236 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3237 if (s->ss_##dir2) { \
3238 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3239 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3241 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3242 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3247 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3248 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3249 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3250 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3253 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3254 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3255 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3256 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
/* advance the coefficient/EOB buffer pointers past this block's data */
3261 s->block += w4 * h4 * 64 * bytesperpixel;
3262 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3263 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3264 s->eob += 4 * w4 * h4;
3265 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3266 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3272 // emulated overhangs if the stride of the target buffer can't hold. This
3273 // allows to support emu-edge and so on even if we have large block
3275 emu[0] = (col + w4) * 8 > f->linesize[0] ||
3276 (row + h4) > s->rows;
3277 emu[1] = (col + w4) * 4 > f->linesize[1] ||
3278 (row + h4) > s->rows;
/* emu path reconstructs into s->tmp_y / s->tmp_uv, copied back below */
3280 s->dst[0] = s->tmp_y;
3283 s->dst[0] = f->data[0] + yoff;
3284 s->y_stride = f->linesize[0];
3287 s->dst[1] = s->tmp_uv[0];
3288 s->dst[2] = s->tmp_uv[1];
3291 s->dst[1] = f->data[1] + uvoff;
3292 s->dst[2] = f->data[2] + uvoff;
3293 s->uv_stride = f->linesize[1];
3297 intra_recon_16bpp(ctx, yoff, uvoff);
3299 intra_recon_8bpp(ctx, yoff, uvoff);
3303 inter_recon_16bpp(ctx);
3305 inter_recon_8bpp(ctx);
/* copy the emulated luma area back to the frame in power-of-two chunks */
3309 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3311 for (n = 0; o < w; n++) {
3316 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3317 s->tmp_y + o, 128, h, 0, 0);
3318 o += bw * bytesperpixel;
/* same copy-back for both emulated chroma planes */
3323 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3324 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3326 for (n = s->ss_h; o < w; n++) {
3331 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3332 s->tmp_uv[0] + o, 128, h, 0, 0);
3333 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3334 s->tmp_uv[1] + o, 128, h, 0, 0);
3335 o += bw * bytesperpixel;
3340 // pick filter level and find edges to apply filter to
3341 if (s->filter.level &&
3342 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3343 [b->mode[3] != ZEROMV]) > 0) {
3344 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3345 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3347 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3348 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3349 if (s->ss_h || s->ss_v)
3350 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3351 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3352 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3353 b->uvtx, skip_inter);
/* lazily fill the limit/mblim LUTs for this filter level */
3355 if (!s->filter.lim_lut[lvl]) {
3356 int sharp = s->filter.sharpness;
3360 limit >>= (sharp + 3) >> 2;
3361 limit = FFMIN(limit, 9 - sharp);
3363 limit = FFMAX(limit, 1);
3365 s->filter.lim_lut[lvl] = limit;
3366 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
/* advance the coefficient/EOB buffer pointers past this block's data */
3372 s->block += w4 * h4 * 64 * bytesperpixel;
3373 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3374 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3375 s->eob += 4 * w4 * h4;
3376 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3377 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
/* Recursively parse the partition tree of one superblock region and decode
 * its leaf blocks.
 *
 * The partition symbol context c is derived from the above/left partition
 * context bits for this block level; keyframes/intra-only frames use the
 * static default partition probabilities, inter frames the adapted ones.
 * hbs is the half-block size in 8x8 units (4 >> bl).  The "col/row + hbs <
 * s->cols/rows" branches handle superblocks that extend past the visible
 * frame edge: there, only the partition choices that keep blocks inside the
 * frame are coded (with single prob reads instead of the full tree).
 * Partition counts are accumulated for backward probability adaptation. */
3381 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3382 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3384 VP9Context *s = ctx->priv_data;
3385 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3386 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3387 const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
3388 s->prob.p.partition[bl][c];
3389 enum BlockPartition bp;
3390 ptrdiff_t hbs = 4 >> bl;
3391 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3392 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3393 int bytesperpixel = s->bytesperpixel;
3396 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3397 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* fully visible superblock: read the complete partition tree */
3398 } else if (col + hbs < s->cols) { // FIXME why not <=?
3399 if (row + hbs < s->rows) { // FIXME why not <=?
3400 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3402 case PARTITION_NONE:
3403 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* PARTITION_H: top half, then bottom half (luma offset by hbs*8 rows) */
3406 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3407 yoff += hbs * 8 * y_stride;
3408 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3409 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
/* PARTITION_V: left half, then right half (offset by hbs*8 pixels) */
3412 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3413 yoff += hbs * 8 * bytesperpixel;
3414 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3415 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3417 case PARTITION_SPLIT:
/* recurse into the four quadrants at the next-smaller block level */
3418 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3419 decode_sb(ctx, row, col + hbs, lflvl,
3420 yoff + 8 * hbs * bytesperpixel,
3421 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3422 yoff += hbs * 8 * y_stride;
3423 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3424 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3425 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3426 yoff + 8 * hbs * bytesperpixel,
3427 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
/* bottom edge cut off: only SPLIT or H are possible, coded as one bit */
3432 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3433 bp = PARTITION_SPLIT;
3434 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3435 decode_sb(ctx, row, col + hbs, lflvl,
3436 yoff + 8 * hbs * bytesperpixel,
3437 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3440 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* right edge cut off: only SPLIT or V are possible, coded as one bit */
3442 } else if (row + hbs < s->rows) { // FIXME why not <=?
3443 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3444 bp = PARTITION_SPLIT;
3445 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3446 yoff += hbs * 8 * y_stride;
3447 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3448 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3451 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* both edges cut off: SPLIT is the only legal choice, nothing is coded */
3454 bp = PARTITION_SPLIT;
3455 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3457 s->counts.partition[bl][c][bp]++;
/* Second-pass variant of decode_sb(): instead of reading partition symbols
 * from the bitstream, it replays the block level/partition decisions
 * (b->bl, b->bp) recorded in the per-block structures during pass 1, and
 * re-runs decode_b() for reconstruction.  Traversal order and the edge
 * handling mirror decode_sb() exactly so both passes visit the same
 * blocks in the same order. */
3460 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3461 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3463 VP9Context *s = ctx->priv_data;
3465 ptrdiff_t hbs = 4 >> bl;
3466 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3467 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3468 int bytesperpixel = s->bytesperpixel;
3471 av_assert2(b->bl == BL_8X8);
3472 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
/* the stored block is at exactly this level: a leaf (NONE/H/V) partition */
3473 } else if (s->b->bl == bl) {
3474 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3475 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3476 yoff += hbs * 8 * y_stride;
3477 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3478 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3479 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3480 yoff += hbs * 8 * bytesperpixel;
3481 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3482 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
/* stored block is at a deeper level: this node was a SPLIT — recurse into
 * the quadrants that lie inside the visible frame */
3485 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3486 if (col + hbs < s->cols) { // FIXME why not <=?
3487 if (row + hbs < s->rows) {
3488 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3489 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3490 yoff += hbs * 8 * y_stride;
3491 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3492 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3493 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3494 yoff + 8 * hbs * bytesperpixel,
3495 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3497 yoff += hbs * 8 * bytesperpixel;
3498 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3499 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3501 } else if (row + hbs < s->rows) {
3502 yoff += hbs * 8 * y_stride;
3503 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3504 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/* Loop-filter the vertical edges (between horizontally adjacent blocks) of
 * one plane inside a 64x64 superblock.
 *
 * lvl holds the per-8x8 filter levels, mask the per-row column-edge bitmasks
 * built by mask_edges(): index 0..2 select filter width 16/8/4, index 3 the
 * inner 4px edges.  L is the filter level, H/E/I the derived hev threshold,
 * mblim and lim values from the precalculated LUTs.  Where two vertically
 * stacked 8px edges carry the same level, the wider (16-row) or paired
 * (mix2) dsp entry points are used to filter both in one call. */
3509 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3510 uint8_t *lvl, uint8_t (*mask)[4],
3511 uint8_t *dst, ptrdiff_t ls)
3513 int y, x, bytesperpixel = s->bytesperpixel;
3515 // filter edges between columns (e.g. block1 | block2)
3516 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3517 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3518 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3519 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3520 unsigned hm = hm1 | hm2 | hm13 | hm23;
/* x walks the edge columns left to right; loop ends once no mask bit at or
 * beyond x remains set */
3522 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3525 int L = *l, H = L >> 4;
3526 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3528 if (hmask1[0] & x) {
3529 if (hmask2[0] & x) {
/* same level in both 8px halves: one 16-row wide-filter call */
3530 av_assert2(l[8 << ss_v] == L);
3531 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3533 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3535 } else if (hm2 & x) {
/* different levels top/bottom: pack both into E/I (low/high byte) and
 * use the mix2 filter that handles two stacked edges at once */
3538 E |= s->filter.mblim_lut[L] << 8;
3539 I |= s->filter.lim_lut[L] << 8;
3540 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3542 [0](ptr, ls, E, I, H);
3544 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3545 [0](ptr, ls, E, I, H);
3547 } else if (hm2 & x) {
/* only the bottom 8px half has an edge at this column */
3548 int L = l[8 << ss_v], H = L >> 4;
3549 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3551 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3552 [0](ptr + 8 * ls, ls, E, I, H);
/* inner 4px edges (mask index 3), offset half a block to the right */
3560 int L = *l, H = L >> 4;
3561 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3566 E |= s->filter.mblim_lut[L] << 8;
3567 I |= s->filter.lim_lut[L] << 8;
3568 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3570 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3572 } else if (hm23 & x) {
3573 int L = l[8 << ss_v], H = L >> 4;
3574 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3576 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
/* Loop-filter the horizontal edges (between vertically adjacent blocks) of
 * one plane inside a 64x64 superblock — the row-direction counterpart of
 * filter_plane_cols().  Here adjacency within the mask is horizontal: the
 * neighbouring edge's bit is at x << (1 + ss_h) and its level at
 * l[1 + ss_h], and the paired/mix2 dsp calls merge two side-by-side 8px
 * edges into one invocation. */
3584 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3585 uint8_t *lvl, uint8_t (*mask)[4],
3586 uint8_t *dst, ptrdiff_t ls)
3588 int y, x, bytesperpixel = s->bytesperpixel;
3591 // filter edges between rows (e.g. ------)
3593 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3594 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3595 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
/* step two (or four, when subsampled) columns at a time so each iteration
 * covers a 16px-wide pair of edges */
3597 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3600 int L = *l, H = L >> 4;
3601 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3604 if (vmask[0] & (x << (1 + ss_h))) {
/* same level in both horizontal halves: one 16-wide filter call */
3605 av_assert2(l[1 + ss_h] == L);
3606 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3608 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3610 } else if (vm & (x << (1 + ss_h))) {
/* different levels left/right: pack both into E/I and use mix2 */
3613 E |= s->filter.mblim_lut[L] << 8;
3614 I |= s->filter.lim_lut[L] << 8;
3615 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3616 [!!(vmask[1] & (x << (1 + ss_h)))]
3617 [1](ptr, ls, E, I, H);
3619 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3620 [1](ptr, ls, E, I, H);
3622 } else if (vm & (x << (1 + ss_h))) {
/* only the right-hand 8px half has an edge at this row */
3623 int L = l[1 + ss_h], H = L >> 4;
3624 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3626 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3627 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
/* inner 4px edges (mask index 3), offset half a block down */
3632 int L = *l, H = L >> 4;
3633 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3635 if (vm3 & (x << (1 + ss_h))) {
3638 E |= s->filter.mblim_lut[L] << 8;
3639 I |= s->filter.lim_lut[L] << 8;
3640 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3642 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3644 } else if (vm3 & (x << (1 + ss_h))) {
3645 int L = l[1 + ss_h], H = L >> 4;
3646 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3648 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
/* Run the loop filter over one 64x64 superblock: column edges then row
 * edges on luma, followed by both chroma planes.  The uv mask set is
 * selected by the chroma subsampling mode (ss_h | ss_v), so 4:2:0/4:2:2/
 * 4:4:0 share the subsampled masks while 4:4:4 reuses the luma-layout
 * masks. */
3661 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3662 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3664 VP9Context *s = ctx->priv_data;
3665 AVFrame *f = s->frames[CUR_FRAME].tf.f;
3666 uint8_t *dst = f->data[0] + yoff;
3667 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3668 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3671 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3672 // if you think of them as acting on a 8x8 block max, we can interleave
3673 // each v/h within the single x loop, but that only works if we work on
3674 // 8 pixel blocks, and we won't always do that (we want at least 16px
3675 // to use SSE2 optimizations, perhaps 32 for AVX2)
3677 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3678 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
/* chroma: same level array, subsampling-aware masks, planes 1 (U) and 2 (V) */
3680 for (p = 0; p < 2; p++) {
3681 dst = f->data[1 + p] + uvoff;
3682 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3683 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/* Compute the start/end offset (in 8x8-block units, hence the << 3 from
 * superblock units) of tile number idx out of 1 << log2_n tiles spanning
 * n superblocks.  FFMIN clamps the last tile to the frame boundary. */
3687 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3689 int sb_start = ( idx * n) >> log2_n;
3690 int sb_end = ((idx + 1) * n) >> log2_n;
3691 *start = FFMIN(sb_start, n) << 3;
3692 *end = FFMIN(sb_end, n) << 3;
/* Backward-adapt a single binary probability *p towards the observed
 * symbol counts ct0/ct1.  p2 is the maximum-likelihood probability from
 * the counts (rounded, clipped to [1,255]); the count total, capped at
 * max_count, scales update_factor so sparse statistics move *p less.
 * The final value is a fixed-point blend of the old probability p1 and
 * the empirical p2. */
3695 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3696 int max_count, int update_factor)
3698 unsigned ct = ct0 + ct1, p2, p1;
3704 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3705 p2 = av_clip(p2, 1, 255);
3706 ct = FFMIN(ct, max_count);
3707 update_factor = FASTDIV(update_factor * ct, max_count);
3709 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3710 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/* Backward probability adaptation, run after a frame has been fully
 * decoded: folds the symbol counts gathered in s->counts into the frame
 * context s->prob_ctx[s->framectxid] via adapt_prob().  Coefficient
 * probabilities are always adapted; all inter-specific probability sets
 * (intra/comp/ref/partition/tx/filter/mv/mode) are skipped on keyframes
 * and intra-only frames, where only the skip/tx probs are copied back. */
3713 static void adapt_probs(VP9Context *s)
3716 prob_context *p = &s->prob_ctx[s->framectxid].p;
/* slower adaptation (112) after key/intra/error-resilient transitions */
3717 int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
/* coefficient probs: [tx size][plane type][is_inter][band][coef context] */
3720 for (i = 0; i < 4; i++)
3721 for (j = 0; j < 2; j++)
3722 for (k = 0; k < 2; k++)
3723 for (l = 0; l < 6; l++)
3724 for (m = 0; m < 6; m++) {
3725 uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3726 unsigned *e = s->counts.eob[i][j][k][l][m];
3727 unsigned *c = s->counts.coef[i][j][k][l][m];
3729 if (l == 0 && m >= 3) // dc only has 3 pt
3732 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3733 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3734 adapt_prob(&pp[2], c[1], c[2], 24, uf);
/* key/intra-only frames: no inter stats exist; store skip/tx as-is */
3737 if (s->keyframe || s->intraonly) {
3738 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3739 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3740 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3741 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
/* skip flag */
3746 for (i = 0; i < 3; i++)
3747 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
/* intra/inter flag */
3750 for (i = 0; i < 4; i++)
3751 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
/* compound prediction mode */
3754 if (s->comppredmode == PRED_SWITCHABLE) {
3755 for (i = 0; i < 5; i++)
3756 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
/* reference frame selection (compound and single-ref trees) */
3760 if (s->comppredmode != PRED_SINGLEREF) {
3761 for (i = 0; i < 5; i++)
3762 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3763 s->counts.comp_ref[i][1], 20, 128);
3766 if (s->comppredmode != PRED_COMPREF) {
3767 for (i = 0; i < 5; i++) {
3768 uint8_t *pp = p->single_ref[i];
3769 unsigned (*c)[2] = s->counts.single_ref[i];
3771 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3772 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3776 // block partitioning
3777 for (i = 0; i < 4; i++)
3778 for (j = 0; j < 4; j++) {
3779 uint8_t *pp = p->partition[i][j];
3780 unsigned *c = s->counts.partition[i][j];
3782 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3783 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3784 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* transform size trees (8/16/32), only with switchable tx mode */
3788 if (s->txfmmode == TX_SWITCHABLE) {
3789 for (i = 0; i < 2; i++) {
3790 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3792 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3793 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3794 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3795 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3796 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3797 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3801 // interpolation filter
3802 if (s->filtermode == FILTER_SWITCHABLE) {
3803 for (i = 0; i < 4; i++) {
3804 uint8_t *pp = p->filter[i];
3805 unsigned *c = s->counts.filter[i];
3807 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3808 adapt_prob(&pp[1], c[1], c[2], 20, 128);
/* inter prediction modes (NEARESTMV/NEARMV/ZEROMV/NEWMV tree) */
3813 for (i = 0; i < 7; i++) {
3814 uint8_t *pp = p->mv_mode[i];
3815 unsigned *c = s->counts.mv_mode[i];
3817 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3818 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3819 adapt_prob(&pp[2], c[1], c[3], 20, 128);
/* mv joint (which of x/y components are nonzero) */
3824 uint8_t *pp = p->mv_joint;
3825 unsigned *c = s->counts.mv_joint;
3827 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3828 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3829 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* mv components (i = 0: vertical, 1: horizontal) */
3833 for (i = 0; i < 2; i++) {
3835 unsigned *c, (*c2)[2], sum;
3837 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3838 s->counts.mv_comp[i].sign[1], 20, 128);
/* magnitude class tree: sum shrinks as already-handled classes are removed */
3840 pp = p->mv_comp[i].classes;
3841 c = s->counts.mv_comp[i].classes;
3842 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3843 adapt_prob(&pp[0], c[0], sum, 20, 128);
3845 adapt_prob(&pp[1], c[1], sum, 20, 128);
3847 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3848 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3850 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3851 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3853 adapt_prob(&pp[6], c[6], sum, 20, 128);
3854 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3855 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3856 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3858 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3859 s->counts.mv_comp[i].class0[1], 20, 128);
3860 pp = p->mv_comp[i].bits;
3861 c2 = s->counts.mv_comp[i].bits;
3862 for (j = 0; j < 10; j++)
3863 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
/* fractional-pel bits (class0 and general) */
3865 for (j = 0; j < 2; j++) {
3866 pp = p->mv_comp[i].class0_fp[j];
3867 c = s->counts.mv_comp[i].class0_fp[j];
3868 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3869 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3870 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3872 pp = p->mv_comp[i].fp;
3873 c = s->counts.mv_comp[i].fp;
3874 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3875 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3876 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* high-precision (1/8-pel) bits, only when enabled in the header */
3878 if (s->highprecisionmvs) {
3879 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3880 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3881 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3882 s->counts.mv_comp[i].hp[1], 20, 128);
/* y intra mode tree; sum is progressively reduced along the tree */
3887 for (i = 0; i < 4; i++) {
3888 uint8_t *pp = p->y_mode[i];
3889 unsigned *c = s->counts.y_mode[i], sum, s2;
3891 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3892 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3893 sum -= c[TM_VP8_PRED];
3894 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3895 sum -= c[VERT_PRED];
3896 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3897 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3899 adapt_prob(&pp[3], s2, sum, 20, 128);
3901 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3902 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3903 sum -= c[DIAG_DOWN_LEFT_PRED];
3904 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3905 sum -= c[VERT_LEFT_PRED];
3906 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3907 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/* uv intra mode tree, same structure, conditioned on the y mode */
3911 for (i = 0; i < 10; i++) {
3912 uint8_t *pp = p->uv_mode[i];
3913 unsigned *c = s->counts.uv_mode[i], sum, s2;
3915 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3916 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3917 sum -= c[TM_VP8_PRED];
3918 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3919 sum -= c[VERT_PRED];
3920 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3921 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3923 adapt_prob(&pp[3], s2, sum, 20, 128);
3925 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3926 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3927 sum -= c[DIAG_DOWN_LEFT_PRED];
3928 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3929 sum -= c[VERT_LEFT_PRED];
3930 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3931 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/* Release the decoder's size-dependent scratch buffers (intra prediction
 * line buffer, per-block structs, coefficient blocks).  av_freep() NULLs
 * the pointers so a later reallocation is safe. */
3935 static void free_buffers(VP9Context *s)
3937 av_freep(&s->intra_pred_data[0]);
3938 av_freep(&s->b_base);
3939 av_freep(&s->block_base);
/* Codec close callback: unreference and free the three internal frames
 * (current + the two MVPAIR/SEGMAP reference slots) and all eight
 * reference slots in both refs[] and next_refs[].  data[0] is checked
 * before release because unused slots hold allocated-but-empty AVFrames. */
3942 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3944 VP9Context *s = ctx->priv_data;
3947 for (i = 0; i < 3; i++) {
3948 if (s->frames[i].tf.f->data[0])
3949 vp9_unref_frame(ctx, &s->frames[i]);
3950 av_frame_free(&s->frames[i].tf.f);
3952 for (i = 0; i < 8; i++) {
3953 if (s->refs[i].f->data[0])
3954 ff_thread_release_buffer(ctx, &s->refs[i]);
3955 av_frame_free(&s->refs[i].f);
3956 if (s->next_refs[i].f->data[0])
3957 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3958 av_frame_free(&s->next_refs[i].f);
/* Main per-packet decode entry point.
 *
 * Parses the frame header; a "show existing frame" packet (header returns 0)
 * just re-outputs the referenced frame.  Otherwise it rotates the internal
 * frame slots (CUR_FRAME / REF_FRAME_MVPAIR / REF_FRAME_SEGMAP), allocates
 * the new frame, sets up next_refs[] according to refreshrefmask, resets
 * the above-row contexts, and runs the tile decode loop (optionally as two
 * passes when frame-threading with non-parallel context updates).  Each
 * superblock row is followed by intra-pred backup, loop filtering, and a
 * progress report for frame-threaded consumers.  Finally refs[] is replaced
 * by next_refs[] and the frame is output unless invisible. */
3968 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3969 int *got_frame, AVPacket *pkt)
3971 const uint8_t *data = pkt->data;
3972 int size = pkt->size;
3973 VP9Context *s = ctx->priv_data;
3974 int res, tile_row, tile_col, i, ref, row, col;
/* keep the previous segmentation map when the header doesn't update it */
3975 int retain_segmap_ref = s->segmentation.enabled && !s->segmentation.update_map;
3976 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3980 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
/* res == 0: "show existing frame" — output refs[ref] directly */
3982 } else if (res == 0) {
3983 if (!s->refs[ref].f->data[0]) {
3984 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3985 return AVERROR_INVALIDDATA;
3987 if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3989 ((AVFrame *)frame)->pkt_pts = pkt->pts;
3990 ((AVFrame *)frame)->pkt_dts = pkt->dts;
3991 for (i = 0; i < 8; i++) {
3992 if (s->next_refs[i].f->data[0])
3993 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3994 if (s->refs[i].f->data[0] &&
3995 (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
/* rotate internal frame slots: previous CUR_FRAME becomes the mv/segmap
 * reference for this frame, then allocate the new current frame */
4004 if (!retain_segmap_ref) {
4005 if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
4006 vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4007 if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4008 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
4011 if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
4012 vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
4013 if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4014 (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
4016 if (s->frames[CUR_FRAME].tf.f->data[0])
4017 vp9_unref_frame(ctx, &s->frames[CUR_FRAME])
4018 if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
4020 f = s->frames[CUR_FRAME].tf.f;
4021 f->key_frame = s->keyframe;
4022 f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4023 ls_y = f->linesize[0];
4024 ls_uv =f->linesize[1];
/* build next_refs[]: refreshed slots point at the new frame, the rest
 * keep their current reference */
4027 for (i = 0; i < 8; i++) {
4028 if (s->next_refs[i].f->data[0])
4029 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4030 if (s->refreshrefmask & (1 << i)) {
4031 res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
4032 } else if (s->refs[i].f->data[0]) {
4033 res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
4039 // main tile decode loop
4040 bytesperpixel = s->bytesperpixel;
/* reset the above-row entropy contexts for the whole frame width */
4041 memset(s->above_partition_ctx, 0, s->cols);
4042 memset(s->above_skip_ctx, 0, s->cols);
4043 if (s->keyframe || s->intraonly) {
4044 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4046 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4048 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4049 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4050 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4051 memset(s->above_segpred_ctx, 0, s->cols);
/* two-pass decoding only with frame threads + serial context refresh */
4052 s->pass = s->frames[CUR_FRAME].uses_2pass =
4053 ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
4054 if ((res = update_block_buffers(ctx)) < 0) {
4055 av_log(ctx, AV_LOG_ERROR,
4056 "Failed to allocate block buffers\n");
/* parallel mode: commit forward-updated probs now so other frame threads
 * can proceed without waiting for backward adaptation */
4059 if (s->refreshctx && s->parallelmode) {
4062 for (i = 0; i < 4; i++) {
4063 for (j = 0; j < 2; j++)
4064 for (k = 0; k < 2; k++)
4065 for (l = 0; l < 6; l++)
4066 for (m = 0; m < 6; m++)
4067 memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
4068 s->prob.coef[i][j][k][l][m], 3);
4069 if (s->txfmmode == i)
4072 s->prob_ctx[s->framectxid].p = s->prob.p;
4073 ff_thread_finish_setup(ctx);
4074 } else if (!s->refreshctx) {
4075 ff_thread_finish_setup(ctx);
/* rewind the coefficient/eob buffers at the start of each pass */
4081 s->block = s->block_base;
4082 s->uvblock[0] = s->uvblock_base[0];
4083 s->uvblock[1] = s->uvblock_base[1];
4084 s->eob = s->eob_base;
4085 s->uveob[0] = s->uveob_base[0];
4086 s->uveob[1] = s->uveob_base[1];
4088 for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
4089 set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
4090 tile_row, s->tiling.log2_tile_rows, s->sb_rows);
/* set up one range decoder per tile column; each tile except the very
 * last is preceded by a 32-bit size field */
4092 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4095 if (tile_col == s->tiling.tile_cols - 1 &&
4096 tile_row == s->tiling.tile_rows - 1) {
4099 tile_size = AV_RB32(data);
4103 if (tile_size > size) {
4104 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4105 return AVERROR_INVALIDDATA;
4107 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4108 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4109 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4110 return AVERROR_INVALIDDATA;
/* decode superblock rows: 8 8x8 blocks (64 pixels) per iteration */
4117 for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
4118 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4119 struct VP9Filter *lflvl_ptr = s->lflvl;
4120 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4122 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4123 set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
4124 tile_col, s->tiling.log2_tile_cols, s->sb_cols);
/* reset left-edge contexts at each tile-column boundary */
4127 memset(s->left_partition_ctx, 0, 8);
4128 memset(s->left_skip_ctx, 0, 8);
4129 if (s->keyframe || s->intraonly) {
4130 memset(s->left_mode_ctx, DC_PRED, 16);
4132 memset(s->left_mode_ctx, NEARESTMV, 8);
4134 memset(s->left_y_nnz_ctx, 0, 16);
4135 memset(s->left_uv_nnz_ctx, 0, 32);
4136 memset(s->left_segpred_ctx, 0, 8);
4138 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4141 for (col = s->tiling.tile_col_start;
4142 col < s->tiling.tile_col_end;
4143 col += 8, yoff2 += 64 * bytesperpixel,
4144 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4145 // FIXME integrate with lf code (i.e. zero after each
4146 // use, similar to invtxfm coefficients, or similar)
4148 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
/* pass 2 replays stored partitions; pass 0/1 parses the bitstream */
4152 decode_sb_mem(ctx, row, col, lflvl_ptr,
4153 yoff2, uvoff2, BL_64X64);
4155 decode_sb(ctx, row, col, lflvl_ptr,
4156 yoff2, uvoff2, BL_64X64);
/* save per-tile range coder state for the next superblock row */
4160 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4168 // backup pre-loopfilter reconstruction data for intra
4169 // prediction of next row of sb64s
4170 if (row + 8 < s->rows) {
4171 memcpy(s->intra_pred_data[0],
4172 f->data[0] + yoff + 63 * ls_y,
4173 8 * s->cols * bytesperpixel);
4174 memcpy(s->intra_pred_data[1],
4175 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4176 8 * s->cols * bytesperpixel >> s->ss_h);
4177 memcpy(s->intra_pred_data[2],
4178 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4179 8 * s->cols * bytesperpixel >> s->ss_h);
4182 // loopfilter one row
4183 if (s->filter.level) {
4186 lflvl_ptr = s->lflvl;
4187 for (col = 0; col < s->cols;
4188 col += 8, yoff2 += 64 * bytesperpixel,
4189 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4190 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4194 // FIXME maybe we can make this more finegrained by running the
4195 // loopfilter per-block instead of after each sbrow
4196 // In fact that would also make intra pred left preparation easier?
4197 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
/* after pass 1 of a two-pass decode: adapt probs, release other threads */
4201 if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
4203 ff_thread_finish_setup(ctx);
4205 } while (s->pass++ == 1);
4206 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
/* commit next_refs[] as the reference set for the following frame */
4209 for (i = 0; i < 8; i++) {
4210 if (s->refs[i].f->data[0])
4211 ff_thread_release_buffer(ctx, &s->refs[i]);
4212 ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
4215 if (!s->invisible) {
4216 if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
/* Flush callback (seek/reset): drop the three internal frames and all
 * eight reference slots so decoding restarts from a clean state. */
4224 static void vp9_decode_flush(AVCodecContext *ctx)
4226 VP9Context *s = ctx->priv_data;
4229 for (i = 0; i < 3; i++)
4230 vp9_unref_frame(ctx, &s->frames[i]);
4231 for (i = 0; i < 8; i++)
4232 ff_thread_release_buffer(ctx, &s->refs[i]);
/* Allocate the AVFrame shells for the three internal frame slots and the
 * eight refs/next_refs pairs.  On any allocation failure the whole decoder
 * state is torn down via vp9_decode_free() (which tolerates the partially
 * initialized state) and ENOMEM is returned. */
4235 static int init_frames(AVCodecContext *ctx)
4237 VP9Context *s = ctx->priv_data;
4240 for (i = 0; i < 3; i++) {
4241 s->frames[i].tf.f = av_frame_alloc();
4242 if (!s->frames[i].tf.f) {
4243 vp9_decode_free(ctx);
4244 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4245 return AVERROR(ENOMEM);
4248 for (i = 0; i < 8; i++) {
4249 s->refs[i].f = av_frame_alloc();
4250 s->next_refs[i].f = av_frame_alloc();
4251 if (!s->refs[i].f || !s->next_refs[i].f) {
4252 vp9_decode_free(ctx);
4253 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4254 return AVERROR(ENOMEM);
/* Codec init: enable per-frame progress allocation for frame threading,
 * mark the filter sharpness as "unset" (-1 forces the limit LUTs to be
 * rebuilt on first use), and allocate the frame shells. */
4261 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4263 VP9Context *s = ctx->priv_data;
4265 ctx->internal->allocate_progress = 1;
4267 s->filter.sharpness = -1;
4269 return init_frames(ctx);
/* Frame-thread worker init: each thread copy only needs its own frame
 * shells; all other state is copied in update_thread_context(). */
4272 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4274 return init_frames(avctx);
/* Frame-threading sync: copy decoding state from the source thread (ssrc)
 * into this thread's context — frame references, reference slots, and the
 * header/probability state the next frame's parsing depends on.  A
 * dimension change in the source thread invalidates the size-dependent
 * buffers first. */
4277 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4280 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4282 // detect size changes in other threads
4283 if (s->intra_pred_data[0] &&
4284 (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4288 for (i = 0; i < 3; i++) {
4289 if (s->frames[i].tf.f->data[0])
4290 vp9_unref_frame(dst, &s->frames[i]);
4291 if (ssrc->frames[i].tf.f->data[0]) {
4292 if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
/* our refs[] become the source's next_refs[] (its post-frame ref state) */
4296 for (i = 0; i < 8; i++) {
4297 if (s->refs[i].f->data[0])
4298 ff_thread_release_buffer(dst, &s->refs[i]);
4299 if (ssrc->next_refs[i].f->data[0]) {
4300 if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
/* scalar header state needed before the next frame header is parsed */
4305 s->invisible = ssrc->invisible;
4306 s->keyframe = ssrc->keyframe;
4307 s->ss_v = ssrc->ss_v;
4308 s->ss_h = ssrc->ss_h;
4309 s->segmentation.enabled = ssrc->segmentation.enabled;
4310 s->segmentation.update_map = ssrc->segmentation.update_map;
4311 s->bytesperpixel = ssrc->bytesperpixel;
4313 s->bpp_index = ssrc->bpp_index;
4314 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4315 memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4316 if (ssrc->segmentation.enabled) {
4317 memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4318 sizeof(s->segmentation.feat));
/* Profile names reported through AVCodec.profiles; terminated by the
 * FF_PROFILE_UNKNOWN sentinel entry. */
4324 static const AVProfile profiles[] = {
4325 { FF_PROFILE_VP9_0, "Profile 0" },
4326 { FF_PROFILE_VP9_1, "Profile 1" },
4327 { FF_PROFILE_VP9_2, "Profile 2" },
4328 { FF_PROFILE_VP9_3, "Profile 3" },
4329 { FF_PROFILE_UNKNOWN },
4332 AVCodec ff_vp9_decoder = {
4334 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4335 .type = AVMEDIA_TYPE_VIDEO,
4336 .id = AV_CODEC_ID_VP9,
4337 .priv_data_size = sizeof(VP9Context),
4338 .init = vp9_decode_init,
4339 .close = vp9_decode_free,
4340 .decode = vp9_decode_frame,
4341 .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4342 .flush = vp9_decode_flush,
4343 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4344 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4345 .profiles = NULL_IF_CONFIG_SMALL(profiles),