2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
37 #define VP9_SYNCCODE 0x498342
// Loopfilter application bitmasks for one superblock, split by plane
// (y vs uv), filter direction (column vs row edges), row within the
// superblock, and transform-size class.
// NOTE(review): the enclosing struct declaration is not visible in this
// view — confirm it is struct VP9Filter.
41 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
/* Per-block decode state: mode/reference/partition decisions parsed from
 * the bitstream for one block, consumed during reconstruction. */
45 typedef struct VP9Block {
46 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
47 enum FilterMode filter;
48 VP56mv mv[4 /* b_idx */][2 /* ref */]; // up to 4 sub-block MVs, 2 refs each
50 enum TxfmMode tx, uvtx; // luma and chroma transform sizes
52 enum BlockPartition bp;
/* Main VP9 decoder context (partial view — several members and the nested
 * probability/counts struct declarations are not visible here). */
55 typedef struct VP9Context {
66 int row, row7, col, col7; // current position; row7/col7 presumably row&7/col&7 — TODO confirm
68 ptrdiff_t y_stride, uv_stride;
71 uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
72 uint8_t last_keyframe;
73 enum AVPixelFormat pix_fmt, last_fmt;
74 ThreadFrame next_refs[8]; // reference slots for the next frame
78 uint8_t mblim_lut[64];
80 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
81 unsigned sb_cols, sb_rows, rows, cols; // dimensions in 64x64 superblocks / 8x8 blocks
// NOTE(review): the two identically-named coef[] fields below belong to
// different nested structs (saved probability contexts vs. the active
// probability model) whose opening braces are elided in this view.
84 uint8_t coef[4][2][2][6][6][3];
88 uint8_t coef[4][2][2][6][6][11];
// symbol occurrence counters used for backward probability adaptation
91 unsigned y_mode[4][10];
92 unsigned uv_mode[10][10];
93 unsigned filter[4][3];
94 unsigned mv_mode[7][4];
97 unsigned single_ref[5][2][2];
98 unsigned comp_ref[5][2];
100 unsigned tx16p[2][3];
103 unsigned mv_joint[4];
106 unsigned classes[11];
108 unsigned bits[10][2];
109 unsigned class0_fp[2][4];
111 unsigned class0_hp[2];
114 unsigned partition[4][4][4];
115 unsigned coef[4][2][2][6][6][3];
116 unsigned eob[4][2][2][6][6][2];
119 // contextual (left/above) cache
120 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
121 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
122 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
123 DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
124 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
125 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
126 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
127 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
128 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
129 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
130 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
131 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// above-row context arrays; all point into the single allocation made in
// update_size() (see the assign() macro there)
132 uint8_t *above_partition_ctx;
133 uint8_t *above_mode_ctx;
134 // FIXME maybe merge some of the below in a flags field?
135 uint8_t *above_y_nnz_ctx;
136 uint8_t *above_uv_nnz_ctx[2];
137 uint8_t *above_skip_ctx; // 1bit
138 uint8_t *above_txfm_ctx; // 2bit
139 uint8_t *above_segpred_ctx; // 1bit
140 uint8_t *above_intra_ctx; // 1bit
141 uint8_t *above_comp_ctx; // 1bit
142 uint8_t *above_ref_ctx; // 2bit
143 uint8_t *above_filter_ctx;
144 VP56mv (*above_mv_ctx)[2];
147 uint8_t *intra_pred_data[3];
148 struct VP9Filter *lflvl;
149 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
151 // block reconstruction intermediates
152 int block_alloc_using_2pass; // whether block_base was sized for 2-pass decoding
153 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
154 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
155 struct { int x, y; } min_mv, max_mv; // MV clamping bounds for current block
156 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
157 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
158 uint16_t mvscale[3][2]; // 14-bit fixed-point ref-to-current scaling, per ref
159 uint8_t mvstep[3][2];
// Block {width, height} per block size, in 4x4 units (first table) and
// 8x8 units (second table) — TODO confirm unit order against N_BS_SIZES.
162 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
164 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
165 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
167 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
168 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
/* Release all buffers held by a VP9Frame: the threaded picture buffer,
 * the extradata buffer (which backs segmentation_map/mv), and any
 * hwaccel private buffer. Pointers into freed buffers are NULLed. */
172 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
174 ff_thread_release_buffer(ctx, &f->tf);
175 av_buffer_unref(&f->extradata);
176 av_buffer_unref(&f->hwaccel_priv_buf);
177 f->segmentation_map = NULL; // pointed into extradata, now freed
178 f->hwaccel_picture_private = NULL;
/* Allocate a VP9Frame: get a (possibly threaded) picture buffer, then one
 * extradata buffer holding the segmentation map (sz bytes, one byte per
 * 8x8 block) followed by the per-block MV/ref pairs, plus the hwaccel
 * private buffer if a hwaccel is active.
 * Returns 0 on success; on failure everything is released via
 * vp9_unref_frame() and AVERROR(ENOMEM) is returned. */
181 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
183 VP9Context *s = ctx->priv_data;
186 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
188 sz = 64 * s->sb_cols * s->sb_rows; // 8x8 blocks per frame
189 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
193 f->segmentation_map = f->extradata->data;
194 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
197 const AVHWAccel *hwaccel = ctx->hwaccel;
198 av_assert0(!f->hwaccel_picture_private);
199 if (hwaccel->frame_priv_data_size) {
200 f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
201 if (!f->hwaccel_priv_buf)
203 f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
// shared failure path: release partially-allocated frame
210 vp9_unref_frame(ctx, f);
211 return AVERROR(ENOMEM);
/* Make dst an additional reference to src: new refs on the picture and
 * extradata buffers (data pointers are shared, not copied), plus the
 * hwaccel private buffer when present.
 * On any failure dst is fully unreferenced and AVERROR(ENOMEM) returned. */
214 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
218 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
220 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
// raw pointers alias into the now-shared extradata buffer
224 dst->segmentation_map = src->segmentation_map;
226 dst->uses_2pass = src->uses_2pass;
228 if (src->hwaccel_picture_private) {
229 dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
230 if (!dst->hwaccel_priv_buf)
232 dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
// failure path: drop whatever was referenced so far
238 vp9_unref_frame(ctx, dst);
239 return AVERROR(ENOMEM);
/* (Re)configure the decoder for a new frame size / pixel format:
 * negotiate the output format (offering hwaccel formats where the build
 * supports them), recompute superblock/block grid dimensions, and
 * (re)allocate the single per-column context allocation that all the
 * above_* pointers, intra_pred_data planes and lflvl point into.
 * Returns 0 on success or a negative AVERROR code. */
242 static int update_size(AVCodecContext *ctx, int w, int h)
244 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
245 enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
246 VP9Context *s = ctx->priv_data;
248 int bytesperpixel = s->bytesperpixel, res;
250 av_assert0(w > 0 && h > 0);
// fast path: nothing changed since the last (successful) configuration
252 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && s->pix_fmt == s->last_fmt)
255 if ((res = ff_set_dimensions(ctx, w, h)) < 0)
// hwaccels are only offered for 8-bit 4:2:0
258 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
259 #if CONFIG_VP9_DXVA2_HWACCEL
260 *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
262 #if CONFIG_VP9_D3D11VA_HWACCEL
263 *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
265 #if CONFIG_VP9_VAAPI_HWACCEL
266 *fmtp++ = AV_PIX_FMT_VAAPI;
270 *fmtp++ = s->pix_fmt; // software format is always the fallback
271 *fmtp = AV_PIX_FMT_NONE;
273 res = ff_thread_get_format(ctx, pix_fmts);
278 s->last_fmt = s->pix_fmt;
279 s->sb_cols = (w + 63) >> 6; // 64x64 superblock grid
280 s->sb_rows = (h + 63) >> 6;
281 s->cols = (w + 7) >> 3; // 8x8 block grid
282 s->rows = (h + 7) >> 3;
// carve successive sub-arrays out of the single allocation below
284 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
285 av_freep(&s->intra_pred_data[0]);
286 // FIXME we slightly over-allocate here for subsampled chroma, but a little
287 // bit of padding shouldn't affect performance...
288 p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
289 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
291 return AVERROR(ENOMEM);
292 assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
293 assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
294 assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
295 assign(s->above_y_nnz_ctx, uint8_t *, 16);
296 assign(s->above_mode_ctx, uint8_t *, 16);
297 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
298 assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
299 assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
300 assign(s->above_partition_ctx, uint8_t *, 8);
301 assign(s->above_skip_ctx, uint8_t *, 8);
302 assign(s->above_txfm_ctx, uint8_t *, 8);
303 assign(s->above_segpred_ctx, uint8_t *, 8);
304 assign(s->above_intra_ctx, uint8_t *, 8);
305 assign(s->above_comp_ctx, uint8_t *, 8);
306 assign(s->above_ref_ctx, uint8_t *, 8);
307 assign(s->above_filter_ctx, uint8_t *, 8);
308 assign(s->lflvl, struct VP9Filter *, 1);
311 // these will be re-allocated a little later
312 av_freep(&s->b_base);
313 av_freep(&s->block_base);
// bit depth changed: DSP function tables must be re-selected
315 if (s->bpp != s->last_bpp) {
316 ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
317 ff_videodsp_init(&s->vdsp, s->bpp);
318 s->last_bpp = s->bpp;
/* (Re)allocate the per-block coefficient/EOB scratch buffers. In 2-pass
 * decoding every superblock needs its own storage (sized by sbs); in
 * single-pass mode one superblock's worth is reused. A single
 * block_base allocation is partitioned into luma coeffs, the two chroma
 * coeff planes, and the luma/chroma EOB arrays.
 * Returns 0 on success or AVERROR(ENOMEM). */
324 static int update_block_buffers(AVCodecContext *ctx)
326 VP9Context *s = ctx->priv_data;
327 int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
// fast path: buffers exist and the 2-pass mode didn't change
329 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
333 av_free(s->block_base);
334 chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v); // chroma samples per sb
335 chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v); // chroma 4x4 units per sb
336 if (s->s.frames[CUR_FRAME].uses_2pass) {
337 int sbs = s->sb_cols * s->sb_rows;
339 s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
340 s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
341 16 * 16 + 2 * chroma_eobs) * sbs);
342 if (!s->b_base || !s->block_base)
343 return AVERROR(ENOMEM);
344 s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
345 s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
346 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
347 s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
348 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
// single-pass: one block / one superblock of scratch, reused as we go
350 s->b_base = av_malloc(sizeof(VP9Block));
351 s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
352 16 * 16 + 2 * chroma_eobs);
353 if (!s->b_base || !s->block_base)
354 return AVERROR(ENOMEM);
355 s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
356 s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
357 s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
358 s->uveob_base[0] = s->eob_base + 16 * 16;
359 s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
361 s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
366 // for some reason the sign bit is at the end, not the start, of a bit sequence
/* Read an n-bit magnitude followed by one sign bit; returns -v when the
 * sign bit is set, v otherwise. */
367 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
369 int v = get_bits(gb, n);
370 return get_bits1(gb) ? -v : v;
373 static av_always_inline int inv_recenter_nonneg(int v, int m)
375 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
378 // differential forward probability updates
/* Decode a subexponentially-coded delta from the range coder and apply
 * it to probability p (in [1,255]), returning the updated probability.
 * inv_map_table[] undoes the reordering that places the 20 "cheap"
 * coarse update values first. */
379 static int update_prob(VP56RangeCoder *c, int p)
381 static const int inv_map_table[255] = {
382 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
383 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
384 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
385 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
386 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
387 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
388 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
389 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
390 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
391 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
392 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
393 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
394 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
395 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
396 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
397 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
398 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
399 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
404 /* This code is trying to do a differential probability update. For a
405 * current probability A in the range [1, 255], the difference to a new
406 * probability of any value can be expressed differentially as 1-A,255-A
407 * where some part of this (absolute range) exists both in positive as
408 * well as the negative part, whereas another part only exists in one
409 * half. We're trying to code this shared part differentially, i.e.
410 * times two where the value of the lowest bit specifies the sign, and
411 * the single part is then coded on top of this. This absolute difference
412 * then again has a value of [0,254], but a bigger value in this range
413 * indicates that we're further away from the original value A, so we
414 * can code this as a VLC code, since higher values are increasingly
415 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
416 * updates vs. the 'fine, exact' updates further down the range, which
417 * adds one extra dimension to this differential update model. */
// VLC: successive escape bits select 4/4/5/7-bit suffixes with offsets
419 if (!vp8_rac_get(c)) {
420 d = vp8_rac_get_uint(c, 4) + 0;
421 } else if (!vp8_rac_get(c)) {
422 d = vp8_rac_get_uint(c, 4) + 16;
423 } else if (!vp8_rac_get(c)) {
424 d = vp8_rac_get_uint(c, 5) + 32;
426 d = vp8_rac_get_uint(c, 7);
// 7-bit class: expand to the top of the range with one extra bit
428 d = (d << 1) - 65 + vp8_rac_get(c);
430 av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
// mirror the recentering around whichever side of 128 p lies on, so the
// result stays in [1, 255]
433 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
434 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
/* Parse bit depth (profiles 2/3 only), colorspace, range and chroma
 * subsampling from the uncompressed header; sets s->bpp, bytesperpixel,
 * ss_h/ss_v and s->pix_fmt.
 * Returns 0 on success, AVERROR_INVALIDDATA on reserved/unsupported
 * combinations. */
437 static int read_colorspace_details(AVCodecContext *ctx)
439 static const enum AVColorSpace colorspaces[8] = {
440 AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
441 AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
443 VP9Context *s = ctx->priv_data;
444 int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
447 s->bpp = 8 + bits * 2;
448 s->bytesperpixel = (7 + s->bpp) >> 3;
449 ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
450 if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
451 static const enum AVPixelFormat pix_fmt_rgb[3] = {
452 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
// RGB is always 4:4:4 full-range
454 s->ss_h = s->ss_v = 0;
455 ctx->color_range = AVCOL_RANGE_JPEG;
456 s->pix_fmt = pix_fmt_rgb[bits];
457 if (ctx->profile & 1) {
458 if (get_bits1(&s->gb)) {
459 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
460 return AVERROR_INVALIDDATA;
463 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
465 return AVERROR_INVALIDDATA;
// YUV path: pick pixel format from bit depth and subsampling
468 static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
469 { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
470 { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
471 { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
472 { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
473 { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
474 { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
476 ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
// odd profiles (1/3) carry explicit subsampling bits
477 if (ctx->profile & 1) {
478 s->ss_h = get_bits1(&s->gb);
479 s->ss_v = get_bits1(&s->gb);
480 s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
481 if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
482 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
484 return AVERROR_INVALIDDATA;
485 } else if (get_bits1(&s->gb)) {
486 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
488 return AVERROR_INVALIDDATA;
// even profiles (0/2) are implicitly 4:2:0
491 s->ss_h = s->ss_v = 1;
492 s->pix_fmt = pix_fmt_for_ss[bits][1][1];
/* Parse a complete VP9 frame header: the uncompressed (bit-exact) part
 * via the GetBitContext, then the compressed (arith-coded) probability
 * update part via the VP56 range coder.
 * On a "show existing frame" packet, *ref is set to the reference slot
 * and parsing stops early (in elided code). Otherwise returns the total
 * number of header bytes consumed (uncompressed + compressed), or a
 * negative AVERROR code on invalid data / OOM. */
499 static int decode_frame_header(AVCodecContext *ctx,
500 const uint8_t *data, int size, int *ref)
502 VP9Context *s = ctx->priv_data;
503 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
505 const uint8_t *data2;
508 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
509 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
512 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
513 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
514 return AVERROR_INVALIDDATA;
// profile is 2 bits, plus a third "reserved" bit when both are set
516 ctx->profile = get_bits1(&s->gb);
517 ctx->profile |= get_bits1(&s->gb) << 1;
518 if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
519 if (ctx->profile > 3) {
520 av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
521 return AVERROR_INVALIDDATA;
523 s->s.h.profile = ctx->profile;
// show_existing_frame: just output a previously decoded reference
524 if (get_bits1(&s->gb)) {
525 *ref = get_bits(&s->gb, 3);
528 s->last_keyframe = s->s.h.keyframe;
529 s->s.h.keyframe = !get_bits1(&s->gb);
530 last_invisible = s->s.h.invisible;
531 s->s.h.invisible = !get_bits1(&s->gb);
532 s->s.h.errorres = get_bits1(&s->gb);
533 s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
534 if (s->s.h.keyframe) {
535 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
536 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
537 return AVERROR_INVALIDDATA;
539 if ((res = read_colorspace_details(ctx)) < 0)
541 // for profile 1, here follows the subsampling bits
542 s->s.h.refreshrefmask = 0xff; // keyframes refresh all reference slots
543 w = get_bits(&s->gb, 16) + 1;
544 h = get_bits(&s->gb, 16) + 1;
545 if (get_bits1(&s->gb)) // display size
546 skip_bits(&s->gb, 32);
548 s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
549 s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
550 if (s->s.h.intraonly) {
551 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
552 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
553 return AVERROR_INVALIDDATA;
555 if (ctx->profile >= 1) {
556 if ((res = read_colorspace_details(ctx)) < 0)
// profile 0 intra-only frames are implicitly 8-bit 4:2:0 BT.470BG
559 s->ss_h = s->ss_v = 1;
562 s->bytesperpixel = 1;
563 s->pix_fmt = AV_PIX_FMT_YUV420P;
564 ctx->colorspace = AVCOL_SPC_BT470BG;
565 ctx->color_range = AVCOL_RANGE_JPEG;
567 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
568 w = get_bits(&s->gb, 16) + 1;
569 h = get_bits(&s->gb, 16) + 1;
570 if (get_bits1(&s->gb)) // display size
571 skip_bits(&s->gb, 32);
// inter frame: reference indices, sign biases, and frame size (either
// inherited from a reference or coded explicitly)
573 s->s.h.refreshrefmask = get_bits(&s->gb, 8);
574 s->s.h.refidx[0] = get_bits(&s->gb, 3);
575 s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
576 s->s.h.refidx[1] = get_bits(&s->gb, 3);
577 s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
578 s->s.h.refidx[2] = get_bits(&s->gb, 3);
579 s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
580 if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
581 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
582 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
583 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
584 return AVERROR_INVALIDDATA;
586 if (get_bits1(&s->gb)) {
587 w = s->s.refs[s->s.h.refidx[0]].f->width;
588 h = s->s.refs[s->s.h.refidx[0]].f->height;
589 } else if (get_bits1(&s->gb)) {
590 w = s->s.refs[s->s.h.refidx[1]].f->width;
591 h = s->s.refs[s->s.h.refidx[1]].f->height;
592 } else if (get_bits1(&s->gb)) {
593 w = s->s.refs[s->s.h.refidx[2]].f->width;
594 h = s->s.refs[s->s.h.refidx[2]].f->height;
596 w = get_bits(&s->gb, 16) + 1;
597 h = get_bits(&s->gb, 16) + 1;
599 // Note that in this code, "CUR_FRAME" is actually before we
600 // have formally allocated a frame, and thus actually represents
// previous-frame MVs are only reusable if the size didn't change
602 s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
603 s->s.frames[CUR_FRAME].tf.f->height == h;
604 if (get_bits1(&s->gb)) // display size
605 skip_bits(&s->gb, 32);
606 s->s.h.highprecisionmvs = get_bits1(&s->gb);
607 s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction is allowed when the refs disagree on sign bias;
// the lone same-bias ref becomes the "fixed" compound reference
609 s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
610 s->s.h.signbias[0] != s->s.h.signbias[2];
611 if (s->s.h.allowcompinter) {
612 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
613 s->s.h.fixcompref = 2;
614 s->s.h.varcompref[0] = 0;
615 s->s.h.varcompref[1] = 1;
616 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
617 s->s.h.fixcompref = 1;
618 s->s.h.varcompref[0] = 0;
619 s->s.h.varcompref[1] = 2;
621 s->s.h.fixcompref = 0;
622 s->s.h.varcompref[0] = 1;
623 s->s.h.varcompref[1] = 2;
628 s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
629 s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
630 s->s.h.framectxid = c = get_bits(&s->gb, 2);
632 /* loopfilter header data */
633 if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
634 // reset loopfilter defaults
635 s->s.h.lf_delta.ref[0] = 1;
636 s->s.h.lf_delta.ref[1] = 0;
637 s->s.h.lf_delta.ref[2] = -1;
638 s->s.h.lf_delta.ref[3] = -1;
639 s->s.h.lf_delta.mode[0] = 0;
640 s->s.h.lf_delta.mode[1] = 0;
641 memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
643 s->s.h.filter.level = get_bits(&s->gb, 6);
644 sharp = get_bits(&s->gb, 3);
645 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
646 // the old cache values since they are still valid
647 if (s->s.h.filter.sharpness != sharp)
648 memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
649 s->s.h.filter.sharpness = sharp;
650 if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
651 if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
652 for (i = 0; i < 4; i++)
653 if (get_bits1(&s->gb))
654 s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
655 for (i = 0; i < 2; i++)
656 if (get_bits1(&s->gb))
657 s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
661 /* quantization header data */
662 s->s.h.yac_qi = get_bits(&s->gb, 8);
663 s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
664 s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
665 s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
666 s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
667 s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
669 ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
671 /* segmentation header info */
672 if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
673 if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
674 for (i = 0; i < 7; i++)
675 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
676 get_bits(&s->gb, 8) : 255;
677 if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
678 for (i = 0; i < 3; i++)
679 s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
680 get_bits(&s->gb, 8) : 255;
// per-segment feature data (quantizer / loopfilter / ref / skip)
684 if (get_bits1(&s->gb)) {
685 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
686 for (i = 0; i < 8; i++) {
687 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
688 s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
689 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
690 s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
691 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
692 s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
693 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
698 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
699 for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
700 int qyac, qydc, quvac, quvdc, lflvl, sh;
702 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
703 if (s->s.h.segmentation.absolute_vals)
704 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
706 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
708 qyac = s->s.h.yac_qi;
710 qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
711 quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
712 quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
713 qyac = av_clip_uintp2(qyac, 8);
715 s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
716 s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
717 s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
718 s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
// sh: scale lf deltas up when the base filter level is high
720 sh = s->s.h.filter.level >= 32;
721 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
722 if (s->s.h.segmentation.absolute_vals)
723 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
725 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
727 lflvl = s->s.h.filter.level;
729 if (s->s.h.lf_delta.enabled) {
730 s->s.h.segmentation.feat[i].lflvl[0][0] =
731 s->s.h.segmentation.feat[i].lflvl[0][1] =
732 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] << sh), 6);
733 for (j = 1; j < 4; j++) {
734 s->s.h.segmentation.feat[i].lflvl[j][0] =
735 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
736 s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
737 s->s.h.segmentation.feat[i].lflvl[j][1] =
738 av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
739 s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
742 memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
743 sizeof(s->s.h.segmentation.feat[i].lflvl));
748 if ((res = update_size(ctx, w, h)) < 0) {
749 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
// tile columns: pick a default log2 from sb_cols, then optionally grow
// it bit-by-bit up to the spec-imposed maximum
753 for (s->s.h.tiling.log2_tile_cols = 0;
754 s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
755 s->s.h.tiling.log2_tile_cols++) ;
756 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
757 max = FFMAX(0, max - 1);
758 while (max > s->s.h.tiling.log2_tile_cols) {
759 if (get_bits1(&s->gb))
760 s->s.h.tiling.log2_tile_cols++;
764 s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
765 s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
766 if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
767 s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
768 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
769 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
771 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
772 return AVERROR(ENOMEM);
776 /* check reference frames */
777 if (!s->s.h.keyframe && !s->s.h.intraonly) {
778 for (i = 0; i < 3; i++) {
779 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
780 int refw = ref->width, refh = ref->height;
782 if (ref->format != ctx->pix_fmt) {
783 av_log(ctx, AV_LOG_ERROR,
784 "Ref pixfmt (%s) did not match current frame (%s)",
785 av_get_pix_fmt_name(ref->format),
786 av_get_pix_fmt_name(ctx->pix_fmt));
787 return AVERROR_INVALIDDATA;
788 } else if (refw == w && refh == h) {
789 s->mvscale[i][0] = s->mvscale[i][1] = 0; // no scaling needed
// scaled prediction: ref may be at most 2x smaller / 16x larger
791 if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
792 av_log(ctx, AV_LOG_ERROR,
793 "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
795 return AVERROR_INVALIDDATA;
797 s->mvscale[i][0] = (refw << 14) / w; // 14-bit fixed-point ratio
798 s->mvscale[i][1] = (refh << 14) / h;
799 s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
800 s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
// reset probability contexts to defaults when required
805 if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
806 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
807 s->prob_ctx[3].p = vp9_default_probs;
808 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
809 sizeof(vp9_default_coef_probs));
810 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
811 sizeof(vp9_default_coef_probs));
812 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
813 sizeof(vp9_default_coef_probs));
814 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
815 sizeof(vp9_default_coef_probs));
816 } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
817 s->prob_ctx[c].p = vp9_default_probs;
818 memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
819 sizeof(vp9_default_coef_probs));
822 // next 16 bits is size of the rest of the header (arith-coded)
823 s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
824 s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
826 data2 = align_get_bits(&s->gb);
827 if (size2 > size - (data2 - data)) {
828 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
829 return AVERROR_INVALIDDATA;
831 ff_vp56_init_range_decoder(&s->c, data2, size2);
832 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
833 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
834 return AVERROR_INVALIDDATA;
// reset adaptation counters for the coming frame
837 if (s->s.h.keyframe || s->s.h.intraonly) {
838 memset(s->counts.coef, 0, sizeof(s->counts.coef));
839 memset(s->counts.eob, 0, sizeof(s->counts.eob));
841 memset(&s->counts, 0, sizeof(s->counts));
843 // FIXME is it faster to not copy here, but do it down in the fw updates
844 // as explicit copies if the fw update is missing (and skip the copy upon
846 s->prob.p = s->prob_ctx[c].p;
// transform mode
849 if (s->s.h.lossless) {
850 s->s.h.txfmmode = TX_4X4;
852 s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
853 if (s->s.h.txfmmode == 3)
854 s->s.h.txfmmode += vp8_rac_get(&s->c);
856 if (s->s.h.txfmmode == TX_SWITCHABLE) {
857 for (i = 0; i < 2; i++)
858 if (vp56_rac_get_prob_branchy(&s->c, 252))
859 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
860 for (i = 0; i < 2; i++)
861 for (j = 0; j < 2; j++)
862 if (vp56_rac_get_prob_branchy(&s->c, 252))
863 s->prob.p.tx16p[i][j] =
864 update_prob(&s->c, s->prob.p.tx16p[i][j]);
865 for (i = 0; i < 2; i++)
866 for (j = 0; j < 3; j++)
867 if (vp56_rac_get_prob_branchy(&s->c, 252))
868 s->prob.p.tx32p[i][j] =
869 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// coefficient probability updates, per transform size
874 for (i = 0; i < 4; i++) {
875 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
876 if (vp8_rac_get(&s->c)) {
877 for (j = 0; j < 2; j++)
878 for (k = 0; k < 2; k++)
879 for (l = 0; l < 6; l++)
880 for (m = 0; m < 6; m++) {
881 uint8_t *p = s->prob.coef[i][j][k][l][m];
882 uint8_t *r = ref[j][k][l][m];
883 if (m >= 3 && l == 0) // dc only has 3 pt
885 for (n = 0; n < 3; n++) {
886 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
887 p[n] = update_prob(&s->c, r[n]);
// no update: copy probabilities from the saved context
895 for (j = 0; j < 2; j++)
896 for (k = 0; k < 2; k++)
897 for (l = 0; l < 6; l++)
898 for (m = 0; m < 6; m++) {
899 uint8_t *p = s->prob.coef[i][j][k][l][m];
900 uint8_t *r = ref[j][k][l][m];
901 if (m > 3 && l == 0) // dc only has 3 pt
907 if (s->s.h.txfmmode == i)
// skip-flag probabilities
912 for (i = 0; i < 3; i++)
913 if (vp56_rac_get_prob_branchy(&s->c, 252))
914 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
// inter-frame-only probability updates
915 if (!s->s.h.keyframe && !s->s.h.intraonly) {
916 for (i = 0; i < 7; i++)
917 for (j = 0; j < 3; j++)
918 if (vp56_rac_get_prob_branchy(&s->c, 252))
919 s->prob.p.mv_mode[i][j] =
920 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
922 if (s->s.h.filtermode == FILTER_SWITCHABLE)
923 for (i = 0; i < 4; i++)
924 for (j = 0; j < 2; j++)
925 if (vp56_rac_get_prob_branchy(&s->c, 252))
926 s->prob.p.filter[i][j] =
927 update_prob(&s->c, s->prob.p.filter[i][j]);
929 for (i = 0; i < 4; i++)
930 if (vp56_rac_get_prob_branchy(&s->c, 252))
931 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
933 if (s->s.h.allowcompinter) {
934 s->s.h.comppredmode = vp8_rac_get(&s->c);
935 if (s->s.h.comppredmode)
936 s->s.h.comppredmode += vp8_rac_get(&s->c);
937 if (s->s.h.comppredmode == PRED_SWITCHABLE)
938 for (i = 0; i < 5; i++)
939 if (vp56_rac_get_prob_branchy(&s->c, 252))
941 update_prob(&s->c, s->prob.p.comp[i]);
943 s->s.h.comppredmode = PRED_SINGLEREF;
946 if (s->s.h.comppredmode != PRED_COMPREF) {
947 for (i = 0; i < 5; i++) {
948 if (vp56_rac_get_prob_branchy(&s->c, 252))
949 s->prob.p.single_ref[i][0] =
950 update_prob(&s->c, s->prob.p.single_ref[i][0]);
951 if (vp56_rac_get_prob_branchy(&s->c, 252))
952 s->prob.p.single_ref[i][1] =
953 update_prob(&s->c, s->prob.p.single_ref[i][1]);
957 if (s->s.h.comppredmode != PRED_SINGLEREF) {
958 for (i = 0; i < 5; i++)
959 if (vp56_rac_get_prob_branchy(&s->c, 252))
960 s->prob.p.comp_ref[i] =
961 update_prob(&s->c, s->prob.p.comp_ref[i]);
964 for (i = 0; i < 4; i++)
965 for (j = 0; j < 9; j++)
966 if (vp56_rac_get_prob_branchy(&s->c, 252))
967 s->prob.p.y_mode[i][j] =
968 update_prob(&s->c, s->prob.p.y_mode[i][j]);
970 for (i = 0; i < 4; i++)
971 for (j = 0; j < 4; j++)
972 for (k = 0; k < 3; k++)
973 if (vp56_rac_get_prob_branchy(&s->c, 252))
974 s->prob.p.partition[3 - i][j][k] =
975 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
977 // mv fields don't use the update_prob subexp model for some reason
978 for (i = 0; i < 3; i++)
979 if (vp56_rac_get_prob_branchy(&s->c, 252))
980 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
982 for (i = 0; i < 2; i++) {
983 if (vp56_rac_get_prob_branchy(&s->c, 252))
984 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
986 for (j = 0; j < 10; j++)
987 if (vp56_rac_get_prob_branchy(&s->c, 252))
988 s->prob.p.mv_comp[i].classes[j] =
989 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
991 if (vp56_rac_get_prob_branchy(&s->c, 252))
992 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
994 for (j = 0; j < 10; j++)
995 if (vp56_rac_get_prob_branchy(&s->c, 252))
996 s->prob.p.mv_comp[i].bits[j] =
997 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1000 for (i = 0; i < 2; i++) {
1001 for (j = 0; j < 2; j++)
1002 for (k = 0; k < 3; k++)
1003 if (vp56_rac_get_prob_branchy(&s->c, 252))
1004 s->prob.p.mv_comp[i].class0_fp[j][k] =
1005 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1007 for (j = 0; j < 3; j++)
1008 if (vp56_rac_get_prob_branchy(&s->c, 252))
1009 s->prob.p.mv_comp[i].fp[j] =
1010 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1013 if (s->s.h.highprecisionmvs) {
1014 for (i = 0; i < 2; i++) {
1015 if (vp56_rac_get_prob_branchy(&s->c, 252))
1016 s->prob.p.mv_comp[i].class0_hp =
1017 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1019 if (vp56_rac_get_prob_branchy(&s->c, 252))
1020 s->prob.p.mv_comp[i].hp =
1021 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total header size consumed: uncompressed part + compressed part
1026 return (data2 - data) + size2;
/* Clamp a motion vector into the frame's legal MV range.
 * NOTE(review): this chunk is an elided extract -- the rest of the
 * parameter list (presumably "VP9Context *s)", given the call sites
 * clamp_mv(&tmp, &mv, s)) and the braces are missing between the
 * numbered lines below; confirm against the full file. */
1029 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1032 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1033 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
/* Build a predicted motion vector (*pmv) for reference frame 'ref' by
 * scanning spatial neighbours, this block's own sub-block MVs, and the
 * co-located MV of the previous frame, in VP9's priority order.
 *
 * Parameters (as visible here):
 *   pmv - output MV (written via the RETURN_* macros and finally clamped)
 *   ref - reference-frame index being predicted
 *   z   - which MV of a compound pair (0 or 1) to read from b->mv[][]
 *   idx - candidate index requested (av_assert2(idx == 1) appears in the
 *         sub-8x8 path, so idx selects first vs second candidate)
 *   sb  - sub-block index within an 8x8 block, or a sentinel (callers in
 *         fill_mv() pass -1 for NEWMV)
 *
 * NOTE(review): this chunk is an elided extract; lines are missing
 * between the embedded original line numbers (e.g. macro continuations
 * and closing braces). Do not assume the visible lines are contiguous. */
1036 static void find_ref_mvs(VP9Context *s,
1037 VP56mv *pmv, int ref, int z, int idx, int sb)
/* Per-block-size table of (column, row) offsets of the 8 neighbouring
 * positions probed for candidate MVs, in probing order. */
1039 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1040 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1041 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1042 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1043 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1044 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1045 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1046 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1047 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1048 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1049 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1050 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1051 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1052 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1053 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1054 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1055 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1056 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1057 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1058 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1059 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1060 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1061 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1062 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1063 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1064 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1065 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
/* Current block position in 8x8 units; row7 is the row within the
 * 64x64 superblock (see VP9Context fields). */
1068 int row = s->row, col = s->col, row7 = s->row7;
1069 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
/* Sentinel: a packed 32-bit MV value that cannot occur, used to mark
 * "no candidate recorded yet" in mem/mem_sub8x8. */
1070 #define INVALID_MV 0x80008000U
1071 uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
/* Return an already-decoded sub-block MV of this same block verbatim
 * (no clamping), remembering it in 'mem' so a second distinct
 * candidate can be detected. (Macro body partially elided here.) */
1074 #define RETURN_DIRECT_MV(mv) \
1076 uint32_t m = AV_RN32A(&mv); \
1080 } else if (mem == INVALID_MV) { \
1082 } else if (m != mem) { \
1089 if (sb == 2 || sb == 1) {
1090 RETURN_DIRECT_MV(b->mv[0][z]);
1091 } else if (sb == 3) {
1092 RETURN_DIRECT_MV(b->mv[2][z]);
1093 RETURN_DIRECT_MV(b->mv[1][z]);
1094 RETURN_DIRECT_MV(b->mv[0][z]);
/* Return a neighbour MV: in the sub-8x8 second-candidate case the MV is
 * clamped and deduplicated against mem/mem_sub8x8 before being accepted;
 * otherwise it is clamped into *pmv directly. Note the in-tree "BUG"
 * comment below: behaviour intentionally mirrors a libvpx quirk. */
1097 #define RETURN_MV(mv) \
1102 av_assert2(idx == 1); \
1103 av_assert2(mem != INVALID_MV); \
1104 if (mem_sub8x8 == INVALID_MV) { \
1105 clamp_mv(&tmp, &mv, s); \
1106 m = AV_RN32A(&tmp); \
1111 mem_sub8x8 = AV_RN32A(&mv); \
1112 } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1113 clamp_mv(&tmp, &mv, s); \
1114 m = AV_RN32A(&tmp); \
1118 /* BUG I'm pretty sure this isn't the intention */ \
1124 uint32_t m = AV_RN32A(&mv); \
1126 clamp_mv(pmv, &mv, s); \
1128 } else if (mem == INVALID_MV) { \
1130 } else if (m != mem) { \
1131 clamp_mv(pmv, &mv, s); \
1138 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1139 if (mv->ref[0] == ref) {
1140 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1141 } else if (mv->ref[1] == ref) {
1142 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
/* Left neighbour (only if not at the left tile edge). */
1145 if (col > s->tile_col_start) {
1146 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1147 if (mv->ref[0] == ref) {
1148 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1149 } else if (mv->ref[1] == ref) {
1150 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
/* Probe the remaining neighbour offsets from mv_ref_blk_off, bounded to
 * the current tile horizontally and the frame vertically. */
1158 // previously coded MVs in this neighbourhood, using same reference frame
1159 for (; i < 8; i++) {
1160 int c = p[i][0] + col, r = p[i][1] + row;
1162 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1163 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1165 if (mv->ref[0] == ref) {
1166 RETURN_MV(mv->mv[0]);
1167 } else if (mv->ref[1] == ref) {
1168 RETURN_MV(mv->mv[1]);
1173 // MV at this position in previous frame, using same reference frame
1174 if (s->s.h.use_last_frame_mvs) {
1175 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
/* With frame-threading, wait until the reference frame's co-located row
 * has been decoded before reading its MVs (skip when the frame was
 * decoded in 2-pass mode -- its MVs are already complete). */
1177 if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1178 ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1179 if (mv->ref[0] == ref) {
1180 RETURN_MV(mv->mv[0]);
1181 } else if (mv->ref[1] == ref) {
1182 RETURN_MV(mv->mv[1]);
/* Second pass: accept MVs pointing at *other* reference frames, negating
 * them when the two references have opposite sign bias. */
1186 #define RETURN_SCALE_MV(mv, scale) \
1189 VP56mv mv_temp = { -mv.x, -mv.y }; \
1190 RETURN_MV(mv_temp); \
1196 // previously coded MVs in this neighbourhood, using different reference frame
1197 for (i = 0; i < 8; i++) {
1198 int c = p[i][0] + col, r = p[i][1] + row;
1200 if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1201 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1203 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1204 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1206 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1207 // BUG - libvpx has this condition regardless of whether
1208 // we used the first ref MV and pre-scaling
1209 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1210 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1215 // MV at this position in previous frame, using different reference frame
1216 if (s->s.h.use_last_frame_mvs) {
1217 struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1219 // no need to await_progress, because we already did that above
1220 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1221 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1223 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1224 // BUG - libvpx has this condition regardless of whether
1225 // we used the first ref MV and pre-scaling
1226 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1227 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
/* Fallback: no candidate matched -- clamp whatever *pmv currently holds
 * so the result is always within the legal MV range. */
1232 clamp_mv(pmv, pmv, s);
1235 #undef RETURN_SCALE_MV
/* Decode one motion-vector component (row or column) delta from the
 * range coder, using the per-component probability set
 * s->prob.p.mv_comp[idx] and updating the matching entropy counters in
 * s->counts for backward probability adaptation.
 *
 *   idx - 0 for the row component, 1 for the column component (matches
 *         the callers in fill_mv(): read_mv_component(s, 0/1, hp))
 *   hp  - whether to read the extra high-precision (1/8-pel) bit
 *
 * Returns the signed component delta: sign ? -(n + 1) : (n + 1).
 * NOTE(review): interior lines (the class-0 branch structure and some
 * closing braces) are elided in this extract. */
1238 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1240 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1241 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1242 s->prob.p.mv_comp[idx].classes);
1244 s->counts.mv_comp[idx].sign[sign]++;
1245 s->counts.mv_comp[idx].classes[c]++;
/* Large-class path: read c integer bits, then fractional (fp) and
 * optional high-precision (hp) bits. */
1249 for (n = 0, m = 0; m < c; m++) {
1250 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1252 s->counts.mv_comp[idx].bits[m][bit]++;
1255 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1257 s->counts.mv_comp[idx].fp[bit]++;
1259 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1260 s->counts.mv_comp[idx].hp[bit]++;
1264 // bug in libvpx - we count for bw entropy purposes even if the
1266 s->counts.mv_comp[idx].hp[1]++;
/* Class-0 path: single integer bit plus class0-specific fp/hp bits. */
1270 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1271 s->counts.mv_comp[idx].class0[n]++;
1272 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1273 s->prob.p.mv_comp[idx].class0_fp[n]);
1274 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1275 n = (n << 3) | (bit << 1);
1277 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1278 s->counts.mv_comp[idx].class0_hp[bit]++;
1282 // bug in libvpx - we count for bw entropy purposes even if the
1284 s->counts.mv_comp[idx].class0_hp[1]++;
/* Apply the sign read first; magnitude is always n + 1 (zero deltas are
 * never coded through this path). */
1288 return sign ? -(n + 1) : (n + 1);
/* Fill mv[0] (and, for compound prediction, mv[1]) for one (sub-)block:
 * predict via find_ref_mvs(), then for NEWMV add the coded MV deltas
 * read with read_mv_component().
 *
 *   mv   - pair of output MVs (one per reference)
 *   mode - inter mode (ZEROMV / NEARESTMV / NEARMV / NEWMV)
 *   sb   - sub-block index, passed through to find_ref_mvs() (callers
 *          use -1 for whole-block / NEWMV cases)
 *
 * NOTE(review): interior lines are elided in this extract (e.g. the
 * ZEROMV body and the rounding applied when 'hp' disqualifies
 * high-precision MVs); the visible structure handles ref[0] first,
 * then repeats for ref[1] when the block is compound-predicted. */
1291 static void fill_mv(VP9Context *s,
1292 VP56mv *mv, int mode, int sb)
1296 if (mode == ZEROMV) {
1301 // FIXME cache this value and reuse for other subblocks
1302 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1303 mode == NEWMV ? -1 : sb);
1304 // FIXME maybe move this code into find_ref_mvs()
/* hp: use high-precision deltas only when the header allows them and
 * the predicted MV is small (|x|,|y| < 64, i.e. < 8 full pels). */
1305 if ((mode == NEWMV || sb == -1) &&
1306 !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1320 if (mode == NEWMV) {
1321 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1322 s->prob.p.mv_joint);
1324 s->counts.mv_joint[j]++;
/* The joint code tells which components carry a delta: vertical when
 * j >= MV_JOINT_V, horizontal per the (elided) companion check. */
1325 if (j >= MV_JOINT_V)
1326 mv[0].y += read_mv_component(s, 0, hp);
1328 mv[0].x += read_mv_component(s, 1, hp);
/* Second reference of a compound pair: same procedure on mv[1]. */
1332 // FIXME cache this value and reuse for other subblocks
1333 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1334 mode == NEWMV ? -1 : sb);
1335 if ((mode == NEWMV || sb == -1) &&
1336 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1350 if (mode == NEWMV) {
1351 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1352 s->prob.p.mv_joint);
1354 s->counts.mv_joint[j]++;
1355 if (j >= MV_JOINT_V)
1356 mv[1].y += read_mv_component(s, 0, hp);
1358 mv[1].x += read_mv_component(s, 1, hp);
/* Fill a w x h byte rectangle at 'ptr' (row pitch 'stride') with the
 * value 'v', using width-specialized stores (16/32/64-bit replicated
 * patterns). Used e.g. to splat b->seg_id into the segmentation map.
 * NOTE(review): the switch/loop skeleton between these lines is elided
 * in this extract; only the per-width replication constants and one
 * store are visible. */
1364 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1365 ptrdiff_t stride, int v)
1375 int v16 = v * 0x0101;
1383 uint32_t v32 = v * 0x01010101;
1392 uint64_t v64 = v * 0x0101010101010101ULL;
1398 uint32_t v32 = v * 0x01010101;
1401 AV_WN32A(ptr + 4, v32);
/* Decode all per-block mode information for the current block (position
 * s->row/s->col, size b->bs): segment id, skip flag, intra/inter flag,
 * transform size, intra or inter prediction modes, reference frames,
 * interpolation filter and motion vectors; then propagate everything
 * into the above_*/left_* context arrays and the per-frame MV buffer.
 *
 * NOTE(review): this chunk is an elided extract -- many interior lines
 * (else-arms, closing braces, macro continuations) are missing between
 * the embedded original line numbers. Comments below describe only what
 * the visible lines establish. */
1410 static void decode_mode(AVCodecContext *ctx)
/* Partition-context nibbles recorded per block size for later blocks. */
1412 static const uint8_t left_ctx[N_BS_SIZES] = {
1413 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1415 static const uint8_t above_ctx[N_BS_SIZES] = {
1416 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
/* Largest transform size allowed for each block size. */
1418 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1419 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1420 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1422 VP9Context *s = ctx->priv_data;
1424 int row = s->row, col = s->col, row7 = s->row7;
1425 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
/* w4/h4: block extent in 8x8 units, clipped to the frame edge. */
1426 int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1427 int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1428 int have_a = row > 0, have_l = col > s->tile_col_start;
1429 int vref, filter_id;
/* --- segment id ----------------------------------------------------
 * Either 0 (segmentation off), explicitly coded, temporally predicted
 * from the reference segmentation map, or re-coded with the seg tree. */
1431 if (!s->s.h.segmentation.enabled) {
1433 } else if (s->s.h.keyframe || s->s.h.intraonly) {
1434 b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1435 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1436 } else if (!s->s.h.segmentation.update_map ||
1437 (s->s.h.segmentation.temporal &&
1438 vp56_rac_get_prob_branchy(&s->c,
1439 s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1440 s->left_segpred_ctx[row7]]))) {
1441 if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1443 uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
/* Frame-threading: wait for the reference segmap row to be decoded. */
1445 if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1446 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
/* Predicted seg id = minimum seg id over the co-located area. */
1447 for (y = 0; y < h4; y++) {
1448 int idx_base = (y + row) * 8 * s->sb_cols + col;
1449 for (x = 0; x < w4; x++)
1450 pred = FFMIN(pred, refsegmap[idx_base + x]);
1452 av_assert1(pred < 8);
1458 memset(&s->above_segpred_ctx[col], 1, w4);
1459 memset(&s->left_segpred_ctx[row7], 1, h4);
1461 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1462 s->s.h.segmentation.prob);
1464 memset(&s->above_segpred_ctx[col], 0, w4);
1465 memset(&s->left_segpred_ctx[row7], 0, h4);
/* Record the decoded seg id into the current frame's segmentation map. */
1467 if (s->s.h.segmentation.enabled &&
1468 (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1469 setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1470 bw4, bh4, 8 * s->sb_cols, b->seg_id);
/* --- skip flag -----------------------------------------------------
 * Forced by the segment feature, otherwise coded with a context built
 * from the left/above skip contexts. */
1473 b->skip = s->s.h.segmentation.enabled &&
1474 s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1476 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1477 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1478 s->counts.skip[c][b->skip]++;
/* --- intra/inter flag ---------------------------------------------- */
1481 if (s->s.h.keyframe || s->s.h.intraonly) {
1483 } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1484 b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1488 if (have_a && have_l) {
1489 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1492 c = have_a ? 2 * s->above_intra_ctx[col] :
1493 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1495 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1496 s->counts.intra[c][bit]++;
/* --- transform size ------------------------------------------------
 * Coded only in TX_SWITCHABLE mode (and only for intra or non-skipped
 * blocks); otherwise capped at FFMIN(max_tx, txfmmode). */
1500 if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1504 c = (s->above_skip_ctx[col] ? max_tx :
1505 s->above_txfm_ctx[col]) +
1506 (s->left_skip_ctx[row7] ? max_tx :
1507 s->left_txfm_ctx[row7]) > max_tx;
1509 c = s->above_skip_ctx[col] ? 1 :
1510 (s->above_txfm_ctx[col] * 2 > max_tx);
1512 } else if (have_l) {
1513 c = s->left_skip_ctx[row7] ? 1 :
1514 (s->left_txfm_ctx[row7] * 2 > max_tx);
/* Unary-coded tx size, with the number of bits depending on max_tx. */
1520 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1522 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1524 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1526 s->counts.tx32p[c][b->tx]++;
1529 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1531 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1532 s->counts.tx16p[c][b->tx]++;
1535 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1536 s->counts.tx8p[c][b->tx]++;
1543 b->tx = FFMIN(max_tx, s->s.h.txfmmode);
/* --- intra modes, keyframe/intraonly -------------------------------
 * Uses the fixed default keyframe mode probabilities, contexted on the
 * above (a[]) and left (l[]) neighbouring modes. Sub-8x8 blocks code
 * up to 4 modes. */
1546 if (s->s.h.keyframe || s->s.h.intraonly) {
1547 uint8_t *a = &s->above_mode_ctx[col * 2];
1548 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1551 if (b->bs > BS_8x8) {
1552 // FIXME the memory storage intermediates here aren't really
1553 // necessary, they're just there to make the code slightly
1555 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1556 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1557 if (b->bs != BS_8x4) {
1558 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1559 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1560 l[0] = a[1] = b->mode[1];
1562 l[0] = a[1] = b->mode[1] = b->mode[0];
1564 if (b->bs != BS_4x8) {
1565 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1566 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1567 if (b->bs != BS_8x4) {
1568 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1569 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1570 l[1] = a[1] = b->mode[3];
1572 l[1] = a[1] = b->mode[3] = b->mode[2];
1575 b->mode[2] = b->mode[0];
1576 l[1] = a[1] = b->mode[3] = b->mode[1];
1579 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1580 vp9_default_kf_ymode_probs[*a][*l]);
1581 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1582 // FIXME this can probably be optimized
1583 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1584 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1586 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1587 vp9_default_kf_uvmode_probs[b->mode[3]]);
/* --- intra modes, inter frames -------------------------------------
 * Same structure but with adaptive probabilities and counters. */
1588 } else if (b->intra) {
1590 if (b->bs > BS_8x8) {
1591 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1592 s->prob.p.y_mode[0]);
1593 s->counts.y_mode[0][b->mode[0]]++;
1594 if (b->bs != BS_8x4) {
1595 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1596 s->prob.p.y_mode[0]);
1597 s->counts.y_mode[0][b->mode[1]]++;
1599 b->mode[1] = b->mode[0];
1601 if (b->bs != BS_4x8) {
1602 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1603 s->prob.p.y_mode[0]);
1604 s->counts.y_mode[0][b->mode[2]]++;
1605 if (b->bs != BS_8x4) {
1606 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1607 s->prob.p.y_mode[0]);
1608 s->counts.y_mode[0][b->mode[3]]++;
1610 b->mode[3] = b->mode[2];
1613 b->mode[2] = b->mode[0];
1614 b->mode[3] = b->mode[1];
1617 static const uint8_t size_group[10] = {
1618 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1620 int sz = size_group[b->bs];
1622 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1623 s->prob.p.y_mode[sz]);
1624 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1625 s->counts.y_mode[sz][b->mode[3]]++;
1627 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1628 s->prob.p.uv_mode[b->mode[3]]);
1629 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
/* --- inter path ----------------------------------------------------
 * LUT mapping (above_mode, left_mode) pairs to the inter-mode coding
 * context. */
1631 static const uint8_t inter_mode_ctx_lut[14][14] = {
1632 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1633 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1634 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1635 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1636 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1637 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1638 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1639 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1640 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1641 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1642 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1643 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1644 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1645 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
/* Reference frame(s): forced by the segment feature, or coded. */
1648 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1649 av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1651 b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1653 // read comp_pred flag
1654 if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1655 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1659 // FIXME add intra as ref=0xff (or -1) to make these easier?
1662 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1664 } else if (s->above_comp_ctx[col]) {
1665 c = 2 + (s->left_intra_ctx[row7] ||
1666 s->left_ref_ctx[row7] == s->s.h.fixcompref);
1667 } else if (s->left_comp_ctx[row7]) {
1668 c = 2 + (s->above_intra_ctx[col] ||
1669 s->above_ref_ctx[col] == s->s.h.fixcompref);
1671 c = (!s->above_intra_ctx[col] &&
1672 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1673 (!s->left_intra_ctx[row7] &&
1674 s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1677 c = s->above_comp_ctx[col] ? 3 :
1678 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1680 } else if (have_l) {
1681 c = s->left_comp_ctx[row7] ? 3 :
1682 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1686 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1687 s->counts.comp[c][b->comp]++;
1690 // read actual references
1691 // FIXME probably cache a few variables here to prevent repetitive
1692 // memory accesses below
1693 if (b->comp) /* two references */ {
1694 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1696 b->ref[fix_idx] = s->s.h.fixcompref;
1697 // FIXME can this codeblob be replaced by some sort of LUT?
1700 if (s->above_intra_ctx[col]) {
1701 if (s->left_intra_ctx[row7]) {
1704 c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1706 } else if (s->left_intra_ctx[row7]) {
1707 c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1709 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1711 if (refl == refa && refa == s->s.h.varcompref[1]) {
1713 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1714 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1715 (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1718 c = (refa == refl) ? 3 : 1;
1720 } else if (!s->left_comp_ctx[row7]) {
1721 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1724 c = (refl == s->s.h.varcompref[1] &&
1725 refa != s->s.h.varcompref[1]) ? 2 : 4;
1727 } else if (!s->above_comp_ctx[col]) {
1728 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1731 c = (refa == s->s.h.varcompref[1] &&
1732 refl != s->s.h.varcompref[1]) ? 2 : 4;
1735 c = (refl == refa) ? 4 : 2;
1739 if (s->above_intra_ctx[col]) {
1741 } else if (s->above_comp_ctx[col]) {
1742 c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1744 c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1747 } else if (have_l) {
1748 if (s->left_intra_ctx[row7]) {
1750 } else if (s->left_comp_ctx[row7]) {
1751 c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1753 c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1758 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1759 b->ref[var_idx] = s->s.h.varcompref[bit];
1760 s->counts.comp_ref[c][bit]++;
1761 } else /* single reference */ {
1764 if (have_a && !s->above_intra_ctx[col]) {
1765 if (have_l && !s->left_intra_ctx[row7]) {
1766 if (s->left_comp_ctx[row7]) {
1767 if (s->above_comp_ctx[col]) {
1768 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1769 !s->above_ref_ctx[col]);
1771 c = (3 * !s->above_ref_ctx[col]) +
1772 (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1774 } else if (s->above_comp_ctx[col]) {
1775 c = (3 * !s->left_ref_ctx[row7]) +
1776 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1778 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1780 } else if (s->above_intra_ctx[col]) {
1782 } else if (s->above_comp_ctx[col]) {
1783 c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1785 c = 4 * (!s->above_ref_ctx[col]);
1787 } else if (have_l && !s->left_intra_ctx[row7]) {
1788 if (s->left_intra_ctx[row7]) {
1790 } else if (s->left_comp_ctx[row7]) {
1791 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1793 c = 4 * (!s->left_ref_ctx[row7]);
1798 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1799 s->counts.single_ref[c][0][bit]++;
1803 // FIXME can this codeblob be replaced by some sort of LUT?
1806 if (s->left_intra_ctx[row7]) {
1807 if (s->above_intra_ctx[col]) {
1809 } else if (s->above_comp_ctx[col]) {
1810 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1811 s->above_ref_ctx[col] == 1);
1812 } else if (!s->above_ref_ctx[col]) {
1815 c = 4 * (s->above_ref_ctx[col] == 1);
1817 } else if (s->above_intra_ctx[col]) {
1818 if (s->left_intra_ctx[row7]) {
1820 } else if (s->left_comp_ctx[row7]) {
1821 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1822 s->left_ref_ctx[row7] == 1);
1823 } else if (!s->left_ref_ctx[row7]) {
1826 c = 4 * (s->left_ref_ctx[row7] == 1);
1828 } else if (s->above_comp_ctx[col]) {
1829 if (s->left_comp_ctx[row7]) {
1830 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1831 c = 3 * (s->s.h.fixcompref == 1 ||
1832 s->left_ref_ctx[row7] == 1);
1836 } else if (!s->left_ref_ctx[row7]) {
1837 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1838 s->above_ref_ctx[col] == 1);
1840 c = 3 * (s->left_ref_ctx[row7] == 1) +
1841 (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1843 } else if (s->left_comp_ctx[row7]) {
1844 if (!s->above_ref_ctx[col]) {
1845 c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1846 s->left_ref_ctx[row7] == 1);
1848 c = 3 * (s->above_ref_ctx[col] == 1) +
1849 (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1851 } else if (!s->above_ref_ctx[col]) {
1852 if (!s->left_ref_ctx[row7]) {
1855 c = 4 * (s->left_ref_ctx[row7] == 1);
1857 } else if (!s->left_ref_ctx[row7]) {
1858 c = 4 * (s->above_ref_ctx[col] == 1);
1860 c = 2 * (s->left_ref_ctx[row7] == 1) +
1861 2 * (s->above_ref_ctx[col] == 1);
1864 if (s->above_intra_ctx[col] ||
1865 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1867 } else if (s->above_comp_ctx[col]) {
1868 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1870 c = 4 * (s->above_ref_ctx[col] == 1);
1873 } else if (have_l) {
1874 if (s->left_intra_ctx[row7] ||
1875 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1877 } else if (s->left_comp_ctx[row7]) {
1878 c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1880 c = 4 * (s->left_ref_ctx[row7] == 1);
1885 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1886 s->counts.single_ref[c][1][bit]++;
1887 b->ref[0] = 1 + bit;
/* --- inter modes, whole-block (<= 8x8) path ------------------------ */
1892 if (b->bs <= BS_8x8) {
1893 if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1894 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1896 static const uint8_t off[10] = {
1897 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1900 // FIXME this needs to use the LUT tables from find_ref_mvs
1901 // because not all are -1,0/0,-1
1902 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1903 [s->left_mode_ctx[row7 + off[b->bs]]];
1905 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1906 s->prob.p.mv_mode[c]);
1907 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
/* mv_mode counters are indexed from 0; inter modes start at 10. */
1908 s->counts.mv_mode[c][b->mode[0] - 10]++;
/* --- interpolation filter ------------------------------------------ */
1912 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1915 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1916 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1917 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1918 s->left_filter_ctx[row7] : 3;
1920 c = s->above_filter_ctx[col];
1922 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1923 c = s->left_filter_ctx[row7];
1928 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1929 s->prob.p.filter[c]);
1930 s->counts.filter[c][filter_id]++;
1931 b->filter = vp9_filter_lut[filter_id];
1933 b->filter = s->s.h.filtermode;
/* --- inter modes + MVs, sub-8x8 split path ------------------------- */
1936 if (b->bs > BS_8x8) {
1937 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1939 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1940 s->prob.p.mv_mode[c]);
1941 s->counts.mv_mode[c][b->mode[0] - 10]++;
1942 fill_mv(s, b->mv[0], b->mode[0], 0);
1944 if (b->bs != BS_8x4) {
1945 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1946 s->prob.p.mv_mode[c]);
1947 s->counts.mv_mode[c][b->mode[1] - 10]++;
1948 fill_mv(s, b->mv[1], b->mode[1], 1);
1950 b->mode[1] = b->mode[0];
1951 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1952 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1955 if (b->bs != BS_4x8) {
1956 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1957 s->prob.p.mv_mode[c]);
1958 s->counts.mv_mode[c][b->mode[2] - 10]++;
1959 fill_mv(s, b->mv[2], b->mode[2], 2);
1961 if (b->bs != BS_8x4) {
1962 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1963 s->prob.p.mv_mode[c]);
1964 s->counts.mv_mode[c][b->mode[3] - 10]++;
1965 fill_mv(s, b->mv[3], b->mode[3], 3);
1967 b->mode[3] = b->mode[2];
1968 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1969 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1972 b->mode[2] = b->mode[0];
1973 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1974 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1975 b->mode[3] = b->mode[1];
1976 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1977 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1980 fill_mv(s, b->mv[0], b->mode[0], -1);
1981 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1982 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1983 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1984 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1985 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1986 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
/* vref: the reference stored into the ref context arrays below. */
1989 vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
/* --- context propagation -------------------------------------------
 * SPLAT_CTX fills an n-byte context run; two variants exist, presumably
 * selected by a preprocessor check on 64-bit write support (the #if
 * lines are elided in this extract -- confirm against the full file). */
1993 #define SPLAT_CTX(var, val, n) \
1995 case 1: var = val; break; \
1996 case 2: AV_WN16A(&var, val * 0x0101); break; \
1997 case 4: AV_WN32A(&var, val * 0x01010101); break; \
1998 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2000 uint64_t v64 = val * 0x0101010101010101ULL; \
2001 AV_WN64A( &var, v64); \
2002 AV_WN64A(&((uint8_t *) &var)[8], v64); \
2007 #define SPLAT_CTX(var, val, n) \
2009 case 1: var = val; break; \
2010 case 2: AV_WN16A(&var, val * 0x0101); break; \
2011 case 4: AV_WN32A(&var, val * 0x01010101); break; \
2013 uint32_t v32 = val * 0x01010101; \
2014 AV_WN32A( &var, v32); \
2015 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2019 uint32_t v32 = val * 0x01010101; \
2020 AV_WN32A( &var, v32); \
2021 AV_WN32A(&((uint8_t *) &var)[4], v32); \
2022 AV_WN32A(&((uint8_t *) &var)[8], v32); \
2023 AV_WN32A(&((uint8_t *) &var)[12], v32); \
2029 switch (bwh_tab[1][b->bs][0]) {
2030 #define SET_CTXS(dir, off, n) \
2032 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2033 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2034 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2035 if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2036 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2037 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2038 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2040 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2041 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2042 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2047 case 1: SET_CTXS(above, col, 1); break;
2048 case 2: SET_CTXS(above, col, 2); break;
2049 case 4: SET_CTXS(above, col, 4); break;
2050 case 8: SET_CTXS(above, col, 8); break;
/* Same splat for the left-edge contexts, keyed on block height. */
2052 switch (bwh_tab[1][b->bs][1]) {
2053 case 1: SET_CTXS(left, row7, 1); break;
2054 case 2: SET_CTXS(left, row7, 2); break;
2055 case 4: SET_CTXS(left, row7, 4); break;
2056 case 8: SET_CTXS(left, row7, 8); break;
/* Record this block's MVs into the above/left MV context rows used by
 * find_ref_mvs() for subsequent blocks. */
2061 if (!s->s.h.keyframe && !s->s.h.intraonly) {
2062 if (b->bs > BS_8x8) {
2063 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2065 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2066 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2067 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2068 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2069 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2070 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2071 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2072 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2074 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2076 for (n = 0; n < w4 * 2; n++) {
2077 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2078 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2080 for (n = 0; n < h4 * 2; n++) {
2081 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2082 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
/* Finally write (ref, mv) pairs for every covered 8x8 cell into the
 * frame-wide MV buffer (consumed by later blocks and the next frame). */
2088 for (y = 0; y < h4; y++) {
2089 int x, o = (row + y) * s->sb_cols * 8 + col;
2090 struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2093 for (x = 0; x < w4; x++) {
2097 } else if (b->comp) {
2098 for (x = 0; x < w4; x++) {
2099 mv[x].ref[0] = b->ref[0];
2100 mv[x].ref[1] = b->ref[1];
2101 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2102 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2105 for (x = 0; x < w4; x++) {
2106 mv[x].ref[0] = b->ref[0];
2108 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
/* Decode the coefficients of one transform block from range coder 'c'
 * into 'coef', following the VP9 token scheme: per-band EOB / zero /
 * one decisions, then escape categories with fixed probabilities for
 * larger magnitudes, finally sign and dequantization via 'qmul'.
 *
 *   is_tx32x32       - 32x32 transform: dequantized values are halved
 *                      (the "/ 2" store below)
 *   is8bitsperpixel  - selects 16- vs 32-bit coefficient storage in
 *                      STORE_COEF and skips the extra high-bit reads
 *   cnt/eob          - entropy counters for backward adaptation
 *   p                - token probabilities, [band][nnz-ctx][token]
 *   nnz              - initial non-zero context, updated per position
 *   scan/nb          - scan order and neighbour table for ctx updates
 *   band_counts      - coefficients per band
 *
 * Returns (per FFmpeg's vp9 coefficient decoder contract) the number of
 * decoded coefficients -- the return statement itself is elided in this
 * extract, confirm against the full file.
 * NOTE(review): this chunk is an elided extract; loop braces and some
 * statements are missing between the numbered lines. */
2114 // FIXME merge cnt/eob arguments?
2115 static av_always_inline int
2116 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2117 int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2118 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2119 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2120 const int16_t *band_counts, const int16_t *qmul)
2122 int i = 0, band = 0, band_left = band_counts[band];
2123 uint8_t *tp = p[0][nnz];
2124 uint8_t cache[1024];
/* Main token loop: one iteration per coefficient position in scan
 * order. tp[0]=EOB prob, tp[1]=zero prob, tp[2]=one prob, tp[3..10]
 * are the Pareto-model escape probabilities filled lazily below. */
2129 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2130 eob[band][nnz][val]++;
2135 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2136 cnt[band][nnz][0]++;
2138 band_left = band_counts[++band];
2140 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2142 if (++i == n_coeffs)
2143 break; //invalid input; blocks should end with EOB
2148 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2149 cnt[band][nnz][1]++;
2153 // fill in p[3-10] (model fill) - only once per frame for each pos
2155 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2157 cnt[band][nnz][2]++;
2158 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2159 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2160 cache[rc] = val = 2;
2162 val = 3 + vp56_rac_get_prob(c, tp[5]);
2165 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2167 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2168 val = 5 + vp56_rac_get_prob(c, 159);
2170 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2171 val += vp56_rac_get_prob(c, 145);
2175 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2176 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2177 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2178 val += (vp56_rac_get_prob(c, 148) << 1);
2179 val += vp56_rac_get_prob(c, 140);
2181 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2182 val += (vp56_rac_get_prob(c, 155) << 2);
2183 val += (vp56_rac_get_prob(c, 140) << 1);
2184 val += vp56_rac_get_prob(c, 135);
2186 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2187 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2188 val += (vp56_rac_get_prob(c, 157) << 3);
2189 val += (vp56_rac_get_prob(c, 141) << 2);
2190 val += (vp56_rac_get_prob(c, 134) << 1);
2191 val += vp56_rac_get_prob(c, 130);
/* cat6 escape: fixed-probability bit-by-bit magnitude; high-bit-depth
 * streams read extra top bits (the <<17/<<16/... additions). */
2194 if (!is8bitsperpixel) {
2196 val += vp56_rac_get_prob(c, 255) << 17;
2197 val += vp56_rac_get_prob(c, 255) << 16;
2199 val += (vp56_rac_get_prob(c, 255) << 15);
2200 val += (vp56_rac_get_prob(c, 255) << 14);
2202 val += (vp56_rac_get_prob(c, 254) << 13);
2203 val += (vp56_rac_get_prob(c, 254) << 12);
2204 val += (vp56_rac_get_prob(c, 254) << 11);
2205 val += (vp56_rac_get_prob(c, 252) << 10);
2206 val += (vp56_rac_get_prob(c, 249) << 9);
2207 val += (vp56_rac_get_prob(c, 243) << 8);
2208 val += (vp56_rac_get_prob(c, 230) << 7);
2209 val += (vp56_rac_get_prob(c, 196) << 6);
2210 val += (vp56_rac_get_prob(c, 177) << 5);
2211 val += (vp56_rac_get_prob(c, 153) << 4);
2212 val += (vp56_rac_get_prob(c, 140) << 3);
2213 val += (vp56_rac_get_prob(c, 133) << 2);
2214 val += (vp56_rac_get_prob(c, 130) << 1);
2215 val += vp56_rac_get_prob(c, 129);
/* Store a dequantized coefficient: 16-bit for 8bpp, 32-bit otherwise
 * (written as two 16-bit halves via AV_WN32A at index i*2). */
2219 #define STORE_COEF(c, i, v) do { \
2220 if (is8bitsperpixel) { \
2223 AV_WN32A(&c[i * 2], v); \
2227 band_left = band_counts[++band];
2229 STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2231 STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2232 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2234 } while (++i < n_coeffs);
/* Wrapper: coefficient decode for 8bpp, transform sizes up to 16x16
 * (is_tx32x32 = 0, is8bitsperpixel = 1, bpp = 8). */
2239 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2240 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2241 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2242 const int16_t (*nb)[2], const int16_t *band_counts,
2243 const int16_t *qmul)
2245 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2246 nnz, scan, nb, band_counts, qmul);
/* Wrapper: coefficient decode for 8bpp, 32x32 transform
 * (is_tx32x32 = 1, is8bitsperpixel = 1, bpp = 8). */
2249 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2250 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2251 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2252 const int16_t (*nb)[2], const int16_t *band_counts,
2253 const int16_t *qmul)
2255 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2256 nnz, scan, nb, band_counts, qmul);
/* Wrapper: coefficient decode for high bit depth (s->bpp, e.g. 10/12),
 * transform sizes up to 16x16 (is_tx32x32 = 0, is8bitsperpixel = 0). */
2259 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2260 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2261 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2262 const int16_t (*nb)[2], const int16_t *band_counts,
2263 const int16_t *qmul)
2265 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2266 nnz, scan, nb, band_counts, qmul);
/* Wrapper: coefficient decode for high bit depth, 32x32 transform
 * (is_tx32x32 = 1, is8bitsperpixel = 0). */
2269 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2270 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2271 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2272 const int16_t (*nb)[2], const int16_t *band_counts,
2273 const int16_t *qmul)
2275 return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2276 nnz, scan, nb, band_counts, qmul);
/*
 * Decode all residual coefficients of the current block (luma, then both
 * chroma planes), dispatching per transform size to the decode_coeffs_b*
 * wrappers above.  Maintains the above/left non-zero context arrays
 * (a[]/l[]): MERGE_CTX collapses multi-entry contexts into one flag before
 * a larger-transform decode, SPLAT_CTX fans the result back out afterwards.
 * Per-block EOB values are stored in s->eob / s->uveob for the later
 * reconstruction pass.  end_x/end_y clamp the loops to the visible frame
 * area for blocks overhanging the right/bottom edge.
 *
 * NOTE(review): braces/else-arms of several macros and switch bodies are
 * elided in this excerpt; presumably returns whether any non-zero
 * coefficient was decoded (total_coeff) — confirm in the full file.
 */
2279 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2281 VP9Context *s = ctx->priv_data;
2283 int row = s->row, col = s->col;
/* probability / counter tables for luma ([0]), keyed on tx size and
 * intra-vs-inter; re-pointed at the uv tables further down */
2284 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2285 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2286 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2287 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2288 int end_x = FFMIN(2 * (s->cols - col), w4);
2289 int end_y = FFMIN(2 * (s->rows - row), h4);
2290 int n, pl, x, y, res;
2291 int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
/* lossless streams use the WHT scan tables (offset by 4) */
2292 int tx = 4 * s->s.h.lossless + b->tx;
2293 const int16_t * const *yscans = vp9_scans[tx];
2294 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2295 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2296 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2297 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2298 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
/* coefficient counts per band, indexed by tx size (4x4..32x32); the last
 * entry is written as total minus the preceding bands for clarity */
2299 static const int16_t band_counts[4][8] = {
2300 { 1, 2, 3, 4, 3, 16 - 13 },
2301 { 1, 2, 3, 4, 11, 64 - 21 },
2302 { 1, 2, 3, 4, 11, 256 - 21 },
2303 { 1, 2, 3, 4, 11, 1024 - 21 },
2305 const int16_t *y_band_counts = band_counts[b->tx];
2306 const int16_t *uv_band_counts = band_counts[b->uvtx];
2307 int bytesperpixel = is8bitsperpixel ? 1 : 2;
2308 int total_coeff = 0;
/* collapse `step` nnz-context entries into one boolean before decoding a
 * transform that spans them */
2310 #define MERGE(la, end, step, rd) \
2311 for (n = 0; n < end; n += step) \
2312 la[n] = !!rd(&la[n])
2313 #define MERGE_CTX(step, rd) \
2315 MERGE(l, end_y, step, rd); \
2316 MERGE(a, end_x, step, rd); \
2319 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2320 for (n = 0, y = 0; y < end_y; y += step) { \
2321 for (x = 0; x < end_x; x += step, n += step * step) { \
2322 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2323 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2324 (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2325 c, e, p, a[x] + l[y], yscans[txtp], \
2326 ynbs[txtp], y_band_counts, qmul[0]); \
2327 a[x] = l[y] = !!res; \
2328 total_coeff |= !!res; \
2330 AV_WN16A(&s->eob[n], res); \
/* fan the merged context flag back out over `step` entries; fast paths
 * use multiply-splat stores where alignment (cond) permits */
2337 #define SPLAT(la, end, step, cond) \
2339 for (n = 1; n < end; n += step) \
2340 la[n] = la[n - 1]; \
2341 } else if (step == 4) { \
2343 for (n = 0; n < end; n += step) \
2344 AV_WN32A(&la[n], la[n] * 0x01010101); \
2346 for (n = 0; n < end; n += step) \
2347 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2349 } else /* step == 8 */ { \
2351 if (HAVE_FAST_64BIT) { \
2352 for (n = 0; n < end; n += step) \
2353 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2355 for (n = 0; n < end; n += step) { \
2356 uint32_t v32 = la[n] * 0x01010101; \
2357 AV_WN32A(&la[n], v32); \
2358 AV_WN32A(&la[n + 4], v32); \
2362 for (n = 0; n < end; n += step) \
2363 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2366 #define SPLAT_CTX(step) \
2368 SPLAT(a, end_x, step, end_x == w4); \
2369 SPLAT(l, end_y, step, end_y == h4); \
/* luma: per-tx-size dispatch; sub-8x8 blocks pick per-4x4 intra modes */
2375 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2378 MERGE_CTX(2, AV_RN16A);
2379 DECODE_Y_COEF_LOOP(2, 0,);
2383 MERGE_CTX(4, AV_RN32A);
2384 DECODE_Y_COEF_LOOP(4, 0,);
2388 MERGE_CTX(8, AV_RN64A);
2389 DECODE_Y_COEF_LOOP(8, 0, 32);
2394 #define DECODE_UV_COEF_LOOP(step, v) \
2395 for (n = 0, y = 0; y < end_y; y += step) { \
2396 for (x = 0; x < end_x; x += step, n += step * step) { \
2397 res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2398 (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2399 16 * step * step, c, e, p, a[x] + l[y], \
2400 uvscan, uvnb, uv_band_counts, qmul[1]); \
2401 a[x] = l[y] = !!res; \
2402 total_coeff |= !!res; \
2404 AV_WN16A(&s->uveob[pl][n], res); \
2406 s->uveob[pl][n] = res; \
/* switch to the chroma ([1]) probability / counter tables */
2411 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2412 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2413 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2418 for (pl = 0; pl < 2; pl++) {
2419 a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2420 l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2423 DECODE_UV_COEF_LOOP(1,);
2426 MERGE_CTX(2, AV_RN16A);
2427 DECODE_UV_COEF_LOOP(2,);
2431 MERGE_CTX(4, AV_RN32A);
2432 DECODE_UV_COEF_LOOP(4,);
2436 MERGE_CTX(8, AV_RN64A);
2437 DECODE_UV_COEF_LOOP(8, 32);
/* Entry point for 8 bits-per-pixel coefficient decoding. */
2446 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2448 return decode_coeffs(ctx, 1);
/* Entry point for high-bit-depth (>8bpp) coefficient decoding. */
2451 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2453 return decode_coeffs(ctx, 0);
/*
 * Prepare the edge-pixel buffers for intra prediction of one transform
 * block and return the (possibly substituted) prediction mode.
 *
 * Availability of neighbours is derived from position: have_top is false
 * only on the very first row of the frame, have_left only at the left edge
 * of the current tile, have_right when the block touches the right edge of
 * its parent block (x == w - 1).  mode_conv[] remaps each of the 10 coded
 * modes to a fallback (DC_127/128/129, TOP_DC, LEFT_DC, ...) when a needed
 * neighbour is missing; edges[] then says which pixel runs the final mode
 * actually reads, and those runs are copied into *a (top, with topleft at
 * a[-1] and topright at a[4..]) and l (left column), extending or
 * value-filling where real pixels run out.
 *
 * When decoding the top row of an sb64, pre-loopfilter pixels from
 * s->intra_pred_data[] are used instead of dst[-stride], since the
 * reference data there has not been loop-filtered yet.
 *
 * p: plane index (0 = luma); ss_h/ss_v: chroma subsampling shifts.
 * NOTE(review): some closing braces / else-arms are elided in this excerpt.
 */
2456 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2457 uint8_t *dst_edge, ptrdiff_t stride_edge,
2458 uint8_t *dst_inner, ptrdiff_t stride_inner,
2459 uint8_t *l, int col, int x, int w,
2460 int row, int y, enum TxfmMode tx,
2461 int p, int ss_h, int ss_v, int bytesperpixel)
2463 int have_top = row > 0 || y > 0;
2464 int have_left = col > s->tile_col_start || x > 0;
2465 int have_right = x < w - 1;
2467 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2468 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2469 { DC_127_PRED, VERT_PRED } },
2470 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2471 { HOR_PRED, HOR_PRED } },
2472 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2473 { LEFT_DC_PRED, DC_PRED } },
2474 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2475 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2476 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2477 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2478 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2479 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2480 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2481 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2482 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2483 { DC_127_PRED, VERT_LEFT_PRED } },
2484 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2485 { HOR_UP_PRED, HOR_UP_PRED } },
2486 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2487 { HOR_PRED, TM_VP8_PRED } },
/* per-mode flags: which neighbour pixel runs the predictor reads;
 * invert_left = left column filled bottom-up vs top-down */
2489 static const struct {
2490 uint8_t needs_left:1;
2491 uint8_t needs_top:1;
2492 uint8_t needs_topleft:1;
2493 uint8_t needs_topright:1;
2494 uint8_t invert_left:1;
2495 } edges[N_INTRA_PRED_MODES] = {
2496 [VERT_PRED] = { .needs_top = 1 },
2497 [HOR_PRED] = { .needs_left = 1 },
2498 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2499 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2500 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2501 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2502 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2503 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2504 [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2505 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2506 [LEFT_DC_PRED] = { .needs_left = 1 },
2507 [TOP_DC_PRED] = { .needs_top = 1 },
2508 [DC_128_PRED] = { 0 },
2509 [DC_127_PRED] = { 0 },
2510 [DC_129_PRED] = { 0 }
2513 av_assert2(mode >= 0 && mode < 10);
2514 mode = mode_conv[mode][have_left][have_top];
2515 if (edges[mode].needs_top) {
2516 uint8_t *top, *topleft;
2517 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2518 int n_px_need_tr = 0;
2520 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2523 // if top of sb64-row, use s->intra_pred_data[] instead of
2524 // dst[-stride] for intra prediction (it contains pre- instead of
2525 // post-loopfilter data)
2527 top = !(row & 7) && !y ?
2528 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2529 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2531 topleft = !(row & 7) && !y ?
2532 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2533 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2534 &dst_inner[-stride_inner];
/* fast path: all needed top pixels exist contiguously in the frame */
2538 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2539 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2540 n_px_need + n_px_need_tr <= n_px_have) {
2544 if (n_px_need <= n_px_have) {
2545 memcpy(*a, top, n_px_need * bytesperpixel);
/* helpers parameterized on bytesperpixel so 8- and 16-bit paths share
 * the same edge-building logic */
2547 #define memset_bpp(c, i1, v, i2, num) do { \
2548 if (bytesperpixel == 1) { \
2549 memset(&(c)[(i1)], (v)[(i2)], (num)); \
2551 int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2552 for (n = 0; n < (num); n++) { \
2553 AV_WN16A(&(c)[((i1) + n) * 2], val); \
2557 memcpy(*a, top, n_px_have * bytesperpixel);
2558 memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2561 #define memset_val(c, val, num) do { \
2562 if (bytesperpixel == 1) { \
2563 memset((c), (val), (num)); \
2566 for (n = 0; n < (num); n++) { \
2567 AV_WN16A(&(c)[n * 2], (val)); \
/* no top row available: fill with 127 scaled to the bit depth */
2571 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2573 if (edges[mode].needs_topleft) {
2574 if (have_left && have_top) {
2575 #define assign_bpp(c, i1, v, i2) do { \
2576 if (bytesperpixel == 1) { \
2577 (c)[(i1)] = (v)[(i2)]; \
2579 AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2582 assign_bpp(*a, -1, topleft, -1);
2584 #define assign_val(c, i, v) do { \
2585 if (bytesperpixel == 1) { \
2588 AV_WN16A(&(c)[(i) * 2], (v)); \
/* synthetic topleft: 129 if top exists, 127 otherwise (bit-depth scaled) */
2591 assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2594 if (tx == TX_4X4 && edges[mode].needs_topright) {
2595 if (have_top && have_right &&
2596 n_px_need + n_px_need_tr <= n_px_have) {
2597 memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2599 memset_bpp(*a, 4, *a, 3, 4);
2604 if (edges[mode].needs_left) {
2606 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2607 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2608 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2610 if (edges[mode].invert_left) {
2611 if (n_px_need <= n_px_have) {
2612 for (i = 0; i < n_px_need; i++)
2613 assign_bpp(l, i, &dst[i * stride], -1);
2615 for (i = 0; i < n_px_have; i++)
2616 assign_bpp(l, i, &dst[i * stride], -1);
2617 memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2620 if (n_px_need <= n_px_have) {
2621 for (i = 0; i < n_px_need; i++)
2622 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2624 for (i = 0; i < n_px_have; i++)
2625 assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2626 memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
/* no left column: fill with 129 scaled to the bit depth */
2630 memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
/*
 * Intra reconstruction of the current block: for every luma transform
 * sub-block, build the edge pixels (check_intra_mode), run the spatial
 * predictor (dsp.intra_pred) into the tile-internal buffer s->dst[], and
 * add the inverse-transformed residual (dsp.itxfm_add) when the EOB is
 * non-zero; then the same for both chroma planes with the uv transform
 * size.  ptr_r/dst_r track the matching position in the reference frame
 * so edge pixels above the current sbrow come from already-filtered data.
 * y_off/uv_off are byte offsets of the block inside the frame planes.
 */
2637 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2638 ptrdiff_t uv_off, int bytesperpixel)
2640 VP9Context *s = ctx->priv_data;
2642 int row = s->row, col = s->col;
2643 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2644 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
/* clamp to visible frame area for edge blocks */
2645 int end_x = FFMIN(2 * (s->cols - col), w4);
2646 int end_y = FFMIN(2 * (s->rows - row), h4);
2647 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2648 int uvstep1d = 1 << b->uvtx, p;
2649 uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
/* scratch buffers for the top (a) and left (l) prediction edges */
2650 LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2651 LOCAL_ALIGNED_32(uint8_t, l, [64]);
2653 for (n = 0, y = 0; y < end_y; y += step1d) {
2654 uint8_t *ptr = dst, *ptr_r = dst_r;
2655 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2656 ptr_r += 4 * step1d * bytesperpixel, n += step) {
/* sub-8x8 blocks with 4x4 tx carry one intra mode per 4x4 unit */
2657 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2659 uint8_t *a = &a_buf[32];
2660 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2661 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2663 mode = check_intra_mode(s, mode, &a, ptr_r,
2664 s->s.frames[CUR_FRAME].tf.f->linesize[0],
2665 ptr, s->y_stride, l,
2666 col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2667 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2669 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2670 s->block + 16 * n * bytesperpixel, eob);
2672 dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2673 dst += 4 * step1d * s->y_stride;
/* chroma: same walk at uv transform granularity, DCT_DCT only */
2680 step = 1 << (b->uvtx * 2);
2681 for (p = 0; p < 2; p++) {
2682 dst = s->dst[1 + p];
2683 dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2684 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2685 uint8_t *ptr = dst, *ptr_r = dst_r;
2686 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2687 ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2688 int mode = b->uvmode;
2689 uint8_t *a = &a_buf[32];
2690 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2692 mode = check_intra_mode(s, mode, &a, ptr_r,
2693 s->s.frames[CUR_FRAME].tf.f->linesize[1],
2694 ptr, s->uv_stride, l, col, x, w4, row, y,
2695 b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2696 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2698 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2699 s->uvblock[p] + 16 * n * bytesperpixel, eob);
2701 dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2702 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp specialization of intra_recon() (bytesperpixel = 1). */
2707 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2709 intra_recon(ctx, y_off, uv_off, 1);
/* High-bit-depth specialization of intra_recon() (bytesperpixel = 2). */
2712 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2714 intra_recon(ctx, y_off, uv_off, 2);
/*
 * Luma motion compensation, reference frame at the same resolution as the
 * current frame.  (y, x) is the block position, mv the motion vector in
 * 1/8-pel units; bw/bh the block size, w/h the frame size.  Waits on the
 * reference frame's decode progress (frame-threading), then either reads
 * directly from the reference or bounces through the edge-emulation buffer
 * when the subpel filter footprint (3 taps before, 4 after a fractional
 * position) would read outside the frame.
 */
2717 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2718 uint8_t *dst, ptrdiff_t dst_stride,
2719 const uint8_t *ref, ptrdiff_t ref_stride,
2720 ThreadFrame *ref_frame,
2721 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2722 int bw, int bh, int w, int h, int bytesperpixel)
2724 int mx = mv->x, my = mv->y, th;
2728 ref += y * ref_stride + x * bytesperpixel;
2731 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2732 // we use +7 because the last 7 pixels of each sbrow can be changed in
2733 // the longest loopfilter of the next sbrow
2734 th = (y + bh + 4 * !!my + 7) >> 6;
2735 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
/* !!mx / !!my: the 3/4-pixel margins are only needed on axes with a
 * fractional MV component */
2736 if (x < !!mx * 3 || y < !!my * 3 ||
2737 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2738 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2739 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2741 bw + !!mx * 7, bh + !!my * 7,
2742 x - !!mx * 3, y - !!my * 3, w, h);
/* 160 is the edge_emu_buffer line stride used for the luma copy */
2743 ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2746 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/*
 * Chroma motion compensation (both planes at once), reference frame at the
 * same resolution.  Same structure as mc_luma_unscaled(), but the MV is
 * scaled up by the subsampling factors (<< !ss_h / << !ss_v) and the
 * progress threshold accounts for vertical subsampling.  Each plane gets
 * its own edge-emulation pass when the filter would read off-frame.
 */
2749 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2750 uint8_t *dst_u, uint8_t *dst_v,
2751 ptrdiff_t dst_stride,
2752 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2753 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2754 ThreadFrame *ref_frame,
2755 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2756 int bw, int bh, int w, int h, int bytesperpixel)
2758 int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2762 ref_u += y * src_stride_u + x * bytesperpixel;
2763 ref_v += y * src_stride_v + x * bytesperpixel;
2766 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2767 // we use +7 because the last 7 pixels of each sbrow can be changed in
2768 // the longest loopfilter of the next sbrow
2769 th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2770 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2771 if (x < !!mx * 3 || y < !!my * 3 ||
2772 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2773 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2774 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2776 bw + !!mx * 7, bh + !!my * 7,
2777 x - !!mx * 3, y - !!my * 3, w, h);
2778 ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2779 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
/* V plane reuses the same scratch buffer after U has been filtered */
2781 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2782 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2784 bw + !!mx * 7, bh + !!my * 7,
2785 x - !!mx * 3, y - !!my * 3, w, h);
2786 ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2787 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2789 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2790 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
/* Instantiate the inter-prediction templates (vp9_mc_template.c) for the
 * unscaled case: mc_luma_dir / mc_chroma_dir are mapped onto the
 * *_unscaled helpers above (the px/py/pw/ph clip arguments are unused
 * here), then the template is included twice to generate the
 * inter_pred_8bpp and inter_pred_16bpp entry points. */
2794 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2795 px, py, pw, ph, bw, bh, w, h, i) \
2796 mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2797 mv, bw, bh, w, h, bytesperpixel)
2798 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2799 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2800 mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2801 row, col, mv, bw, bh, w, h, bytesperpixel)
2803 #define FN(x) x##_8bpp
2804 #define BYTES_PER_PIXEL 1
2805 #include "vp9_mc_template.c"
2807 #undef BYTES_PER_PIXEL
2808 #define FN(x) x##_16bpp
2809 #define BYTES_PER_PIXEL 2
2810 #include "vp9_mc_template.c"
2812 #undef mc_chroma_dir
2814 #undef BYTES_PER_PIXEL
/*
 * Luma motion compensation against a reference frame of a DIFFERENT
 * resolution.  If the sizes actually match, falls through to the unscaled
 * path.  Otherwise the MV is clipped to the prediction-block bounds
 * (px/py/pw/ph), converted to reference coordinates via the per-axis
 * `scale` factors (scale_mv, 14-bit fixed point), and the scaled-MC
 * function `smc` is run with per-axis subpel `step`s.  The edge-emulation
 * margin grows with the scaled footprint (refbw_m1/refbh_m1).
 */
2817 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2818 vp9_mc_func (*mc)[2],
2819 uint8_t *dst, ptrdiff_t dst_stride,
2820 const uint8_t *ref, ptrdiff_t ref_stride,
2821 ThreadFrame *ref_frame,
2822 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2823 int px, int py, int pw, int ph,
2824 int bw, int bh, int w, int h, int bytesperpixel,
2825 const uint16_t *scale, const uint8_t *step)
2827 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2828 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2829 mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2830 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2832 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2834 int refbw_m1, refbh_m1;
2838 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2839 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2840 // BUG libvpx seems to scale the two components separately. This introduces
2841 // rounding errors but we have to reproduce them to be exactly compatible
2842 // with the output from libvpx...
2843 mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2844 my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2848 ref += y * ref_stride + x * bytesperpixel;
/* last reference sample touched by a bw x bh block at subpel step */
2851 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2852 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2853 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2854 // we use +7 because the last 7 pixels of each sbrow can be changed in
2855 // the longest loopfilter of the next sbrow
2856 th = (y + refbh_m1 + 4 + 7) >> 6;
2857 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2858 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2859 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2860 ref - 3 * ref_stride - 3 * bytesperpixel,
2862 refbw_m1 + 8, refbh_m1 + 8,
2863 x - 3, y - 3, w, h);
/* 288 is the wider edge_emu_buffer stride used on the scaled path */
2864 ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2867 smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
/*
 * Chroma motion compensation against a reference of a different
 * resolution; see mc_luma_scaled() for the overall structure.  Per axis,
 * the MV clip/scale differs depending on chroma subsampling: subsampled
 * axes reproduce a known libvpx rounding quirk (webm issue 820, linked
 * below) by mixing the 4:1 and 8:1 scaled positions; non-subsampled axes
 * behave like luma.  Both planes share the edge-emulation buffer.
 */
2871 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2872 vp9_mc_func (*mc)[2],
2873 uint8_t *dst_u, uint8_t *dst_v,
2874 ptrdiff_t dst_stride,
2875 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2876 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2877 ThreadFrame *ref_frame,
2878 ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2879 int px, int py, int pw, int ph,
2880 int bw, int bh, int w, int h, int bytesperpixel,
2881 const uint16_t *scale, const uint8_t *step)
2883 if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2884 s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2885 mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2886 ref_v, src_stride_v, ref_frame,
2887 y, x, in_mv, bw, bh, w, h, bytesperpixel);
2890 int refbw_m1, refbh_m1;
2895 // BUG https://code.google.com/p/webm/issues/detail?id=820
2896 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2897 mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2899 mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2900 mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2903 // BUG https://code.google.com/p/webm/issues/detail?id=820
2904 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2905 my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2907 mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2908 my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2913 ref_u += y * src_stride_u + x * bytesperpixel;
2914 ref_v += y * src_stride_v + x * bytesperpixel;
2917 refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2918 refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2919 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2920 // we use +7 because the last 7 pixels of each sbrow can be changed in
2921 // the longest loopfilter of the next sbrow
2922 th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2923 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2924 if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2925 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2926 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2928 refbw_m1 + 8, refbh_m1 + 8,
2929 x - 3, y - 3, w, h);
2930 ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2931 smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2933 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2934 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2936 refbw_m1 + 8, refbh_m1 + 8,
2937 x - 3, y - 3, w, h);
2938 ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2939 smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2941 smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2942 smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
/* Instantiate the inter-prediction templates for the scaled case:
 * mc_luma_dir / mc_chroma_dir map onto the *_scaled helpers, passing the
 * per-reference mvscale/mvstep tables, then the template is included twice
 * to generate inter_pred_scaled_8bpp and inter_pred_scaled_16bpp. */
2947 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2948 px, py, pw, ph, bw, bh, w, h, i) \
2949 mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2950 mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2951 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2952 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2953 row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2954 mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2955 row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2956 s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2958 #define FN(x) x##_scaled_8bpp
2959 #define BYTES_PER_PIXEL 1
2960 #include "vp9_mc_template.c"
2962 #undef BYTES_PER_PIXEL
2963 #define FN(x) x##_scaled_16bpp
2964 #define BYTES_PER_PIXEL 2
2965 #include "vp9_mc_template.c"
2967 #undef mc_chroma_dir
2969 #undef BYTES_PER_PIXEL
/*
 * Inter reconstruction of the current block: run motion-compensated
 * prediction (scaled variant when any used reference has a non-trivial
 * mvscale, otherwise unscaled), then — unless the block is skipped — add
 * the inverse-transformed residuals for luma and both chroma planes.
 * The residual-add loops mirror intra_recon() but need no edge handling
 * and always use DCT_DCT for inter blocks.
 */
2972 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2974 VP9Context *s = ctx->priv_data;
2976 int row = s->row, col = s->col;
/* mvscale[ref][0] != 0 means this reference is a different resolution */
2978 if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2979 if (bytesperpixel == 1) {
2980 inter_pred_scaled_8bpp(ctx);
2982 inter_pred_scaled_16bpp(ctx);
2985 if (bytesperpixel == 1) {
2986 inter_pred_8bpp(ctx);
2988 inter_pred_16bpp(ctx);
2992 /* mostly copied intra_recon() */
2994 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2995 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2996 int end_x = FFMIN(2 * (s->cols - col), w4);
2997 int end_y = FFMIN(2 * (s->rows - row), h4);
2998 int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2999 int uvstep1d = 1 << b->uvtx, p;
3000 uint8_t *dst = s->dst[0];
/* luma residual add, one itxfm per transform sub-block with coeffs */
3003 for (n = 0, y = 0; y < end_y; y += step1d) {
3005 for (x = 0; x < end_x; x += step1d,
3006 ptr += 4 * step1d * bytesperpixel, n += step) {
3007 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3010 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3011 s->block + 16 * n * bytesperpixel, eob);
3013 dst += 4 * s->y_stride * step1d;
/* chroma residual add for both planes */
3019 step = 1 << (b->uvtx * 2);
3020 for (p = 0; p < 2; p++) {
3021 dst = s->dst[p + 1];
3022 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3024 for (x = 0; x < end_x; x += uvstep1d,
3025 ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3026 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3029 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3030 s->uvblock[p] + 16 * n * bytesperpixel, eob);
3032 dst += 4 * uvstep1d * s->uv_stride;
/* 8bpp specialization of inter_recon() (bytesperpixel = 1). */
3038 static void inter_recon_8bpp(AVCodecContext *ctx)
3040 inter_recon(ctx, 1);
/* High-bit-depth specialization of inter_recon() (bytesperpixel = 2). */
3043 static void inter_recon_16bpp(AVCodecContext *ctx)
3045 inter_recon(ctx, 2);
/*
 * Accumulate loopfilter edge masks for one block into `mask`
 * (mask[0] = vertical/column edges, mask[1] = horizontal/row edges; the
 * last index selects filter width: 0 = 16px, 1 = 8px, 2 = 4px,
 * 3 = inner-4px — see the VP9Filter declaration near the top of the file).
 * (row_and_7, col_and_7) is the block position within its sb64, w/h its
 * size in 4x4 units, tx its transform size; skip_inter marks skipped inter
 * blocks which only need masks on their outer edges.  Bits within a mask
 * byte correspond to 8-pixel columns, hence the (t << w) - t run-of-ones
 * construction.  ss_h/ss_v handle the chroma half-resolution cases,
 * including libvpx edge quirks reproduced here (see the bug comment below).
 */
3048 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3049 int row_and_7, int col_and_7,
3050 int w, int h, int col_end, int row_end,
3051 enum TxfmMode tx, int skip_inter)
3053 static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3054 static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3056 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3057 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3058 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3059 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3061 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3062 // edges. This means that for UV, we work on two subsampled blocks at
3063 // a time, and we only use the topleft block's mode information to set
3064 // things like block strength. Thus, for any block size smaller than
3065 // 16x16, ignore the odd portion of the block.
3066 if (tx == TX_4X4 && (ss_v | ss_h)) {
3081 if (tx == TX_4X4 && !skip_inter) {
3082 int t = 1 << col_and_7, m_col = (t << w) - t, y;
3083 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3084 int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3086 for (y = row_and_7; y < h + row_and_7; y++) {
3087 int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3089 mask[0][y][1] |= m_row_8;
3090 mask[0][y][2] |= m_row_4;
3091 // for odd lines, if the odd col is not being filtered,
3092 // skip odd row also:
3099 // if a/c are even row/col and b/d are odd, and d is skipped,
3100 // e.g. right edge of size-66x66.webm, then skip b also (bug)
3101 if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3102 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3104 mask[1][y][col_mask_id] |= m_col;
3107 mask[0][y][3] |= m_col;
3109 if (ss_h && (col_end & 1))
3110 mask[1][y][3] |= (t << (w - 1)) - t;
3112 mask[1][y][3] |= m_col;
/* larger transforms / skipped inter blocks: only outer edges */
3116 int y, t = 1 << col_and_7, m_col = (t << w) - t;
3119 int mask_id = (tx == TX_8X8);
3120 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3121 int l2 = tx + ss_h - 1, step1d;
3122 int m_row = m_col & masks[l2];
3124 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3125 // 8wd loopfilter to prevent going off the visible edge.
3126 if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3127 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3128 int m_row_8 = m_row - m_row_16;
3130 for (y = row_and_7; y < h + row_and_7; y++) {
3131 mask[0][y][0] |= m_row_16;
3132 mask[0][y][1] |= m_row_8;
3135 for (y = row_and_7; y < h + row_and_7; y++)
3136 mask[0][y][mask_id] |= m_row;
3141 if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3142 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3143 mask[1][y][0] |= m_col;
3144 if (y - row_and_7 == h - 1)
3145 mask[1][y][1] |= m_col;
3147 for (y = row_and_7; y < h + row_and_7; y += step1d)
3148 mask[1][y][mask_id] |= m_col;
3150 } else if (tx != TX_4X4) {
3153 mask_id = (tx == TX_8X8) || (h == ss_v);
3154 mask[1][row_and_7][mask_id] |= m_col;
3155 mask_id = (tx == TX_8X8) || (w == ss_h);
3156 for (y = row_and_7; y < h + row_and_7; y++)
3157 mask[0][y][mask_id] |= t;
/* remaining TX_4X4 case (skip_inter): wide/narrow split on outer edges */
3159 int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3161 for (y = row_and_7; y < h + row_and_7; y++) {
3162 mask[0][y][2] |= t4;
3163 mask[0][y][1] |= t8;
3165 mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
/* Decode and reconstruct one coding block (plus its loop-filter metadata).
 * (row, col) are in 8x8-block units; yoff/uvoff are byte offsets into the
 * current frame's luma/chroma planes; bl/bp select the block size. */
3170 static void decode_b(AVCodecContext *ctx, int row, int col,
3171 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3172 enum BlockLevel bl, enum BlockPartition bp)
3174 VP9Context *s = ctx->priv_data;
3176 enum BlockSize bs = bl * 3 + bp;
3177 int bytesperpixel = s->bytesperpixel;
/* w4/h4: block dimensions in units of 4x4 sub-blocks */
3178 int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3180 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
/* clamp the motion-vector search window to 128 units beyond the block edges
 * (NOTE(review): units appear to be 1/8-pel, 64 per 8px block -- confirm) */
3186 s->min_mv.x = -(128 + col * 64);
3187 s->min_mv.y = -(128 + row * 64);
3188 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3189 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
/* chroma tx is one step smaller than luma when subsampling would make the
 * luma tx exceed the chroma block size */
3195 b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3196 (s->ss_v && h4 * 2 == (1 << b->tx)));
3201 if (bytesperpixel == 1) {
3202 has_coeffs = decode_coeffs_8bpp(ctx);
3204 has_coeffs = decode_coeffs_16bpp(ctx);
/* an inter block <= 8x8 with no coefficients is marked as skipped in the
 * above/left context arrays */
3206 if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3208 memset(&s->above_skip_ctx[col], 1, w4);
3209 memset(&s->left_skip_ctx[s->row7], 1, h4);
/* zero n bytes of a context slot with the widest store available */
3214 #define SPLAT_ZERO_CTX(v, n) \
3216 case 1: v = 0; break; \
3217 case 2: AV_ZERO16(&v); break; \
3218 case 4: AV_ZERO32(&v); break; \
3219 case 8: AV_ZERO64(&v); break; \
3220 case 16: AV_ZERO128(&v); break; \
/* clear luma and both chroma nnz contexts; chroma covers half the span when
 * the corresponding subsampling flag (ss_h/ss_v) is set */
3222 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3224 SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3225 if (s->ss_##dir2) { \
3226 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3227 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3229 SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3230 SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3235 case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3236 case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3237 case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3238 case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3241 case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3242 case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3243 case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3244 case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
/* advance the per-block coefficient/EOB pointers past this block */
3250 s->block += w4 * h4 * 64 * bytesperpixel;
3251 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3252 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3253 s->eob += 4 * w4 * h4;
3254 s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3255 s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3261 // emulated overhangs if the stride of the target buffer can't hold. This
3262 // makes it possible to support emu-edge and so on even if we have large block
/* decide whether reconstruction must go through the temporary edge-emulation
 * buffers (block wider than the plane stride, or extending past the bottom) */
3264 emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3265 (row + h4) > s->rows;
3266 emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3267 (row + h4) > s->rows;
/* emulated path: reconstruct into the tmp buffers ... */
3269 s->dst[0] = s->tmp_y;
/* ... otherwise reconstruct directly into the frame */
3272 s->dst[0] = f->data[0] + yoff;
3273 s->y_stride = f->linesize[0];
3276 s->dst[1] = s->tmp_uv[0];
3277 s->dst[2] = s->tmp_uv[1];
3280 s->dst[1] = f->data[1] + uvoff;
3281 s->dst[2] = f->data[2] + uvoff;
3282 s->uv_stride = f->linesize[1];
3286 intra_recon_16bpp(ctx, yoff, uvoff);
3288 intra_recon_8bpp(ctx, yoff, uvoff);
3292 inter_recon_16bpp(ctx);
3294 inter_recon_8bpp(ctx);
/* copy the emulated luma back into the frame, using the widest MC copy
 * function that still fits the remaining width */
3298 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3300 for (n = 0; o < w; n++) {
3305 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3306 s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
/* same copy-back for both chroma planes */
3312 int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3313 int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3315 for (n = s->ss_h; o < w; n++) {
3320 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3321 s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3322 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3323 s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3329 // pick filter level and find edges to apply filter to
3330 if (s->s.h.filter.level &&
3331 (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3332 [b->mode[3] != ZEROMV]) > 0) {
3333 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3334 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
/* record the filter level for every 8x8 cell this block covers, then build
 * the per-edge filter masks (luma, and chroma when subsampled) */
3336 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3337 mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3338 if (s->ss_h || s->ss_v)
3339 mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3340 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3341 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3342 b->uvtx, skip_inter);
/* lazily fill the limit LUTs for this level (level 0 never filters, so a
 * zero entry means "not computed yet") */
3344 if (!s->filter_lut.lim_lut[lvl]) {
3345 int sharp = s->s.h.filter.sharpness;
3349 limit >>= (sharp + 3) >> 2;
3350 limit = FFMIN(limit, 9 - sharp);
3352 limit = FFMAX(limit, 1);
3354 s->filter_lut.lim_lut[lvl] = limit;
3355 s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
/* advance the per-block coefficient/EOB pointers past this block */
3361 s->block += w4 * h4 * 64 * bytesperpixel;
3362 s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3363 s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3364 s->eob += 4 * w4 * h4;
3365 s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3366 s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
/* Recursively parse the superblock partition tree from the bitstream and
 * decode its blocks, handling frame-edge cases where only part of the
 * partition lies inside the visible frame. */
3370 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3371 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3373 VP9Context *s = ctx->priv_data;
/* partition context: one history bit each from the above/left neighbours */
3374 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3375 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3376 const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3377 s->prob.p.partition[bl][c];
3378 enum BlockPartition bp;
/* hbs: half the block size at this level, in 8x8-block units (4, 2, 1) */
3379 ptrdiff_t hbs = 4 >> bl;
3380 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3381 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3382 int bytesperpixel = s->bytesperpixel;
3385 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3386 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* the right half of this partition is inside the frame */
3387 } else if (col + hbs < s->cols) { // FIXME why not <=?
/* the bottom half is inside too: all four partitions are possible */
3388 if (row + hbs < s->rows) { // FIXME why not <=?
3389 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3391 case PARTITION_NONE:
3392 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3395 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3396 yoff += hbs * 8 * y_stride;
3397 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3398 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3401 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3402 yoff += hbs * 8 * bytesperpixel;
3403 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3404 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3406 case PARTITION_SPLIT:
/* recurse into the four quadrants at the next (smaller) level */
3407 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3408 decode_sb(ctx, row, col + hbs, lflvl,
3409 yoff + 8 * hbs * bytesperpixel,
3410 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3411 yoff += hbs * 8 * y_stride;
3412 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3413 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3414 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3415 yoff + 8 * hbs * bytesperpixel,
3416 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
/* bottom half lies outside the frame: a single probability bit decides
 * between SPLIT (two side-by-side recursions) and the fallback block */
3421 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3422 bp = PARTITION_SPLIT;
3423 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3424 decode_sb(ctx, row, col + hbs, lflvl,
3425 yoff + 8 * hbs * bytesperpixel,
3426 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3429 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* right half lies outside the frame but the bottom half does not */
3431 } else if (row + hbs < s->rows) { // FIXME why not <=?
3432 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3433 bp = PARTITION_SPLIT;
3434 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3435 yoff += hbs * 8 * y_stride;
3436 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3437 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3440 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
/* bottom-right corner: SPLIT is forced; only the top-left quadrant exists */
3443 bp = PARTITION_SPLIT;
3444 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
/* record the decision for backward probability adaptation */
3446 s->counts.partition[bl][c][bp]++;
/* Second-pass variant of decode_sb(): partition decisions were recorded in
 * s->b during pass 1, so replay them from memory instead of re-reading the
 * bitstream. */
3449 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3450 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3452 VP9Context *s = ctx->priv_data;
3454 ptrdiff_t hbs = 4 >> bl;
3455 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3456 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3457 int bytesperpixel = s->bytesperpixel;
/* smallest level: the stored block must be 8x8 */
3460 av_assert2(b->bl == BL_8X8);
3461 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
/* stored block is exactly at this level: decode it, plus its H/V sibling
 * when that sibling is inside the frame */
3462 } else if (s->b->bl == bl) {
3463 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3464 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3465 yoff += hbs * 8 * y_stride;
3466 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3467 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3468 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3469 yoff += hbs * 8 * bytesperpixel;
3470 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3471 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
/* stored block is deeper: recurse into whichever quadrants exist on-frame */
3474 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3475 if (col + hbs < s->cols) { // FIXME why not <=?
3476 if (row + hbs < s->rows) {
3477 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3478 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3479 yoff += hbs * 8 * y_stride;
3480 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3481 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3482 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3483 yoff + 8 * hbs * bytesperpixel,
3484 uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3486 yoff += hbs * 8 * bytesperpixel;
3487 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3488 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3490 } else if (row + hbs < s->rows) {
3491 yoff += hbs * 8 * y_stride;
3492 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3493 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/* Loop-filter the vertical edges (between horizontally adjacent blocks) of
 * one plane inside a 64x64 superblock. `mask` holds one bitmask per filter
 * width (16/8/4/inner-4); `lvl` is the 8x8 grid of per-cell filter levels. */
3498 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3499 uint8_t *lvl, uint8_t (*mask)[4],
3500 uint8_t *dst, ptrdiff_t ls)
3502 int y, x, bytesperpixel = s->bytesperpixel;
3504 // filter edges between columns (e.g. block1 | block2)
/* process two (or four, when vertically subsampled) mask rows per pass so
 * pairs of stacked edges can be filtered together */
3505 for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3506 uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3507 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3508 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3509 unsigned hm = hm1 | hm2 | hm13 | hm23;
/* walk columns while any mask bit at or beyond x is still set */
3511 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
/* L packs the filter level; H (level >> 4) is the hev threshold */
3514 int L = *l, H = L >> 4;
3515 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
/* 16px filter when both stacked rows request it, else one wide 8-row */
3517 if (hmask1[0] & x) {
3518 if (hmask2[0] & x) {
3519 av_assert2(l[8 << ss_v] == L);
3520 s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3522 s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
/* two stacked 8px edges with (possibly) different levels: pack both
 * threshold sets into E/I and use the mixed filter */
3524 } else if (hm2 & x) {
3527 E |= s->filter_lut.mblim_lut[L] << 8;
3528 I |= s->filter_lut.lim_lut[L] << 8;
3529 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3531 [0](ptr, ls, E, I, H);
3533 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3534 [0](ptr, ls, E, I, H);
/* only the lower row has an edge here */
3536 } else if (hm2 & x) {
3537 int L = l[8 << ss_v], H = L >> 4;
3538 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3540 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3541 [0](ptr + 8 * ls, ls, E, I, H);
/* inner 4px edges (mask index 3) sit at a 4-pixel horizontal offset */
3549 int L = *l, H = L >> 4;
3550 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3555 E |= s->filter_lut.mblim_lut[L] << 8;
3556 I |= s->filter_lut.lim_lut[L] << 8;
3557 s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3559 s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3561 } else if (hm23 & x) {
3562 int L = l[8 << ss_v], H = L >> 4;
3563 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3565 s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
/* Loop-filter the horizontal edges (between vertically adjacent blocks) of
 * one plane inside a 64x64 superblock; mirror image of filter_plane_cols(),
 * pairing horizontally adjacent edges instead of vertically stacked ones. */
3573 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3574 uint8_t *lvl, uint8_t (*mask)[4],
3575 uint8_t *dst, ptrdiff_t ls)
3577 int y, x, bytesperpixel = s->bytesperpixel;
3580 // filter edges between rows (e.g. ------)
3582 for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3583 uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3584 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
/* advance two (or four, when horizontally subsampled) columns per pass */
3586 for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
/* L packs the filter level; H (level >> 4) is the hev threshold */
3589 int L = *l, H = L >> 4;
3590 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
/* 16px filter when the neighbouring column requests it too */
3593 if (vmask[0] & (x << (1 + ss_h))) {
3594 av_assert2(l[1 + ss_h] == L);
3595 s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3597 s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
/* two adjacent 8px edges with (possibly) different levels: pack both
 * threshold sets into E/I and use the mixed filter */
3599 } else if (vm & (x << (1 + ss_h))) {
3602 E |= s->filter_lut.mblim_lut[L] << 8;
3603 I |= s->filter_lut.lim_lut[L] << 8;
3604 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3605 [!!(vmask[1] & (x << (1 + ss_h)))]
3606 [1](ptr, ls, E, I, H);
3608 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3609 [1](ptr, ls, E, I, H);
/* only the right-hand column has an edge here */
3611 } else if (vm & (x << (1 + ss_h))) {
3612 int L = l[1 + ss_h], H = L >> 4;
3613 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3615 s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3616 [1](ptr + 8 * bytesperpixel, ls, E, I, H);
/* inner 4px edges (vm3) sit at a 4-row vertical offset */
3621 int L = *l, H = L >> 4;
3622 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3624 if (vm3 & (x << (1 + ss_h))) {
3627 E |= s->filter_lut.mblim_lut[L] << 8;
3628 I |= s->filter_lut.lim_lut[L] << 8;
3629 s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3631 s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3633 } else if (vm3 & (x << (1 + ss_h))) {
3634 int L = l[1 + ss_h], H = L >> 4;
3635 int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3637 s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
/* Loop-filter one 64x64 superblock: column edges then row edges for luma,
 * then the same for both chroma planes. */
3650 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3651 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3653 VP9Context *s = ctx->priv_data;
3654 AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3655 uint8_t *dst = f->data[0] + yoff;
3656 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
/* chroma uses the subsampled mask set (index 1) when either direction is
 * subsampled, otherwise it shares the luma-geometry masks */
3657 uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3660 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3661 // if you think of them as acting on a 8x8 block max, we can interleave
3662 // each v/h within the single x loop, but that only works if we work on
3663 // 8 pixel blocks, and we won't always do that (we want at least 16px
3664 // to use SSE2 optimizations, perhaps 32 for AVX2)
3666 filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3667 filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3669 for (p = 0; p < 2; p++) {
3670 dst = f->data[1 + p] + uvoff;
3671 filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3672 filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
/* Compute the pixel range [*start, *end) covered by tile `idx` out of
 * 2^log2_n tiles over `n` superblocks; each superblock is 8 pixels wide,
 * hence the final << 3. Both bounds are clamped to the frame size. */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = (idx * n) >> log2_n;
    int next_sb  = ((idx + 1) * n) >> log2_n;

    *start = (first_sb < n ? first_sb : n) << 3;
    *end   = (next_sb  < n ? next_sb  : n) << 3;
}
/* Blend probability *p toward the empirical probability implied by the
 * branch counts (ct0, ct1). The blend weight is update_factor scaled by how
 * many samples were observed, saturating at max_count. */
3684 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3685 int max_count, int update_factor)
3687 unsigned ct = ct0 + ct1, p2, p1;
/* empirical probability of branch 0 in 1/256 units, rounded, clipped to the
 * legal range 1..255 */
3693 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3694 p2 = av_clip(p2, 1, 255);
/* scale the update factor by sample count, saturated at max_count */
3695 ct = FFMIN(ct, max_count);
3696 update_factor = FASTDIV(update_factor * ct, max_count);
3698 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3699 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/* End-of-frame backward adaptation: merge this frame's symbol counts into
 * the active probability context (framectxid). `uf` is the coefficient
 * update factor; everything else adapts with factor 128, saturating the
 * sample count at 20 (coefficients at 24). */
3702 static void adapt_probs(VP9Context *s)
3705 prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3706 int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
/* coefficient probabilities (incl. EOB), per tx-size/plane/inter/band/ctx */
3709 for (i = 0; i < 4; i++)
3710 for (j = 0; j < 2; j++)
3711 for (k = 0; k < 2; k++)
3712 for (l = 0; l < 6; l++)
3713 for (m = 0; m < 6; m++) {
3714 uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3715 unsigned *e = s->counts.eob[i][j][k][l][m];
3716 unsigned *c = s->counts.coef[i][j][k][l][m];
3718 if (l == 0 && m >= 3) // dc only has 3 pt
3721 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3722 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3723 adapt_prob(&pp[2], c[1], c[2], 24, uf);
/* keyframes/intra-only frames: keep mode/tx probs as parsed and stop here */
3726 if (s->s.h.keyframe || s->s.h.intraonly) {
3727 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3728 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3729 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3730 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
/* skip flag */
3735 for (i = 0; i < 3; i++)
3736 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
/* intra/inter flag */
3739 for (i = 0; i < 4; i++)
3740 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
/* compound prediction flag (only coded when the mode is switchable) */
3743 if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3744 for (i = 0; i < 5; i++)
3745 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
/* reference-frame selection probabilities */
3749 if (s->s.h.comppredmode != PRED_SINGLEREF) {
3750 for (i = 0; i < 5; i++)
3751 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3752 s->counts.comp_ref[i][1], 20, 128);
3755 if (s->s.h.comppredmode != PRED_COMPREF) {
3756 for (i = 0; i < 5; i++) {
3757 uint8_t *pp = p->single_ref[i];
3758 unsigned (*c)[2] = s->counts.single_ref[i];
3760 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3761 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3765 // block partitioning
3766 for (i = 0; i < 4; i++)
3767 for (j = 0; j < 4; j++) {
3768 uint8_t *pp = p->partition[i][j];
3769 unsigned *c = s->counts.partition[i][j];
3771 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3772 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3773 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* tx-size trees (one adapt_prob call per tree node) */
3777 if (s->s.h.txfmmode == TX_SWITCHABLE) {
3778 for (i = 0; i < 2; i++) {
3779 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3781 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3782 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3783 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3784 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3785 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3786 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3790 // interpolation filter
3791 if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3792 for (i = 0; i < 4; i++) {
3793 uint8_t *pp = p->filter[i];
3794 unsigned *c = s->counts.filter[i];
3796 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3797 adapt_prob(&pp[1], c[1], c[2], 20, 128);
/* inter-mode tree (note the count ordering differs from the tree order) */
3802 for (i = 0; i < 7; i++) {
3803 uint8_t *pp = p->mv_mode[i];
3804 unsigned *c = s->counts.mv_mode[i];
3806 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3807 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3808 adapt_prob(&pp[2], c[1], c[3], 20, 128);
/* motion-vector joint distribution */
3813 uint8_t *pp = p->mv_joint;
3814 unsigned *c = s->counts.mv_joint;
3816 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3817 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3818 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* per-component (x/y) motion-vector models */
3822 for (i = 0; i < 2; i++) {
3824 unsigned *c, (*c2)[2], sum;
3826 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3827 s->counts.mv_comp[i].sign[1], 20, 128);
/* magnitude-class tree: `sum` is the remaining mass as the tree descends */
3829 pp = p->mv_comp[i].classes;
3830 c = s->counts.mv_comp[i].classes;
3831 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3832 adapt_prob(&pp[0], c[0], sum, 20, 128);
3834 adapt_prob(&pp[1], c[1], sum, 20, 128);
3836 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3837 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3839 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3840 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3842 adapt_prob(&pp[6], c[6], sum, 20, 128);
3843 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3844 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3845 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3847 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3848 s->counts.mv_comp[i].class0[1], 20, 128);
/* per-bit magnitude probabilities */
3849 pp = p->mv_comp[i].bits;
3850 c2 = s->counts.mv_comp[i].bits;
3851 for (j = 0; j < 10; j++)
3852 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
/* fractional-pel distributions (class0 and general) */
3854 for (j = 0; j < 2; j++) {
3855 pp = p->mv_comp[i].class0_fp[j];
3856 c = s->counts.mv_comp[i].class0_fp[j];
3857 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3858 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3859 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3861 pp = p->mv_comp[i].fp;
3862 c = s->counts.mv_comp[i].fp;
3863 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3864 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3865 adapt_prob(&pp[2], c[2], c[3], 20, 128);
/* high-precision bits are only coded (and thus only adapted) when enabled */
3867 if (s->s.h.highprecisionmvs) {
3868 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3869 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3870 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3871 s->counts.mv_comp[i].hp[1], 20, 128);
/* luma intra-mode tree; `sum` is peeled down as the tree descends */
3876 for (i = 0; i < 4; i++) {
3877 uint8_t *pp = p->y_mode[i];
3878 unsigned *c = s->counts.y_mode[i], sum, s2;
3880 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3881 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3882 sum -= c[TM_VP8_PRED];
3883 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3884 sum -= c[VERT_PRED];
3885 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3886 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3888 adapt_prob(&pp[3], s2, sum, 20, 128);
3890 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3891 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3892 sum -= c[DIAG_DOWN_LEFT_PRED];
3893 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3894 sum -= c[VERT_LEFT_PRED];
3895 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3896 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
/* chroma intra-mode tree: same shape, conditioned on the luma mode */
3900 for (i = 0; i < 10; i++) {
3901 uint8_t *pp = p->uv_mode[i];
3902 unsigned *c = s->counts.uv_mode[i], sum, s2;
3904 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3905 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3906 sum -= c[TM_VP8_PRED];
3907 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3908 sum -= c[VERT_PRED];
3909 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3910 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3912 adapt_prob(&pp[3], s2, sum, 20, 128);
3914 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3915 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3916 sum -= c[DIAG_DOWN_LEFT_PRED];
3917 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3918 sum -= c[VERT_LEFT_PRED];
3919 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3920 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3924 static void free_buffers(VP9Context *s)
3926 av_freep(&s->intra_pred_data[0]);
3927 av_freep(&s->b_base);
3928 av_freep(&s->block_base);
/* Full decoder teardown: unref and free the three internal frames, all 8
 * current and 8 pending reference slots, then the scratch buffers. */
3931 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3933 VP9Context *s = ctx->priv_data;
3936 for (i = 0; i < 3; i++) {
3937 if (s->s.frames[i].tf.f->buf[0])
3938 vp9_unref_frame(ctx, &s->s.frames[i]);
3939 av_frame_free(&s->s.frames[i].tf.f);
3941 for (i = 0; i < 8; i++) {
3942 if (s->s.refs[i].f->buf[0])
3943 ff_thread_release_buffer(ctx, &s->s.refs[i]);
3944 av_frame_free(&s->s.refs[i].f);
3945 if (s->next_refs[i].f->buf[0])
3946 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3947 av_frame_free(&s->next_refs[i].f);
/* Top-level per-packet decode: parse the frame header, set up the current
 * and reference frames, run the (optionally two-pass) tile decode plus
 * loop filter, then rotate the reference slots and emit the output frame.
 * Returns <0 on error. */
3957 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3958 int *got_frame, AVPacket *pkt)
3960 const uint8_t *data = pkt->data;
3961 int size = pkt->size;
3962 VP9Context *s = ctx->priv_data;
3963 int res, tile_row, tile_col, i, ref, row, col;
/* the previous segmentation map can be reused when segmentation is off or
 * its map is not being updated this frame */
3964 int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3965 (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3966 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3970 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
/* res == 0: "show existing frame" -- output reference `ref` directly,
 * no pixel decoding happens */
3972 } else if (res == 0) {
3973 if (!s->s.refs[ref].f->buf[0]) {
3974 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3975 return AVERROR_INVALIDDATA;
3977 if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3979 ((AVFrame *)frame)->pkt_pts = pkt->pts;
3980 ((AVFrame *)frame)->pkt_dts = pkt->dts;
/* carry all reference slots forward unchanged */
3981 for (i = 0; i < 8; i++) {
3982 if (s->next_refs[i].f->buf[0])
3983 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3984 if (s->s.refs[i].f->buf[0] &&
3985 (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
/* retire/refresh the segmentation-map and mv-pair helper frames, then
 * allocate the frame we are about to decode into */
3994 if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
3995 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
3996 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
3997 if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
3998 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4001 if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4002 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4003 if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4004 (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4006 if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4007 vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4008 if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4010 f = s->s.frames[CUR_FRAME].tf.f;
4011 f->key_frame = s->s.h.keyframe;
4012 f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4013 ls_y = f->linesize[0];
4014 ls_uv =f->linesize[1];
/* a stale segmentation map from a differently-sized frame is unusable */
4016 if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4017 (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4018 s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4019 vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
/* build next_refs: slots flagged in refreshrefmask get the new frame,
 * the rest carry the existing reference forward */
4023 for (i = 0; i < 8; i++) {
4024 if (s->next_refs[i].f->buf[0])
4025 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4026 if (s->s.h.refreshrefmask & (1 << i)) {
4027 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4028 } else if (s->s.refs[i].f->buf[0]) {
4029 res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
/* hardware-accelerated path: hand the whole packet to the hwaccel */
4036 res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4039 res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4042 res = ctx->hwaccel->end_frame(ctx);
4048 // main tile decode loop
/* reset all "above" contexts for the new frame */
4049 bytesperpixel = s->bytesperpixel;
4050 memset(s->above_partition_ctx, 0, s->cols);
4051 memset(s->above_skip_ctx, 0, s->cols);
4052 if (s->s.h.keyframe || s->s.h.intraonly) {
4053 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4055 memset(s->above_mode_ctx, NEARESTMV, s->cols);
4057 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4058 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4059 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4060 memset(s->above_segpred_ctx, 0, s->cols);
/* two-pass decode is used with frame threading when this frame updates the
 * probability context and parallelmode is off */
4061 s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4062 ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4063 if ((res = update_block_buffers(ctx)) < 0) {
4064 av_log(ctx, AV_LOG_ERROR,
4065 "Failed to allocate block buffers\n");
/* in parallelmode the updated context must be committed before decoding so
 * dependent frame threads can start early */
4068 if (s->s.h.refreshctx && s->s.h.parallelmode) {
4071 for (i = 0; i < 4; i++) {
4072 for (j = 0; j < 2; j++)
4073 for (k = 0; k < 2; k++)
4074 for (l = 0; l < 6; l++)
4075 for (m = 0; m < 6; m++)
4076 memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4077 s->prob.coef[i][j][k][l][m], 3);
4078 if (s->s.h.txfmmode == i)
4081 s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4082 ff_thread_finish_setup(ctx);
4083 } else if (!s->s.h.refreshctx) {
4084 ff_thread_finish_setup(ctx);
/* rewind the coefficient/EOB cursors for this pass */
4090 s->block = s->block_base;
4091 s->uvblock[0] = s->uvblock_base[0];
4092 s->uvblock[1] = s->uvblock_base[1];
4093 s->eob = s->eob_base;
4094 s->uveob[0] = s->uveob_base[0];
4095 s->uveob[1] = s->uveob_base[1];
/* set up one range decoder per tile column; all tiles but the last are
 * prefixed with a 32-bit size field */
4097 for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4098 set_tile_offset(&s->tile_row_start, &s->tile_row_end,
4099 tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
4101 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4104 if (tile_col == s->s.h.tiling.tile_cols - 1 &&
4105 tile_row == s->s.h.tiling.tile_rows - 1) {
4108 tile_size = AV_RB32(data);
4112 if (tile_size > size) {
4113 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4114 return AVERROR_INVALIDDATA;
4116 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4117 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4118 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
4119 return AVERROR_INVALIDDATA;
/* decode superblock rows; tiles in the same row share the sb-row loop so
 * the loop filter and intra backup can run once per row */
4126 for (row = s->tile_row_start; row < s->tile_row_end;
4127 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4128 struct VP9Filter *lflvl_ptr = s->lflvl;
4129 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4131 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
4132 set_tile_offset(&s->tile_col_start, &s->tile_col_end,
4133 tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
/* reset all "left" contexts at each tile-column boundary */
4136 memset(s->left_partition_ctx, 0, 8);
4137 memset(s->left_skip_ctx, 0, 8);
4138 if (s->s.h.keyframe || s->s.h.intraonly) {
4139 memset(s->left_mode_ctx, DC_PRED, 16);
4141 memset(s->left_mode_ctx, NEARESTMV, 8);
4143 memset(s->left_y_nnz_ctx, 0, 16);
4144 memset(s->left_uv_nnz_ctx, 0, 32);
4145 memset(s->left_segpred_ctx, 0, 8);
/* swap in this tile column's range decoder state */
4147 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4150 for (col = s->tile_col_start;
4151 col < s->tile_col_end;
4152 col += 8, yoff2 += 64 * bytesperpixel,
4153 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4154 // FIXME integrate with lf code (i.e. zero after each
4155 // use, similar to invtxfm coefficients, or similar)
4157 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
/* pass 2 replays stored decisions; pass 0/1 parse the bitstream */
4161 decode_sb_mem(ctx, row, col, lflvl_ptr,
4162 yoff2, uvoff2, BL_64X64);
4164 decode_sb(ctx, row, col, lflvl_ptr,
4165 yoff2, uvoff2, BL_64X64);
/* save the range decoder state back for the next sb row of this tile */
4169 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4177 // backup pre-loopfilter reconstruction data for intra
4178 // prediction of next row of sb64s
4179 if (row + 8 < s->rows) {
4180 memcpy(s->intra_pred_data[0],
4181 f->data[0] + yoff + 63 * ls_y,
4182 8 * s->cols * bytesperpixel);
4183 memcpy(s->intra_pred_data[1],
4184 f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4185 8 * s->cols * bytesperpixel >> s->ss_h);
4186 memcpy(s->intra_pred_data[2],
4187 f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4188 8 * s->cols * bytesperpixel >> s->ss_h);
4191 // loopfilter one row
4192 if (s->s.h.filter.level) {
4195 lflvl_ptr = s->lflvl;
4196 for (col = 0; col < s->cols;
4197 col += 8, yoff2 += 64 * bytesperpixel,
4198 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4199 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4203 // FIXME maybe we can make this more finegrained by running the
4204 // loopfilter per-block instead of after each sbrow
4205 // In fact that would also make intra pred left preparation easier?
4206 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
/* after pass 1 of a two-pass decode: adapt probabilities and unblock any
 * frame threads waiting on the updated context */
4210 if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
4212 ff_thread_finish_setup(ctx);
4214 } while (s->pass++ == 1);
4215 ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
/* commit next_refs into the visible reference slots */
4219 for (i = 0; i < 8; i++) {
4220 if (s->s.refs[i].f->buf[0])
4221 ff_thread_release_buffer(ctx, &s->s.refs[i]);
4222 if (s->next_refs[i].f->buf[0] &&
4223 (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
/* only visible frames are returned to the caller */
4227 if (!s->s.h.invisible) {
4228 if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
/* Flush callback (e.g. on seek): drop all internal frames and every
 * reference-slot buffer without freeing the AVFrame shells. */
4236 static void vp9_decode_flush(AVCodecContext *ctx)
4238 VP9Context *s = ctx->priv_data;
4241 for (i = 0; i < 3; i++)
4242 vp9_unref_frame(ctx, &s->s.frames[i]);
4243 for (i = 0; i < 8; i++)
4244 ff_thread_release_buffer(ctx, &s->s.refs[i]);
/* Allocate the AVFrame shells for the 3 internal frames and the 2x8
 * reference slots. On any allocation failure the whole decoder state is
 * torn down via vp9_decode_free() and ENOMEM is returned. */
4247 static int init_frames(AVCodecContext *ctx)
4249 VP9Context *s = ctx->priv_data;
4252 for (i = 0; i < 3; i++) {
4253 s->s.frames[i].tf.f = av_frame_alloc();
4254 if (!s->s.frames[i].tf.f) {
4255 vp9_decode_free(ctx);
4256 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4257 return AVERROR(ENOMEM);
4260 for (i = 0; i < 8; i++) {
4261 s->s.refs[i].f = av_frame_alloc();
4262 s->next_refs[i].f = av_frame_alloc();
4263 if (!s->s.refs[i].f || !s->next_refs[i].f) {
4264 vp9_decode_free(ctx);
4265 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4266 return AVERROR(ENOMEM);
4273 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4275     VP9Context *s = ctx->priv_data;

     // Enable per-frame progress tracking, required for the frame-threading
     // ff_thread_report_progress()/await calls used during decode.
4277     ctx->internal->allocate_progress = 1;
     // -1 marks the loop-filter sharpness as "unset" — presumably so the
     // first frame header forces a rebuild of the filter/mblim LUTs;
     // TODO(review): confirm against the header-parsing code.
4279     s->s.h.filter.sharpness = -1;

     // All remaining init is allocating the frame/reference AVFrame shells.
4281     return init_frames(ctx);
     // Per-thread init for frame threading: each worker copy only needs its
     // own set of AVFrame shells; everything else is synced later via
     // vp9_decode_update_thread_context().
4285 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4287     return init_frames(avctx);
     // Frame-threading sync: copy the inter-frame decoder state from the
     // thread that just finished parsing a frame header (src) into this
     // worker (dst) so it can decode the next frame independently.
4290 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4293     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;

4295     // detect size changes in other threads
     // If geometry, bit depth or pixel format diverged from the source
     // thread, our per-frame buffers are stale. (The reallocation/free path
     // for this case is outside this excerpt.)
4296     if (s->intra_pred_data[0] &&
4297         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
4298          s->rows != ssrc->rows || s->bpp != ssrc->bpp || s->pix_fmt != ssrc->pix_fmt)) {

     // Re-reference the three internal frames from the source thread,
     // dropping any frame we still hold first.
4302     for (i = 0; i < 3; i++) {
4303         if (s->s.frames[i].tf.f->buf[0])
4304             vp9_unref_frame(dst, &s->s.frames[i]);
4305         if (ssrc->s.frames[i].tf.f->buf[0]) {
4306             if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
     // Take the source thread's *next* reference set as our active one:
     // ssrc->next_refs is the ref state after its frame, i.e. the state our
     // next frame must start from.
4310     for (i = 0; i < 8; i++) {
4311         if (s->s.refs[i].f->buf[0])
4312             ff_thread_release_buffer(dst, &s->s.refs[i]);
4313         if (ssrc->next_refs[i].f->buf[0]) {
4314             if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)

     // Plain-value header/state fields carried across frames.
4319     s->s.h.invisible = ssrc->s.h.invisible;
4320     s->s.h.keyframe = ssrc->s.h.keyframe;
4321     s->s.h.intraonly = ssrc->s.h.intraonly;
4322     s->ss_v = ssrc->ss_v;
4323     s->ss_h = ssrc->ss_h;
4324     s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4325     s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4326     s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4327     s->bytesperpixel = ssrc->bytesperpixel;
4329     s->bpp_index = ssrc->bpp_index;
4330     s->pix_fmt = ssrc->pix_fmt;
     // Adapted probability context plus loop-filter deltas and segmentation
     // features — the coding state the next frame's header updates build on.
4331     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4332     memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4333     memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4334            sizeof(s->s.h.segmentation.feat));
4340 AVCodec ff_vp9_decoder = {
4342 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4343 .type = AVMEDIA_TYPE_VIDEO,
4344 .id = AV_CODEC_ID_VP9,
4345 .priv_data_size = sizeof(VP9Context),
4346 .init = vp9_decode_init,
4347 .close = vp9_decode_free,
4348 .decode = vp9_decode_frame,
4349 .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4350 .flush = vp9_decode_flush,
4351 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4352 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4353 .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),