git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43 #if HAVE_THREADS
  44             pthread_cond_destroy(&s->thread_data[i].cond);
  45             pthread_mutex_destroy(&s->thread_data[i].lock);
  46 #endif
  47             av_freep(&s->thread_data[i].filter_strength);
  48         }
  49     av_freep(&s->thread_data);
  50     av_freep(&s->macroblocks_base);
  51     av_freep(&s->intra4x4_pred_mode_top);
  52     av_freep(&s->top_nnz);
  53     av_freep(&s->top_border);
  54
  55     s->macroblocks = NULL;
  56 }
  57
  58 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  59 {
  60     int ret;
  61     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  62                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  63         return ret;
  64     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  65         ff_thread_release_buffer(s->avctx, &f->tf);
  66         return AVERROR(ENOMEM);
  67     }
  68     return 0;
  69 }
  70
  71 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  72 {
  73     av_buffer_unref(&f->seg_map);
  74     ff_thread_release_buffer(s->avctx, &f->tf);
  75 }
  76
  77 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  78 {
  79     int ret;
  80
  81     vp8_release_frame(s, dst);
  82
  83     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  84         return ret;
  85     if (src->seg_map &&
  86         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  87         vp8_release_frame(s, dst);
  88         return AVERROR(ENOMEM);
  89     }
  90
  91     return 0;
  92 }
  93
  94
  95 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  96 {
  97     VP8Context *s = avctx->priv_data;
  98     int i;
  99
 100     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 101         vp8_release_frame(s, &s->frames[i]);
 102     memset(s->framep, 0, sizeof(s->framep));
 103
 104     if (free_mem)
 105         free_buffers(s);
 106 }
 107
 108 static void vp8_decode_flush(AVCodecContext *avctx)
 109 {
 110     vp8_decode_flush_impl(avctx, 0);
 111 }
 112
 113 static int update_dimensions(VP8Context *s, int width, int height)
 114 {
 115     AVCodecContext *avctx = s->avctx;
 116     int i;
 117
 118     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 119         height != s->avctx->height) {
 120         if (av_image_check_size(width, height, 0, s->avctx))
 121             return AVERROR_INVALIDDATA;
 122
 123         vp8_decode_flush_impl(s->avctx, 1);
 124
 125         avcodec_set_dimensions(s->avctx, width, height);
 126     }
 127
 128     s->mb_width  = (s->avctx->coded_width +15) / 16;
 129     s->mb_height = (s->avctx->coded_height+15) / 16;
 130
 131     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 132     if (!s->mb_layout) { // Frame threading and one thread
 133         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 134         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 135     }
 136     else // Sliced threading
 137         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 138     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 139     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 140     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 141
 142     for (i = 0; i < MAX_THREADS; i++) {
 143         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 144 #if HAVE_THREADS
 145         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 146         pthread_cond_init(&s->thread_data[i].cond, NULL);
 147 #endif
 148     }
 149
 150     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 151         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 152         return AVERROR(ENOMEM);
 153
 154     s->macroblocks        = s->macroblocks_base + 1;
 155
 156     return 0;
 157 }
 158
 159 static void parse_segment_info(VP8Context *s)
 160 {
 161     VP56RangeCoder *c = &s->c;
 162     int i;
 163
 164     s->segmentation.update_map = vp8_rac_get(c);
 165
 166     if (vp8_rac_get(c)) { // update segment feature data
 167         s->segmentation.absolute_vals = vp8_rac_get(c);
 168
 169         for (i = 0; i < 4; i++)
 170             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 171
 172         for (i = 0; i < 4; i++)
 173             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 174     }
 175     if (s->segmentation.update_map)
 176         for (i = 0; i < 3; i++)
 177             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 178 }
 179
 180 static void update_lf_deltas(VP8Context *s)
 181 {
 182     VP56RangeCoder *c = &s->c;
 183     int i;
 184
 185     for (i = 0; i < 4; i++) {
 186         if (vp8_rac_get(c)) {
 187             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 188
 189             if (vp8_rac_get(c))
 190                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 191         }
 192     }
 193
 194     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 195         if (vp8_rac_get(c)) {
 196             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 197
 198             if (vp8_rac_get(c))
 199                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 200         }
 201     }
 202 }
 203
 204 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 205 {
 206     const uint8_t *sizes = buf;
 207     int i;
 208
 209     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 210
 211     buf      += 3*(s->num_coeff_partitions-1);
 212     buf_size -= 3*(s->num_coeff_partitions-1);
 213     if (buf_size < 0)
 214         return -1;
 215
 216     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 217         int size = AV_RL24(sizes + 3*i);
 218         if (buf_size - size < 0)
 219             return -1;
 220
 221         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 222         buf      += size;
 223         buf_size -= size;
 224     }
 225     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 226
 227     return 0;
 228 }
 229
 230 static void get_quants(VP8Context *s)
 231 {
 232     VP56RangeCoder *c = &s->c;
 233     int i, base_qi;
 234
 235     int yac_qi     = vp8_rac_get_uint(c, 7);
 236     int ydc_delta  = vp8_rac_get_sint(c, 4);
 237     int y2dc_delta = vp8_rac_get_sint(c, 4);
 238     int y2ac_delta = vp8_rac_get_sint(c, 4);
 239     int uvdc_delta = vp8_rac_get_sint(c, 4);
 240     int uvac_delta = vp8_rac_get_sint(c, 4);
 241
 242     for (i = 0; i < 4; i++) {
 243         if (s->segmentation.enabled) {
 244             base_qi = s->segmentation.base_quant[i];
 245             if (!s->segmentation.absolute_vals)
 246                 base_qi += yac_qi;
 247         } else
 248             base_qi = yac_qi;
 249
 250         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 251         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 252         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 253         /* 101581>>16 is equivalent to 155/100 */
 254         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 255         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 256         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 257
 258         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 259         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 260     }
 261 }
 262
 263 /**
 264  * Determine which buffers golden and altref should be updated with after this frame.
 265  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 266  *
 267  * Intra frames update all 3 references
 268  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 269  * If the update (golden|altref) flag is set, it's updated with the current frame
 270  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 271  * If the flag is not set, the number read means:
 272  *      0: no update
 273  *      1: VP56_FRAME_PREVIOUS
 274  *      2: update golden with altref, or update altref with golden
 275  */
 276 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 277 {
 278     VP56RangeCoder *c = &s->c;
 279
 280     if (update)
 281         return VP56_FRAME_CURRENT;
 282
 283     switch (vp8_rac_get_uint(c, 2)) {
 284     case 1:
 285         return VP56_FRAME_PREVIOUS;
 286     case 2:
 287         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 288     }
 289     return VP56_FRAME_NONE;
 290 }
 291
 292 static void update_refs(VP8Context *s)
 293 {
 294     VP56RangeCoder *c = &s->c;
 295
 296     int update_golden = vp8_rac_get(c);
 297     int update_altref = vp8_rac_get(c);
 298
 299     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 300     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 301 }
 302
 303 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 304 {
 305     VP56RangeCoder *c = &s->c;
 306     int header_size, hscale, vscale, i, j, k, l, m, ret;
 307     int width  = s->avctx->width;
 308     int height = s->avctx->height;
 309
 310     s->keyframe  = !(buf[0] & 1);
 311     s->profile   =  (buf[0]>>1) & 7;
 312     s->invisible = !(buf[0] & 0x10);
 313     header_size  = AV_RL24(buf) >> 5;
 314     buf      += 3;
 315     buf_size -= 3;
 316
 317     if (s->profile > 3)
 318         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 319
 320     if (!s->profile)
 321         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 322     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 323         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 324
 325     if (header_size > buf_size - 7*s->keyframe) {
 326         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 327         return AVERROR_INVALIDDATA;
 328     }
 329
 330     if (s->keyframe) {
 331         if (AV_RL24(buf) != 0x2a019d) {
 332             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 333             return AVERROR_INVALIDDATA;
 334         }
 335         width  = AV_RL16(buf+3) & 0x3fff;
 336         height = AV_RL16(buf+5) & 0x3fff;
 337         hscale = buf[4] >> 6;
 338         vscale = buf[6] >> 6;
 339         buf      += 7;
 340         buf_size -= 7;
 341
 342         if (hscale || vscale)
 343             avpriv_request_sample(s->avctx, "Upscaling");
 344
 345         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 346         for (i = 0; i < 4; i++)
 347             for (j = 0; j < 16; j++)
 348                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 349                        sizeof(s->prob->token[i][j]));
 350         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 351         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 352         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 353         memset(&s->segmentation, 0, sizeof(s->segmentation));
 354         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 355     }
 356
 357     ff_vp56_init_range_decoder(c, buf, header_size);
 358     buf      += header_size;
 359     buf_size -= header_size;
 360
 361     if (s->keyframe) {
 362         if (vp8_rac_get(c))
 363             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 364         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 365     }
 366
 367     if ((s->segmentation.enabled = vp8_rac_get(c)))
 368         parse_segment_info(s);
 369     else
 370         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 371
 372     s->filter.simple    = vp8_rac_get(c);
 373     s->filter.level     = vp8_rac_get_uint(c, 6);
 374     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 375
 376     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 377         if (vp8_rac_get(c))
 378             update_lf_deltas(s);
 379
 380     if (setup_partitions(s, buf, buf_size)) {
 381         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 382         return AVERROR_INVALIDDATA;
 383     }
 384
 385     if (!s->macroblocks_base || /* first frame */
 386         width != s->avctx->width || height != s->avctx->height || (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) {
 387         if ((ret = update_dimensions(s, width, height)) < 0)
 388             return ret;
 389     }
 390
 391     get_quants(s);
 392
 393     if (!s->keyframe) {
 394         update_refs(s);
 395         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 396         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 397     }
 398
 399     // if we aren't saving this frame's probabilities for future frames,
 400     // make a copy of the current probabilities
 401     if (!(s->update_probabilities = vp8_rac_get(c)))
 402         s->prob[1] = s->prob[0];
 403
 404     s->update_last = s->keyframe || vp8_rac_get(c);
 405
 406     for (i = 0; i < 4; i++)
 407         for (j = 0; j < 8; j++)
 408             for (k = 0; k < 3; k++)
 409                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 410                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 411                         int prob = vp8_rac_get_uint(c, 8);
 412                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 413                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 414                     }
 415
 416     if ((s->mbskip_enabled = vp8_rac_get(c)))
 417         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 418
 419     if (!s->keyframe) {
 420         s->prob->intra  = vp8_rac_get_uint(c, 8);
 421         s->prob->last   = vp8_rac_get_uint(c, 8);
 422         s->prob->golden = vp8_rac_get_uint(c, 8);
 423
 424         if (vp8_rac_get(c))
 425             for (i = 0; i < 4; i++)
 426                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 427         if (vp8_rac_get(c))
 428             for (i = 0; i < 3; i++)
 429                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 430
 431         // 17.2 MV probability update
 432         for (i = 0; i < 2; i++)
 433             for (j = 0; j < 19; j++)
 434                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 435                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 436     }
 437
 438     return 0;
 439 }
 440
 441 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 442 {
 443     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 444     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 445 }
 446
 447 /**
 448  * Motion vector coding, 17.1.
 449  */
 450 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 451 {
 452     int bit, x = 0;
 453
 454     if (vp56_rac_get_prob_branchy(c, p[0])) {
 455         int i;
 456
 457         for (i = 0; i < 3; i++)
 458             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 459         for (i = 9; i > 3; i--)
 460             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 461         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 462             x += 8;
 463     } else {
 464         // small_mvtree
 465         const uint8_t *ps = p+2;
 466         bit = vp56_rac_get_prob(c, *ps);
 467         ps += 1 + 3*bit;
 468         x  += 4*bit;
 469         bit = vp56_rac_get_prob(c, *ps);
 470         ps += 1 + bit;
 471         x  += 2*bit;
 472         x  += vp56_rac_get_prob(c, *ps);
 473     }
 474
 475     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 476 }
 477
 478 static av_always_inline
 479 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 480 {
 481     if (left == top)
 482         return vp8_submv_prob[4-!!left];
 483     if (!top)
 484         return vp8_submv_prob[2];
 485     return vp8_submv_prob[1-!!left];
 486 }
 487
 488 /**
 489  * Split motion vector prediction, 16.4.
 490  * @returns the number of motion vectors parsed (2, 4 or 16)
 491  */
 492 static av_always_inline
 493 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 494 {
 495     int part_idx;
 496     int n, num;
 497     VP8Macroblock *top_mb;
 498     VP8Macroblock *left_mb = &mb[-1];
 499     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 500                   *mbsplits_top,
 501                   *mbsplits_cur, *firstidx;
 502     VP56mv *top_mv;
 503     VP56mv *left_mv = left_mb->bmv;
 504     VP56mv *cur_mv  = mb->bmv;
 505
 506     if (!layout) // layout is inlined, s->mb_layout is not
 507         top_mb = &mb[2];
 508     else
 509         top_mb = &mb[-s->mb_width-1];
 510     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 511     top_mv = top_mb->bmv;
 512
 513     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 514         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 515             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 516         } else {
 517             part_idx = VP8_SPLITMVMODE_8x8;
 518         }
 519     } else {
 520         part_idx = VP8_SPLITMVMODE_4x4;
 521     }
 522
 523     num = vp8_mbsplit_count[part_idx];
 524     mbsplits_cur = vp8_mbsplits[part_idx],
 525     firstidx = vp8_mbfirstidx[part_idx];
 526     mb->partitioning = part_idx;
 527
 528     for (n = 0; n < num; n++) {
 529         int k = firstidx[n];
 530         uint32_t left, above;
 531         const uint8_t *submv_prob;
 532
 533         if (!(k & 3))
 534             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 535         else
 536             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 537         if (k <= 3)
 538             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 539         else
 540             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 541
 542         submv_prob = get_submv_prob(left, above);
 543
 544         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 545             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 546                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 547                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 548                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 549                 } else {
 550                     AV_ZERO32(&mb->bmv[n]);
 551                 }
 552             } else {
 553                 AV_WN32A(&mb->bmv[n], above);
 554             }
 555         } else {
 556             AV_WN32A(&mb->bmv[n], left);
 557         }
 558     }
 559
 560     return num;
 561 }
 562
/**
 * Decode the motion vector mode and motion vector(s) of an inter macroblock.
 *
 * The motion vectors of the top, left and top-left neighbours are collected
 * as candidates and weighted; the weights select the probabilities used to
 * read the mode (zero / nearest / near / new / split) from s->c.
 * On return mb->mode, mb->partitioning, mb->mv and mb->bmv[] are set.
 *
 * @param s      decoder context
 * @param mb     current macroblock; mb->ref_frame must already be set
 * @param mb_x   macroblock column (not used in this function)
 * @param mb_y   macroblock row (not used in this function)
 * @param layout macroblock array layout, selects where the top and top-left
 *               neighbours are found relative to mb
 */
static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0 /* top */,
                                  mb - 1 /* left */,
                                  0 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    // index of the most recently stored candidate in near_mv[]
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    // accumulated neighbour weight per candidate slot
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    }
    else {
        mb_edge[0] = mb - s->mb_width-1;
        mb_edge[2] = mb - s->mb_width-2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    // For an inter-coded neighbour n (0 = top, 1 = left, 2 = top-left):
    // a zero vector adds to cnt[CNT_ZERO]; a non-zero vector is negated if
    // the sign bias of its reference differs from the current one, stored
    // as a new candidate unless it equals the latest one, and adds to that
    // candidate's count. Top and left weigh 2, top-left weighs 1.
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    // the mode is read as a chain of binary decisions, each using the
    // probability selected by one of the counts
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                // cnt[CNT_SPLITMV] is reused as the split context: left and
                // top split neighbours count 2 each, top-left counts 1
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    // the macroblock vector becomes the last decoded sub-vector
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
                } else {
                    // new vector: deltas on top of the clamped prediction,
                    // y is coded before x
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
 661
 662 static av_always_inline
 663 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 664                            int mb_x, int keyframe, int layout)
 665 {
 666     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 667
 668     if (layout == 1) {
 669         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 670         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 671     }
 672     if (keyframe) {
 673         int x, y;
 674         uint8_t* top;
 675         uint8_t* const left = s->intra4x4_pred_mode_left;
 676         if (layout == 1)
 677             top = mb->intra4x4_pred_mode_top;
 678         else
 679             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 680         for (y = 0; y < 4; y++) {
 681             for (x = 0; x < 4; x++) {
 682                 const uint8_t *ctx;
 683                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 684                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 685                 left[y] = top[x] = *intra4x4;
 686                 intra4x4++;
 687             }
 688         }
 689     } else {
 690         int i;
 691         for (i = 0; i < 16; i++)
 692             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 693     }
 694 }
 695
 696 static av_always_inline
 697 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 698                     uint8_t *segment, uint8_t *ref, int layout)
 699 {
 700     VP56RangeCoder *c = &s->c;
 701
 702     if (s->segmentation.update_map) {
 703         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 704         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 705     } else if (s->segmentation.enabled)
 706         *segment = ref ? *ref : *segment;
 707     mb->segment = *segment;
 708
 709     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 710
 711     if (s->keyframe) {
 712         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 713
 714         if (mb->mode == MODE_I4x4) {
 715             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 716         } else {
 717             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 718             if (s->mb_layout == 1)
 719                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 720             else
 721                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 722             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 723         }
 724
 725         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 726         mb->ref_frame = VP56_FRAME_CURRENT;
 727     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 728         // inter MB, 16.2
 729         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 730             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 731                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 732         else
 733             mb->ref_frame = VP56_FRAME_PREVIOUS;
 734         s->ref_count[mb->ref_frame-1]++;
 735
 736         // motion vectors, 16.3
 737         decode_mvs(s, mb, mb_x, mb_y, layout);
 738     } else {
 739         // intra MB, 16.1
 740         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 741
 742         if (mb->mode == MODE_I4x4)
 743             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 744
 745         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 746         mb->ref_frame = VP56_FRAME_CURRENT;
 747         mb->partitioning = VP8_SPLITMVMODE_NONE;
 748         AV_ZERO32(&mb->bmv[0]);
 749     }
 750 }
 751
// NOTE(review): presumably an architecture-specific header (e.g. arm/vp8.h,
// included above) may define decode_block_coeffs_internal as a macro to
// substitute its own implementation - confirm.
#ifndef decode_block_coeffs_internal
/**
 * Decode the remaining coefficient tokens of one block and store the
 * dequantized values in zigzag order.
 *
 * Decoding starts after the first end-of-block test: the function begins at
 * the "is it zero" decision for coefficient i.
 *
 * @param r arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token to decode
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    // work on a local copy of the reader; it is written back before returning
    VP56RangeCoder c = *r;
    // the first EOB test is skipped on entry (decode_block_coeffs() performs
    // it before calling this function)
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            // after a zero, context 0 is used and no EOB test is made
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            // next token uses context 1 (previous coefficient was one)
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(&c, token_prob[8]);
                    int b = vp56_rac_get_prob(&c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    // base value of the category, extra bits are added below
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            // next token uses context 2 (previous coefficient was larger than one)
            token_prob = probs[i+1][2];
        }
        // one more bit gives the sign; qmul[0] applies to index 0 only
        block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
#endif
 817
 818 /**
 819  * @param c arithmetic bitstream reader context
 820  * @param block destination for block coefficients
 821  * @param probs probabilities to use when reading trees from the bitstream
 822  * @param i initial coeff index, 0 unless a separate DC block is coded
 823  * @param zero_nhood the initial prediction context for number of surrounding
 824  *                   all-zero blocks (only left/top, so 0-2)
 825  * @param qmul array holding the dc/ac dequant factor at position 0/1
 826  * @return 0 if no coeffs were decoded
 827  *         otherwise, the index of the last coeff decoded plus one
 828  */
 829 static av_always_inline
 830 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
 831                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 832                         int i, int zero_nhood, int16_t qmul[2])
 833 {
 834     uint8_t *token_prob = probs[i][zero_nhood];
 835     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 836         return 0;
 837     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 838 }
 839
 840 static av_always_inline
 841 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 842                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 843 {
 844     int i, x, y, luma_start = 0, luma_ctx = 3;
 845     int nnz_pred, nnz, nnz_total = 0;
 846     int segment = mb->segment;
 847     int block_dc = 0;
 848
 849     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 850         nnz_pred = t_nnz[8] + l_nnz[8];
 851
 852         // decode DC values and do hadamard
 853         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 854                                   s->qmat[segment].luma_dc_qmul);
 855         l_nnz[8] = t_nnz[8] = !!nnz;
 856         if (nnz) {
 857             nnz_total += nnz;
 858             block_dc = 1;
 859             if (nnz == 1)
 860                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 861             else
 862                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 863         }
 864         luma_start = 1;
 865         luma_ctx = 0;
 866     }
 867
 868     // luma blocks
 869     for (y = 0; y < 4; y++)
 870         for (x = 0; x < 4; x++) {
 871             nnz_pred = l_nnz[y] + t_nnz[x];
 872             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 873                                       nnz_pred, s->qmat[segment].luma_qmul);
 874             // nnz+block_dc may be one more than the actual last index, but we don't care
 875             td->non_zero_count_cache[y][x] = nnz + block_dc;
 876             t_nnz[x] = l_nnz[y] = !!nnz;
 877             nnz_total += nnz;
 878         }
 879
 880     // chroma blocks
 881     // TODO: what to do about dimensions? 2nd dim for luma is x,
 882     // but for chroma it's (y<<1)|x
 883     for (i = 4; i < 6; i++)
 884         for (y = 0; y < 2; y++)
 885             for (x = 0; x < 2; x++) {
 886                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 887                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 888                                           nnz_pred, s->qmat[segment].chroma_qmul);
 889                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 890                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 891                 nnz_total += nnz;
 892             }
 893
 894     // if there were no coded coeffs despite the macroblock not being marked skip,
 895     // we MUST not do the inner loop filter and should not do IDCT
 896     // Since skip isn't used for bitstream prediction, just manually set it.
 897     if (!nnz_total)
 898         mb->skip = 1;
 899 }
 900
 901 static av_always_inline
 902 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 903                       int linesize, int uvlinesize, int simple)
 904 {
 905     AV_COPY128(top_border, src_y + 15*linesize);
 906     if (!simple) {
 907         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 908         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 909     }
 910 }
 911
 912 static av_always_inline
 913 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 914                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 915                     int simple, int xchg)
 916 {
 917     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 918     src_y  -=   linesize;
 919     src_cb -= uvlinesize;
 920     src_cr -= uvlinesize;
 921
 922 #define XCHG(a,b,xchg) do {                     \
 923         if (xchg) AV_SWAP64(b,a);               \
 924         else      AV_COPY64(b,a);               \
 925     } while (0)
 926
 927     XCHG(top_border_m1+8, src_y-8, xchg);
 928     XCHG(top_border,      src_y,   xchg);
 929     XCHG(top_border+8,    src_y+8, 1);
 930     if (mb_x < mb_width-1)
 931         XCHG(top_border+32, src_y+16, 1);
 932
 933     // only copy chroma for normal loop filter
 934     // or to initialize the top row to 127
 935     if (!simple || !mb_y) {
 936         XCHG(top_border_m1+16, src_cb-8, xchg);
 937         XCHG(top_border_m1+24, src_cr-8, xchg);
 938         XCHG(top_border+16,    src_cb, 1);
 939         XCHG(top_border+24,    src_cr, 1);
 940     }
 941 }
 942
 943 static av_always_inline
 944 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 945 {
 946     if (!mb_x) {
 947         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 948     } else {
 949         return mb_y ? mode : LEFT_DC_PRED8x8;
 950     }
 951 }
 952
 953 static av_always_inline
 954 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 955 {
 956     if (!mb_x) {
 957         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 958     } else {
 959         return mb_y ? mode : HOR_PRED8x8;
 960     }
 961 }
 962
 963 static av_always_inline
 964 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 965 {
 966     if (mode == DC_PRED8x8) {
 967         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 968     } else {
 969         return mode;
 970     }
 971 }
 972
 973 static av_always_inline
 974 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 975 {
 976     switch (mode) {
 977     case DC_PRED8x8:
 978         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 979     case VERT_PRED8x8:
 980         return !mb_y ? DC_127_PRED8x8 : mode;
 981     case HOR_PRED8x8:
 982         return !mb_x ? DC_129_PRED8x8 : mode;
 983     case PLANE_PRED8x8 /*TM*/:
 984         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 985     }
 986     return mode;
 987 }
 988
 989 static av_always_inline
 990 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 991 {
 992     if (!mb_x) {
 993         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 994     } else {
 995         return mb_y ? mode : HOR_VP8_PRED;
 996     }
 997 }
 998
 999 static av_always_inline
1000 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1001 {
1002     switch (mode) {
1003     case VERT_PRED:
1004         if (!mb_x && mb_y) {
1005             *copy_buf = 1;
1006             return mode;
1007         }
1008         /* fall-through */
1009     case DIAG_DOWN_LEFT_PRED:
1010     case VERT_LEFT_PRED:
1011         return !mb_y ? DC_127_PRED : mode;
1012     case HOR_PRED:
1013         if (!mb_y) {
1014             *copy_buf = 1;
1015             return mode;
1016         }
1017         /* fall-through */
1018     case HOR_UP_PRED:
1019         return !mb_x ? DC_129_PRED : mode;
1020     case TM_VP8_PRED:
1021         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1022     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1023     case DIAG_DOWN_RIGHT_PRED:
1024     case VERT_RIGHT_PRED:
1025     case HOR_DOWN_PRED:
1026         if (!mb_y || !mb_x)
1027             *copy_buf = 1;
1028         return mode;
1029     }
1030     return mode;
1031 }
1032
1033 static av_always_inline
1034 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1035                    VP8Macroblock *mb, int mb_x, int mb_y)
1036 {
1037     AVCodecContext *avctx = s->avctx;
1038     int x, y, mode, nnz;
1039     uint32_t tr;
1040
1041     // for the first row, we need to run xchg_mb_border to init the top edge to 127
1042     // otherwise, skip it if we aren't going to deblock
1043     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1044         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1045                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1046                        s->filter.simple, 1);
1047
1048     if (mb->mode < MODE_I4x4) {
1049         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1050             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1051         } else {
1052             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1053         }
1054         s->hpc.pred16x16[mode](dst[0], s->linesize);
1055     } else {
1056         uint8_t *ptr = dst[0];
1057         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1058         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1059
1060         // all blocks on the right edge of the macroblock use bottom edge
1061         // the top macroblock for their topright edge
1062         uint8_t *tr_right = ptr - s->linesize + 16;
1063
1064         // if we're on the right edge of the frame, said edge is extended
1065         // from the top macroblock
1066         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1067             mb_x == s->mb_width-1) {
1068             tr = tr_right[-1]*0x01010101u;
1069             tr_right = (uint8_t *)&tr;
1070         }
1071
1072         if (mb->skip)
1073             AV_ZERO128(td->non_zero_count_cache);
1074
1075         for (y = 0; y < 4; y++) {
1076             uint8_t *topright = ptr + 4 - s->linesize;
1077             for (x = 0; x < 4; x++) {
1078                 int copy = 0, linesize = s->linesize;
1079                 uint8_t *dst = ptr+4*x;
1080                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1081
1082                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1083                     topright = tr_top;
1084                 } else if (x == 3)
1085                     topright = tr_right;
1086
1087                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1088                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1089                     if (copy) {
1090                         dst = copy_dst + 12;
1091                         linesize = 8;
1092                         if (!(mb_y + y)) {
1093                             copy_dst[3] = 127U;
1094                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1095                         } else {
1096                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1097                             if (!(mb_x + x)) {
1098                                 copy_dst[3] = 129U;
1099                             } else {
1100                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1101                             }
1102                         }
1103                         if (!(mb_x + x)) {
1104                             copy_dst[11] =
1105                             copy_dst[19] =
1106                             copy_dst[27] =
1107                             copy_dst[35] = 129U;
1108                         } else {
1109                             copy_dst[11] = ptr[4*x              -1];
1110                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1111                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1112                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1113                         }
1114                     }
1115                 } else {
1116                     mode = intra4x4[x];
1117                 }
1118                 s->hpc.pred4x4[mode](dst, topright, linesize);
1119                 if (copy) {
1120                     AV_COPY32(ptr+4*x              , copy_dst+12);
1121                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1122                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1123                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1124                 }
1125
1126                 nnz = td->non_zero_count_cache[y][x];
1127                 if (nnz) {
1128                     if (nnz == 1)
1129                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
1130                     else
1131                         s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
1132                 }
1133                 topright += 4;
1134             }
1135
1136             ptr   += 4*s->linesize;
1137             intra4x4 += 4;
1138         }
1139     }
1140
1141     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1142         mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
1143     } else {
1144         mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
1145     }
1146     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1147     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1148
1149     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1150         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1151                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1152                        s->filter.simple, 0);
1153 }
1154
/* Per sub-pixel position (0-7) lookup used by the MC functions:
 *   row 0: nr. of left extra pixels, also function pointer index
 *   row 1: nr. of extra pixels required
 *   row 2: nr. of right extra pixels
 */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1161
1162 /**
1163  * luma MC function
1164  *
1165  * @param s VP8 decoding context
1166  * @param dst target buffer for block data at block position
1167  * @param ref reference picture buffer at origin (0, 0)
1168  * @param mv motion vector (relative to block position) to get pixel data from
1169  * @param x_off horizontal position of block from origin (0, 0)
1170  * @param y_off vertical position of block from origin (0, 0)
1171  * @param block_w width of block (16, 8 or 4)
1172  * @param block_h height of block (always same as block_w)
1173  * @param width width of src/dst plane data
1174  * @param height height of src/dst plane data
1175  * @param linesize size of a single line of plane data, including padding
1176  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1177  */
1178 static av_always_inline
1179 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1180                  ThreadFrame *ref, const VP56mv *mv,
1181                  int x_off, int y_off, int block_w, int block_h,
1182                  int width, int height, ptrdiff_t linesize,
1183                  vp8_mc_func mc_func[3][3])
1184 {
1185     uint8_t *src = ref->f->data[0];
1186
1187     if (AV_RN32A(mv)) {
1188         int src_linesize = linesize;
1189         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1190         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1191
1192         x_off += mv->x >> 2;
1193         y_off += mv->y >> 2;
1194
1195         // edge emulation
1196         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1197         src += y_off * linesize + x_off;
1198         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1199             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1200             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1201                                      src - my_idx * linesize - mx_idx, linesize,
1202                                      block_w + subpel_idx[1][mx],
1203                                      block_h + subpel_idx[1][my],
1204                                      x_off - mx_idx, y_off - my_idx, width, height);
1205             src = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1206             src_linesize = 32;
1207         }
1208         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1209     } else {
1210         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1211         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1212     }
1213 }
1214
1215 /**
1216  * chroma MC function
1217  *
1218  * @param s VP8 decoding context
1219  * @param dst1 target buffer for block data at block position (U plane)
1220  * @param dst2 target buffer for block data at block position (V plane)
1221  * @param ref reference picture buffer at origin (0, 0)
1222  * @param mv motion vector (relative to block position) to get pixel data from
1223  * @param x_off horizontal position of block from origin (0, 0)
1224  * @param y_off vertical position of block from origin (0, 0)
1225  * @param block_w width of block (16, 8 or 4)
1226  * @param block_h height of block (always same as block_w)
1227  * @param width width of src/dst plane data
1228  * @param height height of src/dst plane data
1229  * @param linesize size of a single line of plane data, including padding
1230  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1231  */
1232 static av_always_inline
1233 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1234                    ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
1235                    int block_w, int block_h, int width, int height, ptrdiff_t linesize,
1236                    vp8_mc_func mc_func[3][3])
1237 {
1238     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1239
1240     if (AV_RN32A(mv)) {
1241         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1242         int my = mv->y&7, my_idx = subpel_idx[0][my];
1243
1244         x_off += mv->x >> 3;
1245         y_off += mv->y >> 3;
1246
1247         // edge emulation
1248         src1 += y_off * linesize + x_off;
1249         src2 += y_off * linesize + x_off;
1250         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1251         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1252             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1253             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1254                                      src1 - my_idx * linesize - mx_idx, linesize,
1255                                      block_w + subpel_idx[1][mx],
1256                                      block_h + subpel_idx[1][my],
1257                                      x_off - mx_idx, y_off - my_idx, width, height);
1258             src1 = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1259             mc_func[my_idx][mx_idx](dst1, linesize, src1, 32, block_h, mx, my);
1260
1261             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1262                                      src2 - my_idx * linesize - mx_idx, linesize,
1263                                      block_w + subpel_idx[1][mx],
1264                                      block_h + subpel_idx[1][my],
1265                                      x_off - mx_idx, y_off - my_idx, width, height);
1266             src2 = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1267             mc_func[my_idx][mx_idx](dst2, linesize, src2, 32, block_h, mx, my);
1268         } else {
1269             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1270             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1271         }
1272     } else {
1273         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1274         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1275         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1276     }
1277 }
1278
1279 static av_always_inline
1280 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1281                  ThreadFrame *ref_frame, int x_off, int y_off,
1282                  int bx_off, int by_off,
1283                  int block_w, int block_h,
1284                  int width, int height, VP56mv *mv)
1285 {
1286     VP56mv uvmv = *mv;
1287
1288     /* Y */
1289     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1290                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1291                 block_w, block_h, width, height, s->linesize,
1292                 s->put_pixels_tab[block_w == 8]);
1293
1294     /* U/V */
1295     if (s->profile == 3) {
1296         uvmv.x &= ~7;
1297         uvmv.y &= ~7;
1298     }
1299     x_off   >>= 1; y_off   >>= 1;
1300     bx_off  >>= 1; by_off  >>= 1;
1301     width   >>= 1; height  >>= 1;
1302     block_w >>= 1; block_h >>= 1;
1303     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1304                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1305                   &uvmv, x_off + bx_off, y_off + by_off,
1306                   block_w, block_h, width, height, s->uvlinesize,
1307                   s->put_pixels_tab[1 + (block_w == 4)]);
1308 }
1309
1310 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1311  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1312 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1313 {
1314     /* Don't prefetch refs that haven't been used very often this frame. */
1315     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1316         int x_off = mb_x << 4, y_off = mb_y << 4;
1317         int mx = (mb->mv.x>>2) + x_off + 8;
1318         int my = (mb->mv.y>>2) + y_off;
1319         uint8_t **src= s->framep[ref]->tf.f->data;
1320         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1321         /* For threading, a ff_thread_await_progress here might be useful, but
1322          * it actually slows down the decoder. Since a bad prefetch doesn't
1323          * generate bad decoder output, we don't run it here. */
1324         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1325         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1326         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1327     }
1328 }
1329
1330 /**
1331  * Apply motion vectors to prediction buffer, chapter 18.
1332  */
1333 static av_always_inline
1334 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1335                    VP8Macroblock *mb, int mb_x, int mb_y)
1336 {
1337     int x_off = mb_x << 4, y_off = mb_y << 4;
1338     int width = 16*s->mb_width, height = 16*s->mb_height;
1339     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1340     VP56mv *bmv = mb->bmv;
1341
1342     switch (mb->partitioning) {
1343     case VP8_SPLITMVMODE_NONE:
1344         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1345                     0, 0, 16, 16, width, height, &mb->mv);
1346         break;
1347     case VP8_SPLITMVMODE_4x4: {
1348         int x, y;
1349         VP56mv uvmv;
1350
1351         /* Y */
1352         for (y = 0; y < 4; y++) {
1353             for (x = 0; x < 4; x++) {
1354                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1355                             ref, &bmv[4*y + x],
1356                             4*x + x_off, 4*y + y_off, 4, 4,
1357                             width, height, s->linesize,
1358                             s->put_pixels_tab[2]);
1359             }
1360         }
1361
1362         /* U/V */
1363         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1364         for (y = 0; y < 2; y++) {
1365             for (x = 0; x < 2; x++) {
1366                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1367                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1368                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1369                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1370                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1371                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1372                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1373                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1374                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1375                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1376                 if (s->profile == 3) {
1377                     uvmv.x &= ~7;
1378                     uvmv.y &= ~7;
1379                 }
1380                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1381                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1382                               4*x + x_off, 4*y + y_off, 4, 4,
1383                               width, height, s->uvlinesize,
1384                               s->put_pixels_tab[2]);
1385             }
1386         }
1387         break;
1388     }
1389     case VP8_SPLITMVMODE_16x8:
1390         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1391                     0, 0, 16, 8, width, height, &bmv[0]);
1392         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1393                     0, 8, 16, 8, width, height, &bmv[1]);
1394         break;
1395     case VP8_SPLITMVMODE_8x16:
1396         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1397                     0, 0, 8, 16, width, height, &bmv[0]);
1398         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1399                     8, 0, 8, 16, width, height, &bmv[1]);
1400         break;
1401     case VP8_SPLITMVMODE_8x8:
1402         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1403                     0, 0, 8, 8, width, height, &bmv[0]);
1404         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1405                     8, 0, 8, 8, width, height, &bmv[1]);
1406         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1407                     0, 8, 8, 8, width, height, &bmv[2]);
1408         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1409                     8, 8, 8, 8, width, height, &bmv[3]);
1410         break;
1411     }
1412 }
1413
1414 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1415                                      uint8_t *dst[3], VP8Macroblock *mb)
1416 {
1417     int x, y, ch;
1418
1419     if (mb->mode != MODE_I4x4) {
1420         uint8_t *y_dst = dst[0];
1421         for (y = 0; y < 4; y++) {
1422             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1423             if (nnz4) {
1424                 if (nnz4&~0x01010101) {
1425                     for (x = 0; x < 4; x++) {
1426                         if ((uint8_t)nnz4 == 1)
1427                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1428                         else if((uint8_t)nnz4 > 1)
1429                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1430                         nnz4 >>= 8;
1431                         if (!nnz4)
1432                             break;
1433                     }
1434                 } else {
1435                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1436                 }
1437             }
1438             y_dst += 4*s->linesize;
1439         }
1440     }
1441
1442     for (ch = 0; ch < 2; ch++) {
1443         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1444         if (nnz4) {
1445             uint8_t *ch_dst = dst[1+ch];
1446             if (nnz4&~0x01010101) {
1447                 for (y = 0; y < 2; y++) {
1448                     for (x = 0; x < 2; x++) {
1449                         if ((uint8_t)nnz4 == 1)
1450                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1451                         else if((uint8_t)nnz4 > 1)
1452                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1453                         nnz4 >>= 8;
1454                         if (!nnz4)
1455                             goto chroma_idct_end;
1456                     }
1457                     ch_dst += 4*s->uvlinesize;
1458                 }
1459             } else {
1460                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1461             }
1462         }
1463 chroma_idct_end: ;
1464     }
1465 }
1466
1467 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1468 {
1469     int interior_limit, filter_level;
1470
1471     if (s->segmentation.enabled) {
1472         filter_level = s->segmentation.filter_level[mb->segment];
1473         if (!s->segmentation.absolute_vals)
1474             filter_level += s->filter.level;
1475     } else
1476         filter_level = s->filter.level;
1477
1478     if (s->lf_delta.enabled) {
1479         filter_level += s->lf_delta.ref[mb->ref_frame];
1480         filter_level += s->lf_delta.mode[mb->mode];
1481     }
1482
1483     filter_level = av_clip_uintp2(filter_level, 6);
1484
1485     interior_limit = filter_level;
1486     if (s->filter.sharpness) {
1487         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1488         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1489     }
1490     interior_limit = FFMAX(interior_limit, 1);
1491
1492     f->filter_level = filter_level;
1493     f->inner_limit = interior_limit;
1494     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1495 }
1496
1497 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1498 {
1499     int mbedge_lim, bedge_lim, hev_thresh;
1500     int filter_level = f->filter_level;
1501     int inner_limit = f->inner_limit;
1502     int inner_filter = f->inner_filter;
1503     int linesize = s->linesize;
1504     int uvlinesize = s->uvlinesize;
1505     static const uint8_t hev_thresh_lut[2][64] = {
1506         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1507           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1508           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1509           3, 3, 3, 3 },
1510         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1511           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1512           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1513           2, 2, 2, 2 }
1514     };
1515
1516     if (!filter_level)
1517         return;
1518
1519      bedge_lim = 2*filter_level + inner_limit;
1520     mbedge_lim = bedge_lim + 4;
1521
1522     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1523
1524     if (mb_x) {
1525         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1526                                        mbedge_lim, inner_limit, hev_thresh);
1527         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1528                                        mbedge_lim, inner_limit, hev_thresh);
1529     }
1530
1531     if (inner_filter) {
1532         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1533                                              inner_limit, hev_thresh);
1534         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1535                                              inner_limit, hev_thresh);
1536         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1537                                              inner_limit, hev_thresh);
1538         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1539                                              uvlinesize,  bedge_lim,
1540                                              inner_limit, hev_thresh);
1541     }
1542
1543     if (mb_y) {
1544         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1545                                        mbedge_lim, inner_limit, hev_thresh);
1546         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1547                                        mbedge_lim, inner_limit, hev_thresh);
1548     }
1549
1550     if (inner_filter) {
1551         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1552                                              linesize,    bedge_lim,
1553                                              inner_limit, hev_thresh);
1554         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1555                                              linesize,    bedge_lim,
1556                                              inner_limit, hev_thresh);
1557         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1558                                              linesize,    bedge_lim,
1559                                              inner_limit, hev_thresh);
1560         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1561                                              dst[2] + 4 * uvlinesize,
1562                                              uvlinesize,  bedge_lim,
1563                                              inner_limit, hev_thresh);
1564     }
1565 }
1566
1567 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1568 {
1569     int mbedge_lim, bedge_lim;
1570     int filter_level = f->filter_level;
1571     int inner_limit = f->inner_limit;
1572     int inner_filter = f->inner_filter;
1573     int linesize = s->linesize;
1574
1575     if (!filter_level)
1576         return;
1577
1578      bedge_lim = 2*filter_level + inner_limit;
1579     mbedge_lim = bedge_lim + 4;
1580
1581     if (mb_x)
1582         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1583     if (inner_filter) {
1584         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1585         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1586         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1587     }
1588
1589     if (mb_y)
1590         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1591     if (inner_filter) {
1592         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1593         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1594         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1595     }
1596 }
1597
1598 #define MARGIN (16 << 2)
1599 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
1600                                    VP8Frame *prev_frame)
1601 {
1602     VP8Context *s = avctx->priv_data;
1603     int mb_x, mb_y;
1604
1605     s->mv_min.y = -MARGIN;
1606     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1607     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1608         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1609         int mb_xy = mb_y*s->mb_width;
1610
1611         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1612
1613         s->mv_min.x = -MARGIN;
1614         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1615         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1616             if (mb_y == 0)
1617                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1618             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1619                            prev_frame && prev_frame->seg_map ?
1620                            prev_frame->seg_map->data + mb_xy : NULL, 1);
1621             s->mv_min.x -= 64;
1622             s->mv_max.x -= 64;
1623         }
1624         s->mv_min.y -= 64;
1625         s->mv_max.y -= 64;
1626     }
1627 }
1628
#if HAVE_THREADS
/*
 * check_thread_pos(td, otd, mb_x_check, mb_y_check)
 *
 * Wait until otd->thread_mb_pos has reached the packed position
 * (mb_y_check << 16) | (mb_x_check & 0xFFFF). While waiting, the target is
 * stored in td->wait_mb_pos; it is reset to INT_MAX once the wait is over.
 * The wait uses otd's mutex and condition variable.
 *
 * Expands to a single statement; the caller supplies the trailing ';'.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/*
 * update_pos(td, mb_y, mb_x)
 *
 * Publish (mb_y << 16) | (mb_x & 0xFFFF) in td->thread_mb_pos. When slice
 * threading is active with more than one job, td's condition variable is
 * broadcast if a neighbour's wait_mb_pos has been reached, or
 * unconditionally when next_td or prev_td is NULL.
 *
 * Relies on avctx, num_jobs, next_td and prev_td being in scope at the
 * expansion site. Expands to a single statement; the caller supplies the
 * trailing ';'.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = (next_td == NULL) || (prev_td == NULL);        \
        int pos_check        = (is_null) ? 1 :                                \
                               (next_td != (td) && pos >= next_td->wait_mb_pos) || \
                               (prev_td != (td) && pos >= prev_td->wait_mb_pos); \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both macros are no-op statements. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x)                        do { } while (0)
#endif
1665
1666 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1667                                         int jobnr, int threadnr)
1668 {
1669     VP8Context *s = avctx->priv_data;
1670     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1671     int mb_y = td->thread_mb_pos>>16;
1672     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1673     int num_jobs = s->num_jobs;
1674     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
1675     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1676     VP8Macroblock *mb;
1677     uint8_t *dst[3] = {
1678         curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1679         curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
1680         curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
1681     };
1682     if (mb_y == 0) prev_td = td;
1683     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1684     if (mb_y == s->mb_height-1) next_td = td;
1685     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1686     if (s->mb_layout == 1)
1687         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1688     else {
1689         // Make sure the previous frame has read its segmentation map,
1690         // if we re-use the same map.
1691         if (prev_frame && s->segmentation.enabled &&
1692             !s->segmentation.update_map)
1693             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
1694         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1695         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1696         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1697     }
1698
1699     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1700     // left edge of 129 for intra prediction
1701     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1702         for (i = 0; i < 3; i++)
1703             for (y = 0; y < 16>>!!i; y++)
1704                 dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
1705         if (mb_y == 1) {
1706             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1707         }
1708     }
1709
1710     s->mv_min.x = -MARGIN;
1711     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1712
1713     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1714         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1715         if (prev_td != td) {
1716             if (threadnr != 0) {
1717                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1718             } else {
1719                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1720             }
1721         }
1722
1723         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1724         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1725
1726         if (!s->mb_layout)
1727             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1728                            prev_frame && prev_frame->seg_map ?
1729                            prev_frame->seg_map->data + mb_xy : NULL, 0);
1730
1731         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1732
1733         if (!mb->skip)
1734             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1735
1736         if (mb->mode <= MODE_I4x4)
1737             intra_predict(s, td, dst, mb, mb_x, mb_y);
1738         else
1739             inter_predict(s, td, dst, mb, mb_x, mb_y);
1740
1741         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1742
1743         if (!mb->skip) {
1744             idct_mb(s, td, dst, mb);
1745         } else {
1746             AV_ZERO64(td->left_nnz);
1747             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1748
1749             // Reset DC block predictors if they would exist if the mb had coefficients
1750             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1751                 td->left_nnz[8]     = 0;
1752                 s->top_nnz[mb_x][8] = 0;
1753             }
1754         }
1755
1756         if (s->deblock_filter)
1757             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1758
1759         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1760             if (s->filter.simple)
1761                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1762             else
1763                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1764         }
1765
1766         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1767
1768         dst[0] += 16;
1769         dst[1] += 8;
1770         dst[2] += 8;
1771         s->mv_min.x -= 64;
1772         s->mv_max.x -= 64;
1773
1774         if (mb_x == s->mb_width+1) {
1775             update_pos(td, mb_y, s->mb_width+3);
1776         } else {
1777             update_pos(td, mb_y, mb_x);
1778         }
1779     }
1780 }
1781
1782 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1783                               int jobnr, int threadnr)
1784 {
1785     VP8Context *s = avctx->priv_data;
1786     VP8ThreadData *td = &s->thread_data[threadnr];
1787     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1788     AVFrame *curframe = s->curframe->tf.f;
1789     VP8Macroblock *mb;
1790     VP8ThreadData *prev_td, *next_td;
1791     uint8_t *dst[3] = {
1792         curframe->data[0] + 16*mb_y*s->linesize,
1793         curframe->data[1] +  8*mb_y*s->uvlinesize,
1794         curframe->data[2] +  8*mb_y*s->uvlinesize
1795     };
1796
1797     if (s->mb_layout == 1)
1798         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1799     else
1800         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1801
1802     if (mb_y == 0) prev_td = td;
1803     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1804     if (mb_y == s->mb_height-1) next_td = td;
1805     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1806
1807     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1808         VP8FilterStrength *f = &td->filter_strength[mb_x];
1809         if (prev_td != td) {
1810             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1811         }
1812         if (next_td != td)
1813             if (next_td != &s->thread_data[0]) {
1814                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1815             }
1816
1817         if (num_jobs == 1) {
1818             if (s->filter.simple)
1819                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1820             else
1821                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1822         }
1823
1824         if (s->filter.simple)
1825             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1826         else
1827             filter_mb(s, dst, f, mb_x, mb_y);
1828         dst[0] += 16;
1829         dst[1] += 8;
1830         dst[2] += 8;
1831
1832         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1833     }
1834 }
1835
1836 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1837                                     int jobnr, int threadnr)
1838 {
1839     VP8Context *s = avctx->priv_data;
1840     VP8ThreadData *td = &s->thread_data[jobnr];
1841     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1842     VP8Frame *curframe = s->curframe;
1843     int mb_y, num_jobs = s->num_jobs;
1844     td->thread_nr = threadnr;
1845     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1846         if (mb_y >= s->mb_height) break;
1847         td->thread_mb_pos = mb_y<<16;
1848         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1849         if (s->deblock_filter)
1850             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1851         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1852
1853         s->mv_min.y -= 64;
1854         s->mv_max.y -= 64;
1855
1856         if (avctx->active_thread_type == FF_THREAD_FRAME)
1857             ff_thread_report_progress(&curframe->tf, mb_y, 0);
1858     }
1859
1860     return 0;
1861 }
1862
1863 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1864                         AVPacket *avpkt)
1865 {
1866     VP8Context *s = avctx->priv_data;
1867     int ret, i, referenced, num_jobs;
1868     enum AVDiscard skip_thresh;
1869     VP8Frame *av_uninit(curframe), *prev_frame;
1870
1871     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1872         goto err;
1873
1874     prev_frame = s->framep[VP56_FRAME_CURRENT];
1875
1876     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1877                                 || s->update_altref == VP56_FRAME_CURRENT;
1878
1879     skip_thresh = !referenced ? AVDISCARD_NONREF :
1880                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1881
1882     if (avctx->skip_frame >= skip_thresh) {
1883         s->invisible = 1;
1884         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1885         goto skip_decode;
1886     }
1887     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1888
1889     // release no longer referenced frames
1890     for (i = 0; i < 5; i++)
1891         if (s->frames[i].tf.f->data[0] &&
1892             &s->frames[i] != prev_frame &&
1893             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1894             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1895             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1896             vp8_release_frame(s, &s->frames[i]);
1897
1898     // find a free buffer
1899     for (i = 0; i < 5; i++)
1900         if (&s->frames[i] != prev_frame &&
1901             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1902             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1903             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1904             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1905             break;
1906         }
1907     if (i == 5) {
1908         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1909         abort();
1910     }
1911     if (curframe->tf.f->data[0])
1912         vp8_release_frame(s, curframe);
1913
1914     // Given that arithmetic probabilities are updated every frame, it's quite likely
1915     // that the values we have on a random interframe are complete junk if we didn't
1916     // start decode on a keyframe. So just don't display anything rather than junk.
1917     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1918                          !s->framep[VP56_FRAME_GOLDEN] ||
1919                          !s->framep[VP56_FRAME_GOLDEN2])) {
1920         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1921         ret = AVERROR_INVALIDDATA;
1922         goto err;
1923     }
1924
1925     curframe->tf.f->key_frame = s->keyframe;
1926     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1927     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
1928         goto err;
1929
1930     // check if golden and altref are swapped
1931     if (s->update_altref != VP56_FRAME_NONE) {
1932         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1933     } else {
1934         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1935     }
1936     if (s->update_golden != VP56_FRAME_NONE) {
1937         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1938     } else {
1939         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1940     }
1941     if (s->update_last) {
1942         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1943     } else {
1944         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1945     }
1946     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1947
1948     ff_thread_finish_setup(avctx);
1949
1950     s->linesize   = curframe->tf.f->linesize[0];
1951     s->uvlinesize = curframe->tf.f->linesize[1];
1952
1953     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1954     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1955     if (!s->mb_layout)
1956         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1957     if (!s->mb_layout && s->keyframe)
1958         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1959
1960     // top edge of 127 for intra prediction
1961     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1962         s->top_border[0][15] = s->top_border[0][23] = 127;
1963         s->top_border[0][31] = 127;
1964         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1965     }
1966     memset(s->ref_count, 0, sizeof(s->ref_count));
1967
1968
1969     if (s->mb_layout == 1) {
1970         // Make sure the previous frame has read its segmentation map,
1971         // if we re-use the same map.
1972         if (prev_frame && s->segmentation.enabled &&
1973             !s->segmentation.update_map)
1974             ff_thread_await_progress(&prev_frame->tf, 1, 0);
1975         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1976     }
1977
1978     if (avctx->active_thread_type == FF_THREAD_FRAME)
1979         num_jobs = 1;
1980     else
1981         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1982     s->num_jobs   = num_jobs;
1983     s->curframe   = curframe;
1984     s->prev_frame = prev_frame;
1985     s->mv_min.y   = -MARGIN;
1986     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
1987     for (i = 0; i < MAX_THREADS; i++) {
1988         s->thread_data[i].thread_mb_pos = 0;
1989         s->thread_data[i].wait_mb_pos = INT_MAX;
1990     }
1991     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1992
1993     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
1994     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1995
1996 skip_decode:
1997     // if future frames don't use the updated probabilities,
1998     // reset them to the values we saved
1999     if (!s->update_probabilities)
2000         s->prob[0] = s->prob[1];
2001
2002     if (!s->invisible) {
2003         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2004             return ret;
2005         *got_frame      = 1;
2006     }
2007
2008     return avpkt->size;
2009 err:
2010     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2011     return ret;
2012 }
2013
2014 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2015 {
2016     VP8Context *s = avctx->priv_data;
2017     int i;
2018
2019     vp8_decode_flush_impl(avctx, 1);
2020     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2021         av_frame_free(&s->frames[i].tf.f);
2022
2023     return 0;
2024 }
2025
2026 static av_cold int vp8_init_frames(VP8Context *s)
2027 {
2028     int i;
2029     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2030         s->frames[i].tf.f = av_frame_alloc();
2031         if (!s->frames[i].tf.f)
2032             return AVERROR(ENOMEM);
2033     }
2034     return 0;
2035 }
2036
2037 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2038 {
2039     VP8Context *s = avctx->priv_data;
2040     int ret;
2041
2042     s->avctx = avctx;
2043     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2044     avctx->internal->allocate_progress = 1;
2045
2046     ff_videodsp_init(&s->vdsp, 8);
2047     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2048     ff_vp8dsp_init(&s->vp8dsp);
2049
2050     if ((ret = vp8_init_frames(s)) < 0) {
2051         ff_vp8_decode_free(avctx);
2052         return ret;
2053     }
2054
2055     return 0;
2056 }
2057
2058 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2059 {
2060     VP8Context *s = avctx->priv_data;
2061     int ret;
2062
2063     s->avctx = avctx;
2064
2065     if ((ret = vp8_init_frames(s)) < 0) {
2066         ff_vp8_decode_free(avctx);
2067         return ret;
2068     }
2069
2070     return 0;
2071 }
2072
2073 #define REBASE(pic) \
2074     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2075
2076 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2077 {
2078     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2079     int i;
2080
2081     if (s->macroblocks_base &&
2082         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2083         free_buffers(s);
2084         s->mb_width  = s_src->mb_width;
2085         s->mb_height = s_src->mb_height;
2086     }
2087
2088     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2089     s->segmentation = s_src->segmentation;
2090     s->lf_delta = s_src->lf_delta;
2091     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2092
2093     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2094         if (s_src->frames[i].tf.f->data[0]) {
2095             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2096             if (ret < 0)
2097                 return ret;
2098         }
2099     }
2100
2101     s->framep[0] = REBASE(s_src->next_framep[0]);
2102     s->framep[1] = REBASE(s_src->next_framep[1]);
2103     s->framep[2] = REBASE(s_src->next_framep[2]);
2104     s->framep[3] = REBASE(s_src->next_framep[3]);
2105
2106     return 0;
2107 }
2108
/**
 * Round a size up to the next even value (odd sizes gain one byte).
 * Arithmetic is unsigned, so UINT_MAX wraps to 0.
 */
static unsigned apply_padding(unsigned size)
{
    const unsigned odd_bit = size & 1u;

    return size + odd_bit;
}
2110
2111 static int webp_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
2112                              AVPacket *avpkt)
2113 {
2114     const uint8_t *buf = avpkt->data;
2115     int buf_size       = avpkt->size;
2116     AVPacket pkt       = *avpkt;
2117
2118     if (buf_size >= 16
2119         && AV_RL32(buf   ) == AV_RL32("RIFF")
2120         && AV_RL32(buf+ 8) == AV_RL32("WEBP")) {
2121         unsigned riff_size = apply_padding(AV_RL32(buf+4)) + 8;
2122         buf += 12;   // Skip over main header
2123         buf_size -= 12;
2124         if (buf_size < 8 || riff_size < 8) {
2125             av_log(avctx, AV_LOG_ERROR, "Incomplete header.\n");
2126             return AVERROR_INVALIDDATA;
2127         }
2128         if (AV_RL32(buf) == AV_RL32("VP8L")) {
2129             av_log(avctx, AV_LOG_ERROR, "Unsupported WebP lossless format.\n");
2130             return AVERROR_PATCHWELCOME;
2131         }
2132         if (AV_RL32(buf) == AV_RL32("VP8X") && AV_RL32(buf+4) < (unsigned)buf_size) {
2133             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2134             buf      += size;
2135             buf_size -= size;
2136         }
2137         if (buf_size >= 8
2138             && AV_RL32(buf) == AV_RL32("ALPH") && AV_RL32(buf+4) < (unsigned)buf_size) {
2139             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2140             buf      += size;
2141             buf_size -= size;
2142             av_log(avctx, AV_LOG_WARNING, "Skipping alpha plane\n");
2143         }
2144         if (buf_size >= 8 && AV_RL32(buf) == AV_RL32("VP8 ")) {
2145             buf      += 8;
2146             buf_size -= 8;
2147         }
2148     }
2149     pkt.data = buf;
2150     pkt.size = buf_size;
2151
2152     return ff_vp8_decode_frame(avctx, data, data_size, &pkt);
2153 }
2154
2155 AVCodec ff_vp8_decoder = {
2156     .name                  = "vp8",
2157     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2158     .type                  = AVMEDIA_TYPE_VIDEO,
2159     .id                    = AV_CODEC_ID_VP8,
2160     .priv_data_size        = sizeof(VP8Context),
2161     .init                  = ff_vp8_decode_init,
2162     .close                 = ff_vp8_decode_free,
2163     .decode                = ff_vp8_decode_frame,
2164     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2165     .flush                 = vp8_decode_flush,
2166     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2167     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2168 };
2169
// The WebP decoder entry below is currently disabled. It shares the VP8
// context and callbacks defined in this file and only substitutes
// webp_decode_frame() as the decode callback.
// AVCodec ff_webp_decoder = {
//     .name                  = "webp",
//     .long_name             = NULL_IF_CONFIG_SMALL("WebP"),
//     .type                  = AVMEDIA_TYPE_VIDEO,
//     .id                    = AV_CODEC_ID_WEBP,
//     .priv_data_size        = sizeof(VP8Context),
//     .init                  = ff_vp8_decode_init,
//     .close                 = ff_vp8_decode_free,
//     .decode                = webp_decode_frame,
//     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
//     .flush                 = vp8_decode_flush,
//     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
//     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
// };