git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "libavcore/imgutils.h"
  29 #include "internal.h"
  30 #include "dsputil.h"
  31 #include "avcodec.h"
  32 #include "mpegvideo.h"
  33 #include "h264.h"
  34 #include "h264data.h"
  35 #include "h264_mvpred.h"
  36 #include "h264_parser.h"
  37 #include "golomb.h"
  38 #include "mathops.h"
  39 #include "rectangle.h"
  40 #include "vdpau_internal.h"
  41 #include "libavutil/avassert.h"
  42
  43 #include "cabac.h"
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 static const uint8_t rem6[52]={
  49 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  50 };
  51
  52 static const uint8_t div6[52]={
  53 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  54 };
  55
  56 static const enum PixelFormat hwaccel_pixfmt_list_h264_jpeg_420[] = {
  57     PIX_FMT_DXVA2_VLD,
  58     PIX_FMT_VAAPI_VLD,
  59     PIX_FMT_YUVJ420P,
  60     PIX_FMT_NONE
  61 };
  62
  63 void ff_h264_write_back_intra_pred_mode(H264Context *h){
  64     int8_t *mode= h->intra4x4_pred_mode + h->mb2br_xy[h->mb_xy];
  65
  66     AV_COPY32(mode, h->intra4x4_pred_mode_cache + 4 + 8*4);
  67     mode[4]= h->intra4x4_pred_mode_cache[7+8*3];
  68     mode[5]= h->intra4x4_pred_mode_cache[7+8*2];
  69     mode[6]= h->intra4x4_pred_mode_cache[7+8*1];
  70 }
  71
  72 /**
  73  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
  74  */
  75 int ff_h264_check_intra4x4_pred_mode(H264Context *h){
  76     MpegEncContext * const s = &h->s;
  77     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
  78     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
  79     int i;
  80
  81     if(!(h->top_samples_available&0x8000)){
  82         for(i=0; i<4; i++){
  83             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
  84             if(status<0){
  85                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
  86                 return -1;
  87             } else if(status){
  88                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
  89             }
  90         }
  91     }
  92
  93     if((h->left_samples_available&0x8888)!=0x8888){
  94         static const int mask[4]={0x8000,0x2000,0x80,0x20};
  95         for(i=0; i<4; i++){
  96             if(!(h->left_samples_available&mask[i])){
  97                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
  98                 if(status<0){
  99                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 100                     return -1;
 101                 } else if(status){
 102                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 103                 }
 104             }
 105         }
 106     }
 107
 108     return 0;
 109 } //FIXME cleanup like ff_h264_check_intra_pred_mode
 110
 111 /**
 112  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 113  */
 114 int ff_h264_check_intra_pred_mode(H264Context *h, int mode){
 115     MpegEncContext * const s = &h->s;
 116     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 117     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 118
 119     if(mode > 6U) {
 120         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 121         return -1;
 122     }
 123
 124     if(!(h->top_samples_available&0x8000)){
 125         mode= top[ mode ];
 126         if(mode<0){
 127             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 128             return -1;
 129         }
 130     }
 131
 132     if((h->left_samples_available&0x8080) != 0x8080){
 133         mode= left[ mode ];
 134         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 135             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 136         }
 137         if(mode<0){
 138             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 139             return -1;
 140         }
 141     }
 142
 143     return mode;
 144 }
 145
     /**
      * Parse the one-byte NAL header and return the unit's payload with the
      * escape sequences 00 00 03 reduced to 00 00.
      *
      * nal_ref_idc and nal_unit_type are stored in h. The payload is scanned for
      * the first 00 00 0x (x <= 3) sequence: x != 3 is treated as the next start
      * code and cuts the unit short at that position. When no escape (x == 3) is
      * found, the payload is returned in place as a pointer into src. Otherwise
      * it is unescaped into h->rbsp_buffer[] (index 1 for NAL_DPC, index 0 for
      * everything else) and followed by FF_INPUT_BUFFER_PADDING_SIZE zero bytes.
      *
      * @param h          decoder context; receives the header fields and holds
      *                   the escape buffers
      * @param src        NAL unit, starting at its header byte
      * @param dst_length set to the length of the returned payload in bytes
      * @param consumed   set to the number of input bytes used, header included
      * @param length     number of bytes at src, header byte included
      * @return pointer to the payload, or NULL if the escape buffer could not be
      *         allocated (dst_length and consumed are not written in that case)
      */
 146 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
 147     int i, si, di;
 148     uint8_t *dst;
 149     int bufidx;
 150
 151 //    src[0]&0x80;                //forbidden bit
         /* header byte: upper three bits (src[0]>>5) and low five bits (src[0]&0x1F) */
 152     h->nal_ref_idc= src[0]>>5;
 153     h->nal_unit_type= src[0]&0x1F;
 154
         /* from here on src/length describe the payload only */
 155     src++; length--;
 156 #if 0
 157     for(i=0; i<length; i++)
 158         printf("%2X ", src[i]);
 159 #endif
 160
         /*
          * Find the first 00 00 0x (x <= 3) sequence. The three loop heads below
          * are build-time alternatives that share the loop tail after the last
          * #endif. An iteration that reaches the tail advances i by 2 net
          * (i+= in the loop head minus RS); the early `continue` skips the tail
          * and advances by the full step.
          * NOTE(review): the word-sized variants look like a has-zero-byte bit
          * trick used to skip ahead. AV_RN64A/AV_RN32A and while(src[i]) are not
          * bounded by length, so they presumably rely on padding after the
          * input -- confirm against the callers.
          */
 161 #if HAVE_FAST_UNALIGNED
 162 # if HAVE_FAST_64BIT
 163 #   define RS 7
 164     for(i=0; i+1<length; i+=9){
 165         if(!((~AV_RN64A(src+i) & (AV_RN64A(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
 166 # else
 167 #   define RS 3
 168     for(i=0; i+1<length; i+=5){
 169         if(!((~AV_RN32A(src+i) & (AV_RN32A(src+i) - 0x01000101U)) & 0x80008080U))
 170 # endif
 171             continue;
 172         if(i>0 && !src[i]) i--;
 173         while(src[i]) i++;
 174 #else
 175 #   define RS 0
 176     for(i=0; i+1<length; i+=2){
 177         if(src[i]) continue;
 178         if(i>0 && src[i-1]==0) i--;
 179 #endif
 180         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
 181             if(src[i+2]!=3){
 182                 /* startcode, so we must be past the end */
 183                 length=i;
 184             }
 185             break;
 186         }
 187         i-= RS;
 188     }
 189
         /* true after the scan ran to the end and after a start code set length=i;
          * false only when the loop stopped at an escape (00 00 03) */
 190     if(i>=length-1){ //no escaped 0
 191         *dst_length= length;
 192         *consumed= length+1; //+1 for the header
 193         return src;
 194     }
 195
 196     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
 197     av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
 198     dst= h->rbsp_buffer[bufidx];
 199
         /* allocation failed */
 200     if (dst == NULL){
 201         return NULL;
 202     }
 203
 204 //printf("decoding esc\n");
         /* the i bytes before the first escape are copied verbatim */
 205     memcpy(dst, src, i);
 206     si=di=i;
 207     while(si+2<length){
 208         //remove escapes (very rare 1:2^22)
             /* third byte > 3: no 00 00 0x can involve these bytes, so three bytes
              * are copied in total (two here, one at the end of the loop body) */
 209         if(src[si+2]>3){
 210             dst[di++]= src[si++];
 211             dst[di++]= src[si++];
 212         }else if(src[si]==0 && src[si+1]==0){
 213             if(src[si+2]==3){ //escape
 214                 dst[di++]= 0;
 215                 dst[di++]= 0;
 216                 si+=3;
 217                 continue;
 218             }else //next start code
 219                 goto nsc;
 220         }
 221
 222         dst[di++]= src[si++];
 223     }
         /* fewer than three bytes left: copy them unchanged */
 224     while(si<length)
 225         dst[di++]= src[si++];
         /* also reached directly when the next start code is found above */
 226 nsc:
 227
         /* zero the padding that follows the unescaped data */
 228     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
 229
 230     *dst_length= di;
 231     *consumed= si + 1;//+1 for the header
 232 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
 233     return dst;
 234 }
 235
 236 /**
 237  * Identify the exact end of the bitstream
 238  * @return the length of the trailing, or 0 if damaged
 239  */
 240 static int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
 241     int v= *src;
 242     int r;
 243
 244     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
 245
 246     for(r=1; r<9; r++){
 247         if(v&1) return r;
 248         v>>=1;
 249     }
 250     return 0;
 251 }
 252
     /*
      * Compiled out (#if 0). Two passes of four butterflies each over the DC
      * positions of `block`: the first pass walks y_offset[] and fills temp[],
      * the second walks x_offset[] and writes the results back halved (>>1).
      * NOTE(review): `stride` is used throughout but is not declared in this
      * function; it would have to be defined before this code is re-enabled.
      */
 253 #if 0
 254 /**
 255  * DCT transforms the 16 dc values.
 256  * @param qp quantization parameter ??? FIXME
 257  */
 258 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
 259 //    const int qmul= dequant_coeff[qp][0];
 260     int i;
 261     int temp[16]; //FIXME check if this is a good idea
 262     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
 263     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
 264
         /* first pass: one butterfly per y_offset entry, results kept in temp[] */
 265     for(i=0; i<4; i++){
 266         const int offset= y_offset[i];
 267         const int z0= block[offset+stride*0] + block[offset+stride*4];
 268         const int z1= block[offset+stride*0] - block[offset+stride*4];
 269         const int z2= block[offset+stride*1] - block[offset+stride*5];
 270         const int z3= block[offset+stride*1] + block[offset+stride*5];
 271
 272         temp[4*i+0]= z0+z3;
 273         temp[4*i+1]= z1+z2;
 274         temp[4*i+2]= z1-z2;
 275         temp[4*i+3]= z0-z3;
 276     }
 277
         /* second pass: across temp[] in the other direction, written back >>1 */
 278     for(i=0; i<4; i++){
 279         const int offset= x_offset[i];
 280         const int z0= temp[4*0+i] + temp[4*2+i];
 281         const int z1= temp[4*0+i] - temp[4*2+i];
 282         const int z2= temp[4*1+i] - temp[4*3+i];
 283         const int z3= temp[4*1+i] + temp[4*3+i];
 284
 285         block[stride*0 +offset]= (z0 + z3)>>1;
 286         block[stride*2 +offset]= (z1 + z2)>>1;
 287         block[stride*8 +offset]= (z1 - z2)>>1;
 288         block[stride*10+offset]= (z0 - z3)>>1;
 289     }
 290 }
 291 #endif
 292
 293 #undef xStride
 294 #undef stride
 295
 296 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){
 297     const int stride= 16*2;
 298     const int xStride= 16;
 299     int a,b,c,d,e;
 300
 301     a= block[stride*0 + xStride*0];
 302     b= block[stride*0 + xStride*1];
 303     c= block[stride*1 + xStride*0];
 304     d= block[stride*1 + xStride*1];
 305
 306     e= a-b;
 307     a= a+b;
 308     b= c-d;
 309     c= c+d;
 310
 311     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
 312     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
 313     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
 314     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
 315 }
 316
     /*
      * Compiled out (#if 0). In-place 2x2 butterfly over block[0], block[16],
      * block[32] and block[48]; the same butterfly as chroma_dc_dequant_idct_c()
      * but without the multiplication and shift.
      */
 317 #if 0
 318 static void chroma_dc_dct_c(DCTELEM *block){
 319     const int stride= 16*2;
 320     const int xStride= 16;
 321     int a,b,c,d,e;
 322
 323     a= block[stride*0 + xStride*0];
 324     b= block[stride*0 + xStride*1];
 325     c= block[stride*1 + xStride*0];
 326     d= block[stride*1 + xStride*1];
 327
         /* e/a: difference/sum of the first pair, b/c: difference/sum of the second */
 328     e= a-b;
 329     a= a+b;
 330     b= c-d;
 331     c= c+d;
 332
 333     block[stride*0 + xStride*0]= (a+c);
 334     block[stride*0 + xStride*1]= (e+b);
 335     block[stride*1 + xStride*0]= (a-c);
 336     block[stride*1 + xStride*1]= (e-b);
 337 }
 338 #endif
 339
 340 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
 341                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
 342                            int src_x_offset, int src_y_offset,
 343                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
 344     MpegEncContext * const s = &h->s;
 345     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
 346     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
 347     const int luma_xy= (mx&3) + ((my&3)<<2);
 348     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
 349     uint8_t * src_cb, * src_cr;
 350     int extra_width= h->emu_edge_width;
 351     int extra_height= h->emu_edge_height;
 352     int emu=0;
 353     const int full_mx= mx>>2;
 354     const int full_my= my>>2;
 355     const int pic_width  = 16*s->mb_width;
 356     const int pic_height = 16*s->mb_height >> MB_FIELD;
 357
 358     if(mx&7) extra_width -= 3;
 359     if(my&7) extra_height -= 3;
 360
 361     if(   full_mx < 0-extra_width
 362        || full_my < 0-extra_height
 363        || full_mx + 16/*FIXME*/ > pic_width + extra_width
 364        || full_my + 16/*FIXME*/ > pic_height + extra_height){
 365         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
 366             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
 367         emu=1;
 368     }
 369
 370     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
 371     if(!square){
 372         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
 373     }
 374
 375     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
 376
 377     if(MB_FIELD){
 378         // chroma offset when predicting from a field of opposite parity
 379         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
 380         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
 381     }
 382     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
 383     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
 384
 385     if(emu){
 386         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
 387             src_cb= s->edge_emu_buffer;
 388     }
 389     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
 390
 391     if(emu){
 392         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
 393             src_cr= s->edge_emu_buffer;
 394     }
 395     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
 396 }
 397
 398 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
 399                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
 400                            int x_offset, int y_offset,
 401                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
 402                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
 403                            int list0, int list1){
 404     MpegEncContext * const s = &h->s;
 405     qpel_mc_func *qpix_op=  qpix_put;
 406     h264_chroma_mc_func chroma_op= chroma_put;
 407
 408     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
 409     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
 410     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
 411     x_offset += 8*s->mb_x;
 412     y_offset += 8*(s->mb_y >> MB_FIELD);
 413
 414     if(list0){
 415         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
 416         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
 417                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
 418                            qpix_op, chroma_op);
 419
 420         qpix_op=  qpix_avg;
 421         chroma_op= chroma_avg;
 422     }
 423
 424     if(list1){
 425         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
 426         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
 427                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
 428                            qpix_op, chroma_op);
 429     }
 430 }
 431
 432 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
 433                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
 434                            int x_offset, int y_offset,
 435                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
 436                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
 437                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
 438                            int list0, int list1){
 439     MpegEncContext * const s = &h->s;
 440
 441     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
 442     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
 443     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
 444     x_offset += 8*s->mb_x;
 445     y_offset += 8*(s->mb_y >> MB_FIELD);
 446
 447     if(list0 && list1){
 448         /* don't optimize for luma-only case, since B-frames usually
 449          * use implicit weights => chroma too. */
 450         uint8_t *tmp_cb = s->obmc_scratchpad;
 451         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
 452         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
 453         int refn0 = h->ref_cache[0][ scan8[n] ];
 454         int refn1 = h->ref_cache[1][ scan8[n] ];
 455
 456         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
 457                     dest_y, dest_cb, dest_cr,
 458                     x_offset, y_offset, qpix_put, chroma_put);
 459         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
 460                     tmp_y, tmp_cb, tmp_cr,
 461                     x_offset, y_offset, qpix_put, chroma_put);
 462
 463         if(h->use_weight == 2){
 464             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
 465             int weight1 = 64 - weight0;
 466             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
 467             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
 468             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
 469         }else{
 470             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
 471                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
 472                             h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
 473             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
 474                             h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
 475                             h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
 476             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
 477                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
 478                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
 479         }
 480     }else{
 481         int list = list1 ? 1 : 0;
 482         int refn = h->ref_cache[list][ scan8[n] ];
 483         Picture *ref= &h->ref_list[list][refn];
 484         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
 485                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
 486                     qpix_put, chroma_put);
 487
 488         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
 489                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
 490         if(h->use_weight_chroma){
 491             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
 492                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
 493             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
 494                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
 495         }
 496     }
 497 }
 498
 499 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
 500                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
 501                            int x_offset, int y_offset,
 502                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
 503                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
 504                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
 505                            int list0, int list1){
 506     if((h->use_weight==2 && list0 && list1
 507         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
 508        || h->use_weight==1)
 509         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
 510                          x_offset, y_offset, qpix_put, chroma_put,
 511                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
 512     else
 513         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
 514                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
 515 }
 516
 517 static inline void prefetch_motion(H264Context *h, int list){
 518     /* fetch pixels for estimated mv 4 macroblocks ahead
 519      * optimized for 64byte cache lines */
 520     MpegEncContext * const s = &h->s;
 521     const int refn = h->ref_cache[list][scan8[0]];
 522     if(refn >= 0){
 523         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
 524         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
 525         uint8_t **src= h->ref_list[list][refn].data;
 526         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
 527         s->dsp.prefetch(src[0]+off, s->linesize, 4);
 528         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
 529         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
 530     }
 531 }
 532
     /**
      * Inter prediction for the current macroblock: dispatches on the
      * partitioning of its mb_type (and, for 8x8, of each h->sub_mb_type[i]) and
      * calls mc_part() once per partition.
      *
      * Each mc_part() call passes, in order: the block index n, the square flag,
      * the chroma height, the luma delta used for non-square partitions, the
      * destination planes, the x/y offset of the partition, the put/avg luma and
      * chroma functions for that partition size, the matching slice of the
      * weight tables, and two IS_DIR() flags telling whether list 0 and list 1
      * are used.
      *
      * prefetch_motion() is issued for list 0 before and for list 1 after the
      * motion compensation.
      *
      * @param dest_y,dest_cb,dest_cr destination planes of this macroblock
      * @param qpix_put,qpix_avg       luma function tables, indexed by size here
      * @param chroma_put,chroma_avg   chroma function tables, indexed by size here
      * @param weight_op,weight_avg    weight / biweight function tables
      */
 533 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
 534                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
 535                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
 536                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
 537     MpegEncContext * const s = &h->s;
 538     const int mb_xy= h->mb_xy;
 539     const int mb_type= s->current_picture.mb_type[mb_xy];
 540
 541     assert(IS_INTER(mb_type));
 542
 543     prefetch_motion(h, 0);
 544
         /* one partition covering the whole macroblock */
 545     if(IS_16X16(mb_type)){
 546         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
 547                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
 548                 weight_op, weight_avg,
 549                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
         /* two partitions stacked vertically (n = 0 and 8, y_offset 0 and 4) */
 550     }else if(IS_16X8(mb_type)){
 551         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
 552                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
 553                 &weight_op[1], &weight_avg[1],
 554                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
 555         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
 556                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
 557                 &weight_op[1], &weight_avg[1],
 558                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
         /* two partitions side by side (n = 0 and 4, x_offset 0 and 4) */
 559     }else if(IS_8X16(mb_type)){
 560         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
 561                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
 562                 &weight_op[2], &weight_avg[2],
 563                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
 564         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
 565                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
 566                 &weight_op[2], &weight_avg[2],
 567                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
 568     }else{
 569         int i;
 570
 571         assert(IS_8X8(mb_type));
 572
             /* four 8x8 quadrants, each with its own sub-partitioning */
 573         for(i=0; i<4; i++){
 574             const int sub_mb_type= h->sub_mb_type[i];
 575             const int n= 4*i;
 576             int x_offset= (i&1)<<2;
 577             int y_offset= (i&2)<<1;
 578
 579             if(IS_SUB_8X8(sub_mb_type)){
 580                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
 581                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
 582                     &weight_op[3], &weight_avg[3],
 583                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 584             }else if(IS_SUB_8X4(sub_mb_type)){
 585                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
 586                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
 587                     &weight_op[4], &weight_avg[4],
 588                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 589                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
 590                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
 591                     &weight_op[4], &weight_avg[4],
 592                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 593             }else if(IS_SUB_4X8(sub_mb_type)){
 594                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
 595                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
 596                     &weight_op[5], &weight_avg[5],
 597                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 598                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
 599                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
 600                     &weight_op[5], &weight_avg[5],
 601                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 602             }else{
 603                 int j;
 604                 assert(IS_SUB_4X4(sub_mb_type));
                     /* four 4x4 blocks inside the quadrant */
 605                 for(j=0; j<4; j++){
 606                     int sub_x_offset= x_offset + 2*(j&1);
 607                     int sub_y_offset= y_offset +   (j&2);
 608                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
 609                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
 610                         &weight_op[6], &weight_avg[6],
 611                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
 612                 }
 613             }
 614         }
 615     }
 616
 617     prefetch_motion(h, 1);
 618 }
 619
 620
 621 static void free_tables(H264Context *h, int free_rbsp){
 622     int i;
 623     H264Context *hx;
 624     av_freep(&h->intra4x4_pred_mode);
 625     av_freep(&h->chroma_pred_mode_table);
 626     av_freep(&h->cbp_table);
 627     av_freep(&h->mvd_table[0]);
 628     av_freep(&h->mvd_table[1]);
 629     av_freep(&h->direct_table);
 630     av_freep(&h->non_zero_count);
 631     av_freep(&h->slice_table_base);
 632     h->slice_table= NULL;
 633     av_freep(&h->list_counts);
 634
 635     av_freep(&h->mb2b_xy);
 636     av_freep(&h->mb2br_xy);
 637
 638     for(i = 0; i < MAX_THREADS; i++) {
 639         hx = h->thread_context[i];
 640         if(!hx) continue;
 641         av_freep(&hx->top_borders[1]);
 642         av_freep(&hx->top_borders[0]);
 643         av_freep(&hx->s.obmc_scratchpad);
 644         if (free_rbsp){
 645             av_freep(&hx->rbsp_buffer[1]);
 646             av_freep(&hx->rbsp_buffer[0]);
 647             hx->rbsp_buffer_size[0] = 0;
 648             hx->rbsp_buffer_size[1] = 0;
 649         }
 650         if (i) av_freep(&h->thread_context[i]);
 651     }
 652 }
 653
 654 static void init_dequant8_coeff_table(H264Context *h){
 655     int i,q,x;
 656     h->dequant8_coeff[0] = h->dequant8_buffer[0];
 657     h->dequant8_coeff[1] = h->dequant8_buffer[1];
 658
 659     for(i=0; i<2; i++ ){
 660         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
 661             h->dequant8_coeff[1] = h->dequant8_buffer[0];
 662             break;
 663         }
 664
 665         for(q=0; q<52; q++){
 666             int shift = div6[q];
 667             int idx = rem6[q];
 668             for(x=0; x<64; x++)
 669                 h->dequant8_coeff[i][q][(x>>3)|((x&7)<<3)] =
 670                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
 671                     h->pps.scaling_matrix8[i][x]) << shift;
 672         }
 673     }
 674 }
 675
 676 static void init_dequant4_coeff_table(H264Context *h){
 677     int i,j,q,x;
 678     for(i=0; i<6; i++ ){
 679         h->dequant4_coeff[i] = h->dequant4_buffer[i];
 680         for(j=0; j<i; j++){
 681             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
 682                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
 683                 break;
 684             }
 685         }
 686         if(j<i)
 687             continue;
 688
 689         for(q=0; q<52; q++){
 690             int shift = div6[q] + 2;
 691             int idx = rem6[q];
 692             for(x=0; x<16; x++)
 693                 h->dequant4_coeff[i][q][(x>>2)|((x<<2)&0xF)] =
 694                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
 695                     h->pps.scaling_matrix4[i][x]) << shift;
 696         }
 697     }
 698 }
 699
 700 static void init_dequant_tables(H264Context *h){
 701     int i,x;
 702     init_dequant4_coeff_table(h);
 703     if(h->pps.transform_8x8_mode)
 704         init_dequant8_coeff_table(h);
 705     if(h->sps.transform_bypass){
 706         for(i=0; i<6; i++)
 707             for(x=0; x<16; x++)
 708                 h->dequant4_coeff[i][0][x] = 1<<6;
 709         if(h->pps.transform_8x8_mode)
 710             for(i=0; i<2; i++)
 711                 for(x=0; x<64; x++)
 712                     h->dequant8_coeff[i][0][x] = 1<<6;
 713     }
 714 }
 715
 716
 717 int ff_h264_alloc_tables(H264Context *h){
 718     MpegEncContext * const s = &h->s;
 719     const int big_mb_num= s->mb_stride * (s->mb_height+1);
 720     const int row_mb_num= 2*s->mb_stride*s->avctx->thread_count;
 721     int x,y;
 722
 723     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, row_mb_num * 8  * sizeof(uint8_t), fail)
 724
 725     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 32 * sizeof(uint8_t), fail)
 726     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
 727     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
 728
 729     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
 730     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 16*row_mb_num * sizeof(uint8_t), fail);
 731     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 16*row_mb_num * sizeof(uint8_t), fail);
 732     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 4*big_mb_num * sizeof(uint8_t) , fail);
 733     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)
 734
 735     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
 736     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
 737
 738     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy  , big_mb_num * sizeof(uint32_t), fail);
 739     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2br_xy , big_mb_num * sizeof(uint32_t), fail);
 740     for(y=0; y<s->mb_height; y++){
 741         for(x=0; x<s->mb_width; x++){
 742             const int mb_xy= x + y*s->mb_stride;
 743             const int b_xy = 4*x + 4*y*h->b_stride;
 744
 745             h->mb2b_xy [mb_xy]= b_xy;
 746             h->mb2br_xy[mb_xy]= 8*(FMO ? mb_xy : (mb_xy % (2*s->mb_stride)));
 747         }
 748     }
 749
 750     s->obmc_scratchpad = NULL;
 751
 752     if(!h->dequant4_coeff[0])
 753         init_dequant_tables(h);
 754
 755     return 0;
 756 fail:
 757     free_tables(h, 1);
 758     return -1;
 759 }
 760
 761 /**
 762  * Mimic alloc_tables(), but for every context thread.
 763  */
 764 static void clone_tables(H264Context *dst, H264Context *src, int i){
 765     MpegEncContext * const s = &src->s;
 766     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode + i*8*2*s->mb_stride;
 767     dst->non_zero_count           = src->non_zero_count;
 768     dst->slice_table              = src->slice_table;
 769     dst->cbp_table                = src->cbp_table;
 770     dst->mb2b_xy                  = src->mb2b_xy;
 771     dst->mb2br_xy                 = src->mb2br_xy;
 772     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
 773     dst->mvd_table[0]             = src->mvd_table[0] + i*8*2*s->mb_stride;
 774     dst->mvd_table[1]             = src->mvd_table[1] + i*8*2*s->mb_stride;
 775     dst->direct_table             = src->direct_table;
 776     dst->list_counts              = src->list_counts;
 777
 778     dst->s.obmc_scratchpad = NULL;
 779     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
 780 }
 781
 782 /**
 783  * Init context
 784  * Allocate buffers which are not shared amongst multiple threads.
 785  */
 786 static int context_init(H264Context *h){
 787     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
 788     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
 789
 790     h->ref_cache[0][scan8[5 ]+1] = h->ref_cache[0][scan8[7 ]+1] = h->ref_cache[0][scan8[13]+1] =
 791     h->ref_cache[1][scan8[5 ]+1] = h->ref_cache[1][scan8[7 ]+1] = h->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
 792
 793     return 0;
 794 fail:
 795     return -1; // free_tables will clean up for us
 796 }
 797
 798 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size);
 799
 800 static av_cold void common_init(H264Context *h){
 801     MpegEncContext * const s = &h->s;
 802
 803     s->width = s->avctx->width;
 804     s->height = s->avctx->height;
 805     s->codec_id= s->avctx->codec->id;
 806
 807     ff_h264dsp_init(&h->h264dsp);
 808     ff_h264_pred_init(&h->hpc, s->codec_id);
 809
 810     h->dequant_coeff_pps= -1;
 811     s->unrestricted_mv=1;
 812     s->decode=1; //FIXME
 813
 814     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
 815
 816     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
 817     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
 818 }
 819
 820 int ff_h264_decode_extradata(H264Context *h)
 821 {
 822     AVCodecContext *avctx = h->s.avctx;
 823
 824     if(*(char *)avctx->extradata == 1){
 825         int i, cnt, nalsize;
 826         unsigned char *p = avctx->extradata;
 827
 828         h->is_avc = 1;
 829
 830         if(avctx->extradata_size < 7) {
 831             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
 832             return -1;
 833         }
 834         /* sps and pps in the avcC always have length coded with 2 bytes,
 835            so put a fake nal_length_size = 2 while parsing them */
 836         h->nal_length_size = 2;
 837         // Decode sps from avcC
 838         cnt = *(p+5) & 0x1f; // Number of sps
 839         p += 6;
 840         for (i = 0; i < cnt; i++) {
 841             nalsize = AV_RB16(p) + 2;
 842             if(decode_nal_units(h, p, nalsize) < 0) {
 843                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
 844                 return -1;
 845             }
 846             p += nalsize;
 847         }
 848         // Decode pps from avcC
 849         cnt = *(p++); // Number of pps
 850         for (i = 0; i < cnt; i++) {
 851             nalsize = AV_RB16(p) + 2;
 852             if(decode_nal_units(h, p, nalsize)  != nalsize) {
 853                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
 854                 return -1;
 855             }
 856             p += nalsize;
 857         }
 858         // Now store right nal length size, that will be use to parse all other nals
 859         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
 860     } else {
 861         h->is_avc = 0;
 862         if(decode_nal_units(h, avctx->extradata, avctx->extradata_size) < 0)
 863             return -1;
 864     }
 865     return 0;
 866 }
 867
 868 av_cold int ff_h264_decode_init(AVCodecContext *avctx){
 869     H264Context *h= avctx->priv_data;
 870     MpegEncContext * const s = &h->s;
 871
 872     MPV_decode_defaults(s);
 873
 874     s->avctx = avctx;
 875     common_init(h);
 876
 877     s->out_format = FMT_H264;
 878     s->workaround_bugs= avctx->workaround_bugs;
 879
 880     // set defaults
 881 //    s->decode_mb= ff_h263_decode_mb;
 882     s->quarter_sample = 1;
 883     if(!avctx->has_b_frames)
 884     s->low_delay= 1;
 885
 886     avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
 887
 888     ff_h264_decode_init_vlc();
 889
 890     h->thread_context[0] = h;
 891     h->outputed_poc = INT_MIN;
 892     h->prev_poc_msb= 1<<16;
 893     h->x264_build = -1;
 894     ff_h264_reset_sei(h);
 895     if(avctx->codec_id == CODEC_ID_H264){
 896         if(avctx->ticks_per_frame == 1){
 897             s->avctx->time_base.den *=2;
 898         }
 899         avctx->ticks_per_frame = 2;
 900     }
 901
 902     if(avctx->extradata_size > 0 && avctx->extradata &&
 903         ff_h264_decode_extradata(h))
 904         return -1;
 905
 906     if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames < h->sps.num_reorder_frames){
 907         s->avctx->has_b_frames = h->sps.num_reorder_frames;
 908         s->low_delay = 0;
 909     }
 910
 911     return 0;
 912 }
 913
 914 int ff_h264_frame_start(H264Context *h){
 915     MpegEncContext * const s = &h->s;
 916     int i;
 917
 918     if(MPV_frame_start(s, s->avctx) < 0)
 919         return -1;
 920     ff_er_frame_start(s);
 921     /*
 922      * MPV_frame_start uses pict_type to derive key_frame.
 923      * This is incorrect for H.264; IDR markings must be used.
 924      * Zero here; IDR markings per slice in frame or fields are ORed in later.
 925      * See decode_nal_units().
 926      */
 927     s->current_picture_ptr->key_frame= 0;
 928     s->current_picture_ptr->mmco_reset= 0;
 929
 930     assert(s->linesize && s->uvlinesize);
 931
 932     for(i=0; i<16; i++){
 933         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
 934         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
 935     }
 936     for(i=0; i<4; i++){
 937         h->block_offset[16+i]=
 938         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
 939         h->block_offset[24+16+i]=
 940         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
 941     }
 942
 943     /* can't be in alloc_tables because linesize isn't known there.
 944      * FIXME: redo bipred weight to not require extra buffer? */
 945     for(i = 0; i < s->avctx->thread_count; i++)
 946         if(h->thread_context[i] && !h->thread_context[i]->s.obmc_scratchpad)
 947             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
 948
 949     /* some macroblocks can be accessed before they're available in case of lost slices, mbaff or threading*/
 950     memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
 951
 952 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
 953
 954     // We mark the current picture as non-reference after allocating it, so
 955     // that if we break out due to an error it can be released automatically
 956     // in the next MPV_frame_start().
 957     // SVQ3 as well as most other codecs have only last/next/current and thus
 958     // get released even with set reference, besides SVQ3 and others do not
 959     // mark frames as reference later "naturally".
 960     if(s->codec_id != CODEC_ID_SVQ3)
 961         s->current_picture_ptr->reference= 0;
 962
 963     s->current_picture_ptr->field_poc[0]=
 964     s->current_picture_ptr->field_poc[1]= INT_MAX;
 965     assert(s->current_picture_ptr->long_ref==0);
 966
 967     return 0;
 968 }
 969
 970 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
 971     MpegEncContext * const s = &h->s;
 972     uint8_t *top_border;
 973     int top_idx = 1;
 974
 975     src_y  -=   linesize;
 976     src_cb -= uvlinesize;
 977     src_cr -= uvlinesize;
 978
 979     if(!simple && FRAME_MBAFF){
 980         if(s->mb_y&1){
 981             if(!MB_MBAFF){
 982                 top_border = h->top_borders[0][s->mb_x];
 983                 AV_COPY128(top_border, src_y + 15*linesize);
 984                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
 985                     AV_COPY64(top_border+16, src_cb+7*uvlinesize);
 986                     AV_COPY64(top_border+24, src_cr+7*uvlinesize);
 987                 }
 988             }
 989         }else if(MB_MBAFF){
 990             top_idx = 0;
 991         }else
 992             return;
 993     }
 994
 995     top_border = h->top_borders[top_idx][s->mb_x];
 996     // There are two lines saved, the line above the the top macroblock of a pair,
 997     // and the line above the bottom macroblock
 998     AV_COPY128(top_border, src_y + 16*linesize);
 999
1000     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1001         AV_COPY64(top_border+16, src_cb+8*uvlinesize);
1002         AV_COPY64(top_border+24, src_cr+8*uvlinesize);
1003     }
1004 }
1005
1006 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
1007     MpegEncContext * const s = &h->s;
1008     int deblock_left;
1009     int deblock_top;
1010     int top_idx = 1;
1011     uint8_t *top_border_m1;
1012     uint8_t *top_border;
1013
1014     if(!simple && FRAME_MBAFF){
1015         if(s->mb_y&1){
1016             if(!MB_MBAFF)
1017                 return;
1018         }else{
1019             top_idx = MB_MBAFF ? 0 : 1;
1020         }
1021     }
1022
1023     if(h->deblocking_filter == 2) {
1024         deblock_left = h->left_type[0];
1025         deblock_top  = h->top_type;
1026     } else {
1027         deblock_left = (s->mb_x > 0);
1028         deblock_top =  (s->mb_y > !!MB_FIELD);
1029     }
1030
1031     src_y  -=   linesize + 1;
1032     src_cb -= uvlinesize + 1;
1033     src_cr -= uvlinesize + 1;
1034
1035     top_border_m1 = h->top_borders[top_idx][s->mb_x-1];
1036     top_border    = h->top_borders[top_idx][s->mb_x];
1037
1038 #define XCHG(a,b,xchg)\
1039 if (xchg) AV_SWAP64(b,a);\
1040 else      AV_COPY64(b,a);
1041
1042     if(deblock_top){
1043         if(deblock_left){
1044             XCHG(top_border_m1+8, src_y -7, 1);
1045         }
1046         XCHG(top_border+0, src_y +1, xchg);
1047         XCHG(top_border+8, src_y +9, 1);
1048         if(s->mb_x+1 < s->mb_width){
1049             XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +17, 1);
1050         }
1051     }
1052
1053     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1054         if(deblock_top){
1055             if(deblock_left){
1056                 XCHG(top_border_m1+16, src_cb -7, 1);
1057                 XCHG(top_border_m1+24, src_cr -7, 1);
1058             }
1059             XCHG(top_border+16, src_cb+1, 1);
1060             XCHG(top_border+24, src_cr+1, 1);
1061         }
1062     }
1063 }
1064
1065 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
1066     MpegEncContext * const s = &h->s;
1067     const int mb_x= s->mb_x;
1068     const int mb_y= s->mb_y;
1069     const int mb_xy= h->mb_xy;
1070     const int mb_type= s->current_picture.mb_type[mb_xy];
1071     uint8_t  *dest_y, *dest_cb, *dest_cr;
1072     int linesize, uvlinesize /*dct_offset*/;
1073     int i;
1074     int *block_offset = &h->block_offset[0];
1075     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
1076     /* is_h264 should always be true if SVQ3 is disabled. */
1077     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
1078     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
1079     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
1080
1081     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
1082     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
1083     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
1084
1085     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1086     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
1087
1088     h->list_counts[mb_xy]= h->list_count;
1089
1090     if (!simple && MB_FIELD) {
1091         linesize   = h->mb_linesize   = s->linesize * 2;
1092         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
1093         block_offset = &h->block_offset[24];
1094         if(mb_y&1){ //FIXME move out of this function?
1095             dest_y -= s->linesize*15;
1096             dest_cb-= s->uvlinesize*7;
1097             dest_cr-= s->uvlinesize*7;
1098         }
1099         if(FRAME_MBAFF) {
1100             int list;
1101             for(list=0; list<h->list_count; list++){
1102                 if(!USES_LIST(mb_type, list))
1103                     continue;
1104                 if(IS_16X16(mb_type)){
1105                     int8_t *ref = &h->ref_cache[list][scan8[0]];
1106                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
1107                 }else{
1108                     for(i=0; i<16; i+=4){
1109                         int ref = h->ref_cache[list][scan8[i]];
1110                         if(ref >= 0)
1111                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
1112                     }
1113                 }
1114             }
1115         }
1116     } else {
1117         linesize   = h->mb_linesize   = s->linesize;
1118         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
1119 //        dct_offset = s->linesize * 16;
1120     }
1121
1122     if (!simple && IS_INTRA_PCM(mb_type)) {
1123         for (i=0; i<16; i++) {
1124             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
1125         }
1126         for (i=0; i<8; i++) {
1127             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
1128             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
1129         }
1130     } else {
1131         if(IS_INTRA(mb_type)){
1132             if(h->deblocking_filter)
1133                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
1134
1135             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1136                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
1137                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
1138             }
1139
1140             if(IS_INTRA4x4(mb_type)){
1141                 if(simple || !s->encoding){
1142                     if(IS_8x8DCT(mb_type)){
1143                         if(transform_bypass){
1144                             idct_dc_add =
1145                             idct_add    = s->dsp.add_pixels8;
1146                         }else{
1147                             idct_dc_add = h->h264dsp.h264_idct8_dc_add;
1148                             idct_add    = h->h264dsp.h264_idct8_add;
1149                         }
1150                         for(i=0; i<16; i+=4){
1151                             uint8_t * const ptr= dest_y + block_offset[i];
1152                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1153                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1154                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
1155                             }else{
1156                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
1157                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
1158                                                             (h->topright_samples_available<<i)&0x4000, linesize);
1159                                 if(nnz){
1160                                     if(nnz == 1 && h->mb[i*16])
1161                                         idct_dc_add(ptr, h->mb + i*16, linesize);
1162                                     else
1163                                         idct_add   (ptr, h->mb + i*16, linesize);
1164                                 }
1165                             }
1166                         }
1167                     }else{
1168                         if(transform_bypass){
1169                             idct_dc_add =
1170                             idct_add    = s->dsp.add_pixels4;
1171                         }else{
1172                             idct_dc_add = h->h264dsp.h264_idct_dc_add;
1173                             idct_add    = h->h264dsp.h264_idct_add;
1174                         }
1175                         for(i=0; i<16; i++){
1176                             uint8_t * const ptr= dest_y + block_offset[i];
1177                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1178
1179                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1180                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
1181                             }else{
1182                                 uint8_t *topright;
1183                                 int nnz, tr;
1184                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
1185                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
1186                                     assert(mb_y || linesize <= block_offset[i]);
1187                                     if(!topright_avail){
1188                                         tr= ptr[3 - linesize]*0x01010101;
1189                                         topright= (uint8_t*) &tr;
1190                                     }else
1191                                         topright= ptr + 4 - linesize;
1192                                 }else
1193                                     topright= NULL;
1194
1195                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
1196                                 nnz = h->non_zero_count_cache[ scan8[i] ];
1197                                 if(nnz){
1198                                     if(is_h264){
1199                                         if(nnz == 1 && h->mb[i*16])
1200                                             idct_dc_add(ptr, h->mb + i*16, linesize);
1201                                         else
1202                                             idct_add   (ptr, h->mb + i*16, linesize);
1203                                     }else
1204                                         ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
1205                                 }
1206                             }
1207                         }
1208                     }
1209                 }
1210             }else{
1211                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
1212                 if(is_h264){
1213                     if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX] ]){
1214                         if(!transform_bypass)
1215                             h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
1216                         else{
1217                             static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
1218                                                                     8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
1219                             for(i = 0; i < 16; i++)
1220                                 h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
1221                         }
1222                     }
1223                 }else
1224                     ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
1225             }
1226             if(h->deblocking_filter)
1227                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
1228         }else if(is_h264){
1229             hl_motion(h, dest_y, dest_cb, dest_cr,
1230                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
1231                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
1232                       h->h264dsp.weight_h264_pixels_tab, h->h264dsp.biweight_h264_pixels_tab);
1233         }
1234
1235
1236         if(!IS_INTRA4x4(mb_type)){
1237             if(is_h264){
1238                 if(IS_INTRA16x16(mb_type)){
1239                     if(transform_bypass){
1240                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
1241                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
1242                         }else{
1243                             for(i=0; i<16; i++){
1244                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
1245                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
1246                             }
1247                         }
1248                     }else{
1249                          h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1250                     }
1251                 }else if(h->cbp&15){
1252                     if(transform_bypass){
1253                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
1254                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
1255                         for(i=0; i<16; i+=di){
1256                             if(h->non_zero_count_cache[ scan8[i] ]){
1257                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
1258                             }
1259                         }
1260                     }else{
1261                         if(IS_8x8DCT(mb_type)){
1262                             h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1263                         }else{
1264                             h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1265                         }
1266                     }
1267                 }
1268             }else{
1269                 for(i=0; i<16; i++){
1270                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
1271                         uint8_t * const ptr= dest_y + block_offset[i];
1272                         ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
1273                     }
1274                 }
1275             }
1276         }
1277
1278         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
1279             uint8_t *dest[2] = {dest_cb, dest_cr};
1280             if(transform_bypass){
1281                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
1282                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
1283                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
1284                 }else{
1285                     idct_add = s->dsp.add_pixels4;
1286                     for(i=16; i<16+8; i++){
1287                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
1288                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
1289                     }
1290                 }
1291             }else{
1292                 if(is_h264){
1293                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
1294                         chroma_dc_dequant_idct_c(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1295                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
1296                         chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1297                     h->h264dsp.h264_idct_add8(dest, block_offset,
1298                                               h->mb, uvlinesize,
1299                                               h->non_zero_count_cache);
1300                 }else{
1301                     chroma_dc_dequant_idct_c(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1302                     chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1303                     for(i=16; i<16+8; i++){
1304                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
1305                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
1306                             ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[s->qscale + 12] - 12, 2);
1307                         }
1308                     }
1309                 }
1310             }
1311         }
1312     }
1313     if(h->cbp || IS_INTRA(mb_type))
1314         s->dsp.clear_blocks(h->mb);
1315 }
1316
1317 /**
1318  * Process a macroblock; this case avoids checks for expensive uncommon cases.
1319  */
1320 static void hl_decode_mb_simple(H264Context *h){
1321     hl_decode_mb_internal(h, 1);
1322 }
1323
1324 /**
1325  * Process a macroblock; this handles edge cases, such as interlacing.
1326  */
1327 static void av_noinline hl_decode_mb_complex(H264Context *h){
1328     hl_decode_mb_internal(h, 0);
1329 }
1330
1331 void ff_h264_hl_decode_mb(H264Context *h){
1332     MpegEncContext * const s = &h->s;
1333     const int mb_xy= h->mb_xy;
1334     const int mb_type= s->current_picture.mb_type[mb_xy];
1335     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
1336
1337     if (is_complex)
1338         hl_decode_mb_complex(h);
1339     else hl_decode_mb_simple(h);
1340 }
1341
1342 static int pred_weight_table(H264Context *h){
1343     MpegEncContext * const s = &h->s;
1344     int list, i;
1345     int luma_def, chroma_def;
1346
1347     h->use_weight= 0;
1348     h->use_weight_chroma= 0;
1349     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
1350     if(CHROMA)
1351         h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
1352     luma_def = 1<<h->luma_log2_weight_denom;
1353     chroma_def = 1<<h->chroma_log2_weight_denom;
1354
1355     for(list=0; list<2; list++){
1356         h->luma_weight_flag[list]   = 0;
1357         h->chroma_weight_flag[list] = 0;
1358         for(i=0; i<h->ref_count[list]; i++){
1359             int luma_weight_flag, chroma_weight_flag;
1360
1361             luma_weight_flag= get_bits1(&s->gb);
1362             if(luma_weight_flag){
1363                 h->luma_weight[i][list][0]= get_se_golomb(&s->gb);
1364                 h->luma_weight[i][list][1]= get_se_golomb(&s->gb);
1365                 if(   h->luma_weight[i][list][0] != luma_def
1366                    || h->luma_weight[i][list][1] != 0) {
1367                     h->use_weight= 1;
1368                     h->luma_weight_flag[list]= 1;
1369                 }
1370             }else{
1371                 h->luma_weight[i][list][0]= luma_def;
1372                 h->luma_weight[i][list][1]= 0;
1373             }
1374
1375             if(CHROMA){
1376                 chroma_weight_flag= get_bits1(&s->gb);
1377                 if(chroma_weight_flag){
1378                     int j;
1379                     for(j=0; j<2; j++){
1380                         h->chroma_weight[i][list][j][0]= get_se_golomb(&s->gb);
1381                         h->chroma_weight[i][list][j][1]= get_se_golomb(&s->gb);
1382                         if(   h->chroma_weight[i][list][j][0] != chroma_def
1383                            || h->chroma_weight[i][list][j][1] != 0) {
1384                             h->use_weight_chroma= 1;
1385                             h->chroma_weight_flag[list]= 1;
1386                         }
1387                     }
1388                 }else{
1389                     int j;
1390                     for(j=0; j<2; j++){
1391                         h->chroma_weight[i][list][j][0]= chroma_def;
1392                         h->chroma_weight[i][list][j][1]= 0;
1393                     }
1394                 }
1395             }
1396         }
1397         if(h->slice_type_nos != FF_B_TYPE) break;
1398     }
1399     h->use_weight= h->use_weight || h->use_weight_chroma;
1400     return 0;
1401 }
1402
1403 /**
1404  * Initialize implicit_weight table.
1405  * @param field  0/1 initialize the weight for interlaced MBAFF
1406  *                -1 initializes the rest
1407  */
1408 static void implicit_weight_table(H264Context *h, int field){
1409     MpegEncContext * const s = &h->s;
1410     int ref0, ref1, i, cur_poc, ref_start, ref_count0, ref_count1;
1411
1412     for (i = 0; i < 2; i++) {
1413         h->luma_weight_flag[i]   = 0;
1414         h->chroma_weight_flag[i] = 0;
1415     }
1416
1417     if(field < 0){
1418         cur_poc = s->current_picture_ptr->poc;
1419     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1 && !FRAME_MBAFF
1420        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
1421         h->use_weight= 0;
1422         h->use_weight_chroma= 0;
1423         return;
1424     }
1425         ref_start= 0;
1426         ref_count0= h->ref_count[0];
1427         ref_count1= h->ref_count[1];
1428     }else{
1429         cur_poc = s->current_picture_ptr->field_poc[field];
1430         ref_start= 16;
1431         ref_count0= 16+2*h->ref_count[0];
1432         ref_count1= 16+2*h->ref_count[1];
1433     }
1434
1435     h->use_weight= 2;
1436     h->use_weight_chroma= 2;
1437     h->luma_log2_weight_denom= 5;
1438     h->chroma_log2_weight_denom= 5;
1439
1440     for(ref0=ref_start; ref0 < ref_count0; ref0++){
1441         int poc0 = h->ref_list[0][ref0].poc;
1442         for(ref1=ref_start; ref1 < ref_count1; ref1++){
1443             int poc1 = h->ref_list[1][ref1].poc;
1444             int td = av_clip(poc1 - poc0, -128, 127);
1445             int w= 32;
1446             if(td){
1447                 int tb = av_clip(cur_poc - poc0, -128, 127);
1448                 int tx = (16384 + (FFABS(td) >> 1)) / td;
1449                 int dist_scale_factor = (tb*tx + 32) >> 8;
1450                 if(dist_scale_factor >= -64 && dist_scale_factor <= 128)
1451                     w = 64 - dist_scale_factor;
1452             }
1453             if(field<0){
1454                 h->implicit_weight[ref0][ref1][0]=
1455                 h->implicit_weight[ref0][ref1][1]= w;
1456             }else{
1457                 h->implicit_weight[ref0][ref1][field]=w;
1458             }
1459         }
1460     }
1461 }
1462
1463 /**
1464  * instantaneous decoder refresh.
1465  */
1466 static void idr(H264Context *h){
1467     ff_h264_remove_all_refs(h);
1468     h->prev_frame_num= 0;
1469     h->prev_frame_num_offset= 0;
1470     h->prev_poc_msb=
1471     h->prev_poc_lsb= 0;
1472 }
1473
1474 /* forget old pics after a seek */
1475 static void flush_dpb(AVCodecContext *avctx){
1476     H264Context *h= avctx->priv_data;
1477     int i;
1478     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
1479         if(h->delayed_pic[i])
1480             h->delayed_pic[i]->reference= 0;
1481         h->delayed_pic[i]= NULL;
1482     }
1483     h->outputed_poc= INT_MIN;
1484     h->prev_interlaced_frame = 1;
1485     idr(h);
1486     if(h->s.current_picture_ptr)
1487         h->s.current_picture_ptr->reference= 0;
1488     h->s.first_field= 0;
1489     ff_h264_reset_sei(h);
1490     ff_mpeg_flush(avctx);
1491 }
1492
1493 static int init_poc(H264Context *h){
1494     MpegEncContext * const s = &h->s;
1495     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
1496     int field_poc[2];
1497     Picture *cur = s->current_picture_ptr;
1498
1499     h->frame_num_offset= h->prev_frame_num_offset;
1500     if(h->frame_num < h->prev_frame_num)
1501         h->frame_num_offset += max_frame_num;
1502
1503     if(h->sps.poc_type==0){
1504         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
1505
1506         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
1507             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
1508         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
1509             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
1510         else
1511             h->poc_msb = h->prev_poc_msb;
1512 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
1513         field_poc[0] =
1514         field_poc[1] = h->poc_msb + h->poc_lsb;
1515         if(s->picture_structure == PICT_FRAME)
1516             field_poc[1] += h->delta_poc_bottom;
1517     }else if(h->sps.poc_type==1){
1518         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
1519         int i;
1520
1521         if(h->sps.poc_cycle_length != 0)
1522             abs_frame_num = h->frame_num_offset + h->frame_num;
1523         else
1524             abs_frame_num = 0;
1525
1526         if(h->nal_ref_idc==0 && abs_frame_num > 0)
1527             abs_frame_num--;
1528
1529         expected_delta_per_poc_cycle = 0;
1530         for(i=0; i < h->sps.poc_cycle_length; i++)
1531             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
1532
1533         if(abs_frame_num > 0){
1534             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
1535             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
1536
1537             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
1538             for(i = 0; i <= frame_num_in_poc_cycle; i++)
1539                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
1540         } else
1541             expectedpoc = 0;
1542
1543         if(h->nal_ref_idc == 0)
1544             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
1545
1546         field_poc[0] = expectedpoc + h->delta_poc[0];
1547         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
1548
1549         if(s->picture_structure == PICT_FRAME)
1550             field_poc[1] += h->delta_poc[1];
1551     }else{
1552         int poc= 2*(h->frame_num_offset + h->frame_num);
1553
1554         if(!h->nal_ref_idc)
1555             poc--;
1556
1557         field_poc[0]= poc;
1558         field_poc[1]= poc;
1559     }
1560
1561     if(s->picture_structure != PICT_BOTTOM_FIELD)
1562         s->current_picture_ptr->field_poc[0]= field_poc[0];
1563     if(s->picture_structure != PICT_TOP_FIELD)
1564         s->current_picture_ptr->field_poc[1]= field_poc[1];
1565     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
1566
1567     return 0;
1568 }
1569
1570
1571 /**
1572  * initialize scan tables
1573  */
1574 static void init_scan_tables(H264Context *h){
1575     int i;
1576     for(i=0; i<16; i++){
1577 #define T(x) (x>>2) | ((x<<2) & 0xF)
1578         h->zigzag_scan[i] = T(zigzag_scan[i]);
1579         h-> field_scan[i] = T( field_scan[i]);
1580 #undef T
1581     }
1582     for(i=0; i<64; i++){
1583 #define T(x) (x>>3) | ((x&7)<<3)
1584         h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
1585         h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
1586         h->field_scan8x8[i]        = T(field_scan8x8[i]);
1587         h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
1588 #undef T
1589     }
1590     if(h->sps.transform_bypass){ //FIXME same ugly
1591         h->zigzag_scan_q0          = zigzag_scan;
1592         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
1593         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
1594         h->field_scan_q0           = field_scan;
1595         h->field_scan8x8_q0        = field_scan8x8;
1596         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
1597     }else{
1598         h->zigzag_scan_q0          = h->zigzag_scan;
1599         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
1600         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
1601         h->field_scan_q0           = h->field_scan;
1602         h->field_scan8x8_q0        = h->field_scan8x8;
1603         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
1604     }
1605 }
1606
1607 static void field_end(H264Context *h){
1608     MpegEncContext * const s = &h->s;
1609     AVCodecContext * const avctx= s->avctx;
1610     s->mb_y= 0;
1611
1612     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
1613     s->current_picture_ptr->pict_type= s->pict_type;
1614
1615     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
1616         ff_vdpau_h264_set_reference_frames(s);
1617
1618     if(!s->dropable) {
1619         ff_h264_execute_ref_pic_marking(h, h->mmco, h->mmco_index);
1620         h->prev_poc_msb= h->poc_msb;
1621         h->prev_poc_lsb= h->poc_lsb;
1622     }
1623     h->prev_frame_num_offset= h->frame_num_offset;
1624     h->prev_frame_num= h->frame_num;
1625
1626     if (avctx->hwaccel) {
1627         if (avctx->hwaccel->end_frame(avctx) < 0)
1628             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
1629     }
1630
1631     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
1632         ff_vdpau_h264_picture_complete(s);
1633
1634     /*
1635      * FIXME: Error handling code does not seem to support interlaced
1636      * when slices span multiple rows
1637      * The ff_er_add_slice calls don't work right for bottom
1638      * fields; they cause massive erroneous error concealing
1639      * Error marking covers both fields (top and bottom).
1640      * This causes a mismatched s->error_count
1641      * and a bad error table. Further, the error count goes to
1642      * INT_MAX when called for bottom field, because mb_y is
1643      * past end by one (callers fault) and resync_mb_y != 0
1644      * causes problems for the first MB line, too.
1645      */
1646     if (!FIELD_PICTURE)
1647         ff_er_frame_end(s);
1648
1649     MPV_frame_end(s);
1650
1651     h->current_slice=0;
1652 }
1653
1654 /**
1655  * Replicate H264 "master" context to thread contexts.
1656  */
1657 static void clone_slice(H264Context *dst, H264Context *src)
1658 {
1659     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
1660     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
1661     dst->s.current_picture      = src->s.current_picture;
1662     dst->s.linesize             = src->s.linesize;
1663     dst->s.uvlinesize           = src->s.uvlinesize;
1664     dst->s.first_field          = src->s.first_field;
1665
1666     dst->prev_poc_msb           = src->prev_poc_msb;
1667     dst->prev_poc_lsb           = src->prev_poc_lsb;
1668     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
1669     dst->prev_frame_num         = src->prev_frame_num;
1670     dst->short_ref_count        = src->short_ref_count;
1671
1672     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
1673     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
1674     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
1675     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
1676
1677     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
1678     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
1679 }
1680
1681 /**
1682  * decodes a slice header.
1683  * This will also call MPV_common_init() and frame_start() as needed.
1684  *
1685  * @param h h264context
1686  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
1687  *
1688  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
1689  */
1690 static int decode_slice_header(H264Context *h, H264Context *h0){
1691     MpegEncContext * const s = &h->s;
1692     MpegEncContext * const s0 = &h0->s;
1693     unsigned int first_mb_in_slice;
1694     unsigned int pps_id;
1695     int num_ref_idx_active_override_flag;
1696     unsigned int slice_type, tmp, i, j;
1697     int default_ref_list_done = 0;
1698     int last_pic_structure;
1699
1700     s->dropable= h->nal_ref_idc == 0;
1701
1702     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
1703         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
1704         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
1705     }else{
1706         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
1707         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
1708     }
1709
1710     first_mb_in_slice= get_ue_golomb(&s->gb);
1711
1712     if(first_mb_in_slice == 0){ //FIXME better field boundary detection
1713         if(h0->current_slice && FIELD_PICTURE){
1714             field_end(h);
1715         }
1716
1717         h0->current_slice = 0;
1718         if (!s0->first_field)
1719             s->current_picture_ptr= NULL;
1720     }
1721
1722     slice_type= get_ue_golomb_31(&s->gb);
1723     if(slice_type > 9){
1724         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
1725         return -1;
1726     }
1727     if(slice_type > 4){
1728         slice_type -= 5;
1729         h->slice_type_fixed=1;
1730     }else
1731         h->slice_type_fixed=0;
1732
1733     slice_type= golomb_to_pict_type[ slice_type ];
1734     if (slice_type == FF_I_TYPE
1735         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
1736         default_ref_list_done = 1;
1737     }
1738     h->slice_type= slice_type;
1739     h->slice_type_nos= slice_type & 3;
1740
1741     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
1742
1743     pps_id= get_ue_golomb(&s->gb);
1744     if(pps_id>=MAX_PPS_COUNT){
1745         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
1746         return -1;
1747     }
1748     if(!h0->pps_buffers[pps_id]) {
1749         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
1750         return -1;
1751     }
1752     h->pps= *h0->pps_buffers[pps_id];
1753
1754     if(!h0->sps_buffers[h->pps.sps_id]) {
1755         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
1756         return -1;
1757     }
1758     h->sps = *h0->sps_buffers[h->pps.sps_id];
1759
1760     s->avctx->profile = h->sps.profile_idc;
1761     s->avctx->level   = h->sps.level_idc;
1762     s->avctx->refs    = h->sps.ref_frame_count;
1763
1764     if(h == h0 && h->dequant_coeff_pps != pps_id){
1765         h->dequant_coeff_pps = pps_id;
1766         init_dequant_tables(h);
1767     }
1768
1769     s->mb_width= h->sps.mb_width;
1770     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
1771
1772     h->b_stride=  s->mb_width*4;
1773
1774     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
1775     if(h->sps.frame_mbs_only_flag)
1776         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
1777     else
1778         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 7);
1779
1780     if (s->context_initialized
1781         && (   s->width != s->avctx->width || s->height != s->avctx->height
1782             || av_cmp_q(h->sps.sar, s->avctx->sample_aspect_ratio))) {
1783         if(h != h0)
1784             return -1;   // width / height changed during parallelized decoding
1785         free_tables(h, 0);
1786         flush_dpb(s->avctx);
1787         MPV_common_end(s);
1788     }
1789     if (!s->context_initialized) {
1790         if(h != h0)
1791             return -1;  // we cant (re-)initialize context during parallel decoding
1792
1793         avcodec_set_dimensions(s->avctx, s->width, s->height);
1794         s->avctx->sample_aspect_ratio= h->sps.sar;
1795         av_assert0(s->avctx->sample_aspect_ratio.den);
1796
1797         if(h->sps.video_signal_type_present_flag){
1798             s->avctx->color_range = h->sps.full_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
1799             if(h->sps.colour_description_present_flag){
1800                 s->avctx->color_primaries = h->sps.color_primaries;
1801                 s->avctx->color_trc       = h->sps.color_trc;
1802                 s->avctx->colorspace      = h->sps.colorspace;
1803             }
1804         }
1805
1806         if(h->sps.timing_info_present_flag){
1807             int64_t den= h->sps.time_scale;
1808             if(h->x264_build < 44U)
1809                 den *= 2;
1810             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
1811                       h->sps.num_units_in_tick, den, 1<<30);
1812         }
1813         s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
1814                                                  s->avctx->codec->pix_fmts ?
1815                                                  s->avctx->codec->pix_fmts :
1816                                                  s->avctx->color_range == AVCOL_RANGE_JPEG ?
1817                                                  hwaccel_pixfmt_list_h264_jpeg_420 :
1818                                                  ff_hwaccel_pixfmt_list_420);
1819         s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
1820
1821         if (MPV_common_init(s) < 0)
1822             return -1;
1823         s->first_field = 0;
1824         h->prev_interlaced_frame = 1;
1825
1826         init_scan_tables(h);
1827         ff_h264_alloc_tables(h);
1828
1829         for(i = 1; i < s->avctx->thread_count; i++) {
1830             H264Context *c;
1831             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
1832             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
1833             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
1834             c->h264dsp = h->h264dsp;
1835             c->sps = h->sps;
1836             c->pps = h->pps;
1837             init_scan_tables(c);
1838             clone_tables(c, h, i);
1839         }
1840
1841         for(i = 0; i < s->avctx->thread_count; i++)
1842             if(context_init(h->thread_context[i]) < 0)
1843                 return -1;
1844     }
1845
1846     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
1847
1848     h->mb_mbaff = 0;
1849     h->mb_aff_frame = 0;
1850     last_pic_structure = s0->picture_structure;
1851     if(h->sps.frame_mbs_only_flag){
1852         s->picture_structure= PICT_FRAME;
1853     }else{
1854         if(get_bits1(&s->gb)) { //field_pic_flag
1855             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
1856         } else {
1857             s->picture_structure= PICT_FRAME;
1858             h->mb_aff_frame = h->sps.mb_aff;
1859         }
1860     }
1861     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
1862
1863     if(h0->current_slice == 0){
1864         while(h->frame_num !=  h->prev_frame_num &&
1865               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
1866             Picture *prev = h->short_ref_count ? h->short_ref[0] : NULL;
1867             av_log(h->s.avctx, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
1868             if (ff_h264_frame_start(h) < 0)
1869                 return -1;
1870             h->prev_frame_num++;
1871             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
1872             s->current_picture_ptr->frame_num= h->prev_frame_num;
1873             ff_generate_sliding_window_mmcos(h);
1874             ff_h264_execute_ref_pic_marking(h, h->mmco, h->mmco_index);
1875             /* Error concealment: if a ref is missing, copy the previous ref in its place.
1876              * FIXME: avoiding a memcpy would be nice, but ref handling makes many assumptions
1877              * about there being no actual duplicates.
1878              * FIXME: this doesn't copy padding for out-of-frame motion vectors.  Given we're
1879              * concealing a lost frame, this probably isn't noticable by comparison, but it should
1880              * be fixed. */
1881             if (h->short_ref_count) {
1882                 if (prev) {
1883                     av_image_copy(h->short_ref[0]->data, h->short_ref[0]->linesize,
1884                                   (const uint8_t**)prev->data, prev->linesize,
1885                                   s->avctx->pix_fmt, s->mb_width*16, s->mb_height*16);
1886                     h->short_ref[0]->poc = prev->poc+2;
1887                 }
1888                 h->short_ref[0]->frame_num = h->prev_frame_num;
1889             }
1890         }
1891
1892         /* See if we have a decoded first field looking for a pair... */
1893         if (s0->first_field) {
1894             assert(s0->current_picture_ptr);
1895             assert(s0->current_picture_ptr->data[0]);
1896             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
1897
1898             /* figure out if we have a complementary field pair */
1899             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
1900                 /*
1901                  * Previous field is unmatched. Don't display it, but let it
1902                  * remain for reference if marked as such.
1903                  */
1904                 s0->current_picture_ptr = NULL;
1905                 s0->first_field = FIELD_PICTURE;
1906
1907             } else {
1908                 if (h->nal_ref_idc &&
1909                         s0->current_picture_ptr->reference &&
1910                         s0->current_picture_ptr->frame_num != h->frame_num) {
1911                     /*
1912                      * This and previous field were reference, but had
1913                      * different frame_nums. Consider this field first in
1914                      * pair. Throw away previous field except for reference
1915                      * purposes.
1916                      */
1917                     s0->first_field = 1;
1918                     s0->current_picture_ptr = NULL;
1919
1920                 } else {
1921                     /* Second field in complementary pair */
1922                     s0->first_field = 0;
1923                 }
1924             }
1925
1926         } else {
1927             /* Frame or first field in a potentially complementary pair */
1928             assert(!s0->current_picture_ptr);
1929             s0->first_field = FIELD_PICTURE;
1930         }
1931
1932         if((!FIELD_PICTURE || s0->first_field) && ff_h264_frame_start(h) < 0) {
1933             s0->first_field = 0;
1934             return -1;
1935         }
1936     }
1937     if(h != h0)
1938         clone_slice(h, h0);
1939
1940     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
1941
1942     assert(s->mb_num == s->mb_width * s->mb_height);
1943     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
1944        first_mb_in_slice                    >= s->mb_num){
1945         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
1946         return -1;
1947     }
1948     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
1949     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
1950     if (s->picture_structure == PICT_BOTTOM_FIELD)
1951         s->resync_mb_y = s->mb_y = s->mb_y + 1;
1952     assert(s->mb_y < s->mb_height);
1953
1954     if(s->picture_structure==PICT_FRAME){
1955         h->curr_pic_num=   h->frame_num;
1956         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
1957     }else{
1958         h->curr_pic_num= 2*h->frame_num + 1;
1959         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
1960     }
1961
1962     if(h->nal_unit_type == NAL_IDR_SLICE){
1963         get_ue_golomb(&s->gb); /* idr_pic_id */
1964     }
1965
1966     if(h->sps.poc_type==0){
1967         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
1968
1969         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
1970             h->delta_poc_bottom= get_se_golomb(&s->gb);
1971         }
1972     }
1973
1974     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
1975         h->delta_poc[0]= get_se_golomb(&s->gb);
1976
1977         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
1978             h->delta_poc[1]= get_se_golomb(&s->gb);
1979     }
1980
1981     init_poc(h);
1982
1983     if(h->pps.redundant_pic_cnt_present){
1984         h->redundant_pic_count= get_ue_golomb(&s->gb);
1985     }
1986
1987     //set defaults, might be overridden a few lines later
1988     h->ref_count[0]= h->pps.ref_count[0];
1989     h->ref_count[1]= h->pps.ref_count[1];
1990
1991     if(h->slice_type_nos != FF_I_TYPE){
1992         if(h->slice_type_nos == FF_B_TYPE){
1993             h->direct_spatial_mv_pred= get_bits1(&s->gb);
1994         }
1995         num_ref_idx_active_override_flag= get_bits1(&s->gb);
1996
1997         if(num_ref_idx_active_override_flag){
1998             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
1999             if(h->slice_type_nos==FF_B_TYPE)
2000                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
2001
2002             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
2003                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
2004                 h->ref_count[0]= h->ref_count[1]= 1;
2005                 return -1;
2006             }
2007         }
2008         if(h->slice_type_nos == FF_B_TYPE)
2009             h->list_count= 2;
2010         else
2011             h->list_count= 1;
2012     }else
2013         h->list_count= 0;
2014
2015     if(!default_ref_list_done){
2016         ff_h264_fill_default_ref_list(h);
2017     }
2018
2019     if(h->slice_type_nos!=FF_I_TYPE && ff_h264_decode_ref_pic_list_reordering(h) < 0)
2020         return -1;
2021
2022     if(h->slice_type_nos!=FF_I_TYPE){
2023         s->last_picture_ptr= &h->ref_list[0][0];
2024         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
2025     }
2026     if(h->slice_type_nos==FF_B_TYPE){
2027         s->next_picture_ptr= &h->ref_list[1][0];
2028         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
2029     }
2030
2031     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
2032        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
2033         pred_weight_table(h);
2034     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE){
2035         implicit_weight_table(h, -1);
2036     }else {
2037         h->use_weight = 0;
2038         for (i = 0; i < 2; i++) {
2039             h->luma_weight_flag[i]   = 0;
2040             h->chroma_weight_flag[i] = 0;
2041         }
2042     }
2043
2044     if(h->nal_ref_idc)
2045         ff_h264_decode_ref_pic_marking(h0, &s->gb);
2046
2047     if(FRAME_MBAFF){
2048         ff_h264_fill_mbaff_ref_list(h);
2049
2050         if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE){
2051             implicit_weight_table(h, 0);
2052             implicit_weight_table(h, 1);
2053         }
2054     }
2055
2056     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
2057         ff_h264_direct_dist_scale_factor(h);
2058     ff_h264_direct_ref_list_init(h);
2059
2060     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
2061         tmp = get_ue_golomb_31(&s->gb);
2062         if(tmp > 2){
2063             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
2064             return -1;
2065         }
2066         h->cabac_init_idc= tmp;
2067     }
2068
2069     h->last_qscale_diff = 0;
2070     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
2071     if(tmp>51){
2072         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
2073         return -1;
2074     }
2075     s->qscale= tmp;
2076     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
2077     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
2078     //FIXME qscale / qp ... stuff
2079     if(h->slice_type == FF_SP_TYPE){
2080         get_bits1(&s->gb); /* sp_for_switch_flag */
2081     }
2082     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
2083         get_se_golomb(&s->gb); /* slice_qs_delta */
2084     }
2085
2086     h->deblocking_filter = 1;
2087     h->slice_alpha_c0_offset = 52;
2088     h->slice_beta_offset = 52;
2089     if( h->pps.deblocking_filter_parameters_present ) {
2090         tmp= get_ue_golomb_31(&s->gb);
2091         if(tmp > 2){
2092             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
2093             return -1;
2094         }
2095         h->deblocking_filter= tmp;
2096         if(h->deblocking_filter < 2)
2097             h->deblocking_filter^= 1; // 1<->0
2098
2099         if( h->deblocking_filter ) {
2100             h->slice_alpha_c0_offset += get_se_golomb(&s->gb) << 1;
2101             h->slice_beta_offset     += get_se_golomb(&s->gb) << 1;
2102             if(   h->slice_alpha_c0_offset > 104U
2103                || h->slice_beta_offset     > 104U){
2104                 av_log(s->avctx, AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", h->slice_alpha_c0_offset, h->slice_beta_offset);
2105                 return -1;
2106             }
2107         }
2108     }
2109
2110     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
2111        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
2112        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
2113        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
2114         h->deblocking_filter= 0;
2115
2116     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
2117         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
2118             /* Cheat slightly for speed:
2119                Do not bother to deblock across slices. */
2120             h->deblocking_filter = 2;
2121         } else {
2122             h0->max_contexts = 1;
2123             if(!h0->single_decode_warning) {
2124                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
2125                 h0->single_decode_warning = 1;
2126             }
2127             if(h != h0)
2128                 return 1; // deblocking switched inside frame
2129         }
2130     }
2131     h->qp_thresh= 15 + 52 - FFMIN(h->slice_alpha_c0_offset, h->slice_beta_offset) - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
2132
2133 #if 0 //FMO
2134     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
2135         slice_group_change_cycle= get_bits(&s->gb, ?);
2136 #endif
2137
2138     h0->last_slice_type = slice_type;
2139     h->slice_num = ++h0->current_slice;
2140     if(h->slice_num >= MAX_SLICES){
2141         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
2142     }
2143
2144     for(j=0; j<2; j++){
2145         int id_list[16];
2146         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
2147         for(i=0; i<16; i++){
2148             id_list[i]= 60;
2149             if(h->ref_list[j][i].data[0]){
2150                 int k;
2151                 uint8_t *base= h->ref_list[j][i].base[0];
2152                 for(k=0; k<h->short_ref_count; k++)
2153                     if(h->short_ref[k]->base[0] == base){
2154                         id_list[i]= k;
2155                         break;
2156                     }
2157                 for(k=0; k<h->long_ref_count; k++)
2158                     if(h->long_ref[k] && h->long_ref[k]->base[0] == base){
2159                         id_list[i]= h->short_ref_count + k;
2160                         break;
2161                     }
2162             }
2163         }
2164
2165         ref2frm[0]=
2166         ref2frm[1]= -1;
2167         for(i=0; i<16; i++)
2168             ref2frm[i+2]= 4*id_list[i]
2169                           +(h->ref_list[j][i].reference&3);
2170         ref2frm[18+0]=
2171         ref2frm[18+1]= -1;
2172         for(i=16; i<48; i++)
2173             ref2frm[i+4]= 4*id_list[(i-16)>>1]
2174                           +(h->ref_list[j][i].reference&3);
2175     }
2176
2177     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
2178     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
2179
2180     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
2181         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
2182                h->slice_num,
2183                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
2184                first_mb_in_slice,
2185                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
2186                pps_id, h->frame_num,
2187                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
2188                h->ref_count[0], h->ref_count[1],
2189                s->qscale,
2190                h->deblocking_filter, h->slice_alpha_c0_offset/2-26, h->slice_beta_offset/2-26,
2191                h->use_weight,
2192                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
2193                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
2194                );
2195     }
2196
2197     return 0;
2198 }
2199
2200 int ff_h264_get_slice_type(const H264Context *h)
2201 {
2202     switch (h->slice_type) {
2203     case FF_P_TYPE:  return 0;
2204     case FF_B_TYPE:  return 1;
2205     case FF_I_TYPE:  return 2;
2206     case FF_SP_TYPE: return 3;
2207     case FF_SI_TYPE: return 4;
2208     default:         return -1;
2209     }
2210 }
2211
2212 /**
2213  * Fill the per-macroblock caches read by the deblocking filter for the
      * macroblock at h->mb_xy.
      *
      * The function locates the top and left neighbours (with the MBAFF
      * adjustments), stores their positions and types in the context, and,
      * for non-intra macroblocks, loads the non-zero-count, motion vector and
      * reference caches of the current macroblock and of its top/left edge.
      *
      * @param h       decoder context; h->mb_xy, h->slice_num, h->list_count and
      *                s->mb_x / s->mb_y must already describe the macroblock
      * @param mb_type type of the macroblock being filtered
2214  * @return non zero if the loop filter can be skipped
2215  */
2216 static int fill_filter_caches(H264Context *h, int mb_type){
2217     MpegEncContext * const s = &h->s;
2218     const int mb_xy= h->mb_xy;
2219     int top_xy, left_xy[2];
2220     int top_type, left_type[2];
2221
         /* Macroblock above: one stride up, shifted left by MB_FIELD (i.e. two
          * strides up when MB_FIELD evaluates to 1). */
2222     top_xy     = mb_xy  - (s->mb_stride << MB_FIELD);
2223
2224     //FIXME deblocking could skip the intra and nnz parts.
2225
2226     /* Wow, what a mess, why didn't they simplify the interlacing & intra
2227      * stuff, I can't imagine that these complex rules are worth it. */
2228
         /* By default both left neighbour slots refer to the macroblock directly
          * to the left. */
2229     left_xy[1] = left_xy[0] = mb_xy-1;
2230     if(FRAME_MBAFF){
             /* In MBAFF frames the neighbour positions depend on whether the
              * current and the left macroblock share the same field/frame coding. */
2231         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]);
2232         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
2233         if(s->mb_y&1){
                 /* Odd row: a differently coded left neighbour is taken one row up. */
2234             if (left_mb_field_flag != curr_mb_field_flag) {
2235                 left_xy[0] -= s->mb_stride;
2236             }
2237         }else{
2238             if(curr_mb_field_flag){
                     /* ((type>>7)&1)-1 is an all-ones mask when bit 7 of the top
                      * macroblock type is clear, so top_xy moves down one row in
                      * that case only.
                      * NOTE(review): bit 7 is presumably the interlaced flag - confirm. */
2239                 top_xy      += s->mb_stride & (((s->current_picture.mb_type[top_xy    ]>>7)&1)-1);
2240             }
                 /* Even row: a differently coded left neighbour gives a second left
                  * position one row down. */
2241             if (left_mb_field_flag != curr_mb_field_flag) {
2242                 left_xy[1] += s->mb_stride;
2243             }
2244         }
2245     }
2246
         /* Publish the neighbour positions in the context. */
2247     h->top_mb_xy = top_xy;
2248     h->left_mb_xy[0] = left_xy[0];
2249     h->left_mb_xy[1] = left_xy[1];
2250     {
2251         //for sufficiently low qp, filtering wouldn't do anything
2252         //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
2253         int qp_thresh = h->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
2254         int qp = s->current_picture.qscale_table[mb_xy];
             /* Skip when the current qp and the rounded average with each
              * existing neighbour (left, top) are all at or below the threshold. */
2255         if(qp <= qp_thresh
2256            && (left_xy[0]<0 || ((qp + s->current_picture.qscale_table[left_xy[0]] + 1)>>1) <= qp_thresh)
2257            && (top_xy   < 0 || ((qp + s->current_picture.qscale_table[top_xy    ] + 1)>>1) <= qp_thresh)){
2258             if(!FRAME_MBAFF)
2259                 return 1;
                 /* MBAFF: also test the second left position and the macroblock one
                  * row above top_xy before declaring the filter skippable. */
2260             if(   (left_xy[0]< 0            || ((qp + s->current_picture.qscale_table[left_xy[1]             ] + 1)>>1) <= qp_thresh)
2261                && (top_xy    < s->mb_stride || ((qp + s->current_picture.qscale_table[top_xy    -s->mb_stride] + 1)>>1) <= qp_thresh))
2262                 return 1;
2263         }
2264     }
2265
         /* Neighbour types; a type of 0 marks the neighbour as not usable. */
2266     top_type     = s->current_picture.mb_type[top_xy]    ;
2267     left_type[0] = s->current_picture.mb_type[left_xy[0]];
2268     left_type[1] = s->current_picture.mb_type[left_xy[1]];
2269     if(h->deblocking_filter == 2){
             /* Mode 2: neighbours belonging to another slice are ignored. */
2270         if(h->slice_table[top_xy     ] != h->slice_num) top_type= 0;
2271         if(h->slice_table[left_xy[0] ] != h->slice_num) left_type[0]= left_type[1]= 0;
2272     }else{
             /* Otherwise only neighbours whose slice_table entry is 0xFFFF are
              * ignored. */
2273         if(h->slice_table[top_xy     ] == 0xFFFF) top_type= 0;
2274         if(h->slice_table[left_xy[0] ] == 0xFFFF) left_type[0]= left_type[1] =0;
2275     }
2276     h->top_type    = top_type    ;
2277     h->left_type[0]= left_type[0];
2278     h->left_type[1]= left_type[1];
2279
         /* Intra macroblocks return here: none of the caches below are filled
          * for them. */
2280     if(IS_INTRA(mb_type))
2281         return 0;
2282
         /* Copy the stored non-zero counts of the current macroblock into the
          * 8-wide cache layout. */
2283     AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]);
2284     AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]);
2285     AV_COPY32(&h->non_zero_count_cache[0+8*5], &h->non_zero_count[mb_xy][16]);
2286     AV_COPY32(&h->non_zero_count_cache[4+8*3], &h->non_zero_count[mb_xy][20]);
2287     AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]);
2288
2289     h->cbp= h->cbp_table[mb_xy];
2290
2291     {
2292         int list;
             /* Load motion vectors and reference ids of the current macroblock for
              * every reference list in use. */
2293         for(list=0; list<h->list_count; list++){
2294             int8_t *ref;
2295             int y, b_stride;
2296             int16_t (*mv_dst)[2];
2297             int16_t (*mv_src)[2];
2298
2299             if(!USES_LIST(mb_type, list)){
                     /* List unused by this macroblock: zero vectors and mark all
                      * four reference rows as LIST_NOT_USED. */
2300                 fill_rectangle(  h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
2301                 AV_WN32A(&h->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
2302                 AV_WN32A(&h->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
2303                 AV_WN32A(&h->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
2304                 AV_WN32A(&h->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
2305                 continue;
2306             }
2307
2308             ref = &s->current_picture.ref_index[list][4*mb_xy];
2309             {
                     /* Map the stored reference indices through the ref2frm table of
                      * the current slice; each of the two values per row is
                      * replicated into two adjacent cache bytes by the *0x0101.
                      * NOTE(review): the +2 / +20 offset presumably selects the frame
                      * or MBAFF part of the table - confirm where ref2frm is built. */
2310                 int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
2311                 AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
2312                 AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
2313                 ref += 2;
2314                 AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
2315                 AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
2316             }
2317
                 /* Copy the four rows of motion vectors of this macroblock. */
2318             b_stride = h->b_stride;
2319             mv_dst   = &h->mv_cache[list][scan8[0]];
2320             mv_src   = &s->current_picture.motion_val[list][4*s->mb_x + 4*s->mb_y*b_stride];
2321             for(y=0; y<4; y++){
2322                 AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
2323             }
2324
2325         }
2326     }
2327
2328
2329 /*
2330 0 . T T. T T T T
2331 1 L . .L . . . .
2332 2 L . .L . . . .
2333 3 . T TL . . . .
2334 4 L . .L . . . .
2335 5 L . .. . . . .
2336 */
2337 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         /* Top edge: the four counts read from offset 4+3*8 of the top
          * macroblock. */
2338     if(top_type){
2339         AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
2340     }
2341
         /* Left edge: entry 7 of each of the four rows of the left macroblock. */
2342     if(left_type[0]){
2343         h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][7+0*8];
2344         h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][7+1*8];
2345         h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][7+2*8];
2346         h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][7+3*8];
2347     }
2348
2349     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
2350     if(!CABAC && h->pps.transform_8x8_mode){
             /* For 8x8-transform macroblocks the cached counts are replaced by the
              * matching coded-block-pattern bits (only zero/non-zero is kept). */
2351         if(IS_8x8DCT(top_type)){
2352             h->non_zero_count_cache[4+8*0]=
2353             h->non_zero_count_cache[5+8*0]= h->cbp_table[top_xy] & 4;
2354             h->non_zero_count_cache[6+8*0]=
2355             h->non_zero_count_cache[7+8*0]= h->cbp_table[top_xy] & 8;
2356         }
2357         if(IS_8x8DCT(left_type[0])){
2358             h->non_zero_count_cache[3+8*1]=
2359             h->non_zero_count_cache[3+8*2]= h->cbp_table[left_xy[0]]&2; //FIXME check MBAFF
2360         }
2361         if(IS_8x8DCT(left_type[1])){
2362             h->non_zero_count_cache[3+8*3]=
2363             h->non_zero_count_cache[3+8*4]= h->cbp_table[left_xy[1]]&8; //FIXME check MBAFF
2364         }
2365
2366         if(IS_8x8DCT(mb_type)){
2367             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
2368             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
2369
2370             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
2371             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
2372
2373             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
2374             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
2375
2376             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
2377             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
2378         }
2379     }
2380
         /* Load the motion vectors and reference ids along the top and left
          * edges from the neighbouring macroblocks. */
2381     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
2382         int list;
2383         for(list=0; list<h->list_count; list++){
2384             if(USES_LIST(top_type, list)){
                     /* Bottom row of the top macroblock; reference ids are mapped
                      * through the ref2frm table of the slice that macroblock
                      * belongs to. */
2385                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
2386                 const int b8_xy= 4*top_xy + 2;
2387                 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
2388                 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
2389                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
2390                 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]];
2391                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
2392                 h->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 1]];
2393             }else{
2394                 AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]);
2395                 AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
2396             }
2397
                 /* The left column is only loaded when the current and the left
                  * macroblock have the same interlaced flag. */
2398             if(!IS_INTERLACED(mb_type^left_type[0])){
2399                 if(USES_LIST(left_type[0], list)){
2400                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
2401                     const int b8_xy= 4*left_xy[0] + 1;
2402                     int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
2403                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]);
2404                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]);
2405                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]);
2406                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]);
2407                     h->ref_cache[list][scan8[0] - 1 + 0 ]=
2408                     h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*0]];
2409                     h->ref_cache[list][scan8[0] - 1 +16 ]=
2410                     h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*1]];
2411                 }else{
2412                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]);
2413                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]);
2414                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +16 ]);
2415                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +24 ]);
2416                     h->ref_cache[list][scan8[0] - 1 + 0  ]=
2417                     h->ref_cache[list][scan8[0] - 1 + 8  ]=
2418                     h->ref_cache[list][scan8[0] - 1 + 16 ]=
2419                     h->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
2420                 }
2421             }
2422         }
2423     }
2424
2425     return 0;
2426 }
2427
2428 static void loop_filter(H264Context *h){
         /* Run the deblocking filter over the macroblock row that was just
          * decoded (s->mb_y), or over both rows of the macroblock pairs when
          * FRAME_MBAFF is set. Nothing is filtered when h->deblocking_filter is 0.
          *
          * The loop overwrites h->mb_xy, h->slice_num, h->list_count, s->mb_x,
          * s->mb_y and h->chroma_qp[]; on exit h->slice_type, s->mb_x, s->mb_y
          * and h->chroma_qp[] are reset as shown at the bottom. */
2429     MpegEncContext * const s = &h->s;
2430     uint8_t  *dest_y, *dest_cb, *dest_cr;
2431     int linesize, uvlinesize, mb_x, mb_y;
2432     const int end_mb_y= s->mb_y + FRAME_MBAFF;
2433     const int old_slice_type= h->slice_type;
2434
2435     if(h->deblocking_filter) {
2436         for(mb_x= 0; mb_x<s->mb_width; mb_x++){
                 /* One iteration normally, two (top and bottom of the pair) when
                  * FRAME_MBAFF is 1. */
2437             for(mb_y=end_mb_y - FRAME_MBAFF; mb_y<= end_mb_y; mb_y++){
2438                 int mb_xy, mb_type;
                     /* Restore the per-macroblock state recorded during decoding. */
2439                 mb_xy = h->mb_xy = mb_x + mb_y*s->mb_stride;
2440                 h->slice_num= h->slice_table[mb_xy];
2441                 mb_type= s->current_picture.mb_type[mb_xy];
2442                 h->list_count= h->list_counts[mb_xy];
2443
2444                 if(FRAME_MBAFF)
2445                     h->mb_mbaff = h->mb_field_decoding_flag = !!IS_INTERLACED(mb_type);
2446
2447                 s->mb_x= mb_x;
2448                 s->mb_y= mb_y;
                     /* Top-left sample of the macroblock in each plane (16x16 luma,
                      * 8x8 chroma). */
2449                 dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2450                 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2451                 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2452                     //FIXME simplify above
2453
2454                 if (MB_FIELD) {
                         /* Field macroblock: lines are interleaved, so the stride
                          * doubles and the macroblock on an odd row starts one line
                          * below the start of the macroblock above it. */
2455                     linesize   = h->mb_linesize   = s->linesize * 2;
2456                     uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2457                     if(mb_y&1){ //FIXME move out of this function?
2458                         dest_y -= s->linesize*15;
2459                         dest_cb-= s->uvlinesize*7;
2460                         dest_cr-= s->uvlinesize*7;
2461                     }
2462                 } else {
2463                     linesize   = h->mb_linesize   = s->linesize;
2464                     uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2465                 }
2466                 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
                     /* A non-zero return means filtering this macroblock can be
                      * skipped. */
2467                 if(fill_filter_caches(h, mb_type))
2468                     continue;
2469                 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2470                 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2471
                     /* MBAFF frames use the general filter, everything else the
                      * fast variant. */
2472                 if (FRAME_MBAFF) {
2473                     ff_h264_filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2474                 } else {
2475                     ff_h264_filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2476                 }
2477             }
2478         }
2479     }
         /* Put back the state the slice decoder expects after a finished row. */
2480     h->slice_type= old_slice_type;
2481     s->mb_x= 0;
2482     s->mb_y= end_mb_y - FRAME_MBAFF;
2483     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
2484     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
2485 }
2486
2487 static void predict_field_decoding_flag(H264Context *h){
         /* Guess the field decoding flag of the macroblock at (s->mb_x, s->mb_y)
          * from its neighbours: use the left macroblock if it belongs to the
          * current slice, otherwise the macroblock one stride above if that one
          * does, otherwise assume a type of 0 (not interlaced).
          * The result is stored in both h->mb_mbaff and h->mb_field_decoding_flag. */
2488     MpegEncContext * const s = &h->s;
2489     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2490     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
2491                 ? s->current_picture.mb_type[mb_xy-1]
2492                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
2493                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
2494                 : 0;
2495     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
2496 }
2497
2498 static int decode_slice(struct AVCodecContext *avctx, void *arg){
         /* Decode all macroblocks of one slice.
          *
          * arg points to an H264Context pointer (the signature matches what
          * avctx->execute() is given in execute_decode_slices()). The slice
          * header must already have been parsed into that context.
          *
          * Macroblocks are decoded with the CABAC or CAVLC parser depending on
          * h->pps.cabac, reconstructed with ff_h264_hl_decode_mb(), and each
          * completed row is deblocked and handed to ff_draw_horiz_band().
          * The decoded range is reported through ff_er_add_slice().
          *
          * Returns 0 when the slice ended cleanly, -1 on a decoding error. */
2499     H264Context *h = *(void**)arg;
2500     MpegEncContext * const s = &h->s;
         /* With partitioned frames only the AC status bits are reported. */
2501     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
2502
2503     s->mb_skip_run= -1;
2504
2505     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
2506                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
2507
2508     if( h->pps.cabac ) {
2509         /* realign */
2510         align_get_bits( &s->gb );
2511
2512         /* init cabac */
2513         ff_init_cabac_states( &h->cabac);
             /* The arithmetic decoder runs over the bytes remaining after the
              * slice header. */
2514         ff_init_cabac_decoder( &h->cabac,
2515                                s->gb.buffer + get_bits_count(&s->gb)/8,
2516                                (get_bits_left(&s->gb) + 7)/8);
2517
2518         ff_h264_init_cabac_states(h);
2519
2520         for(;;){
2521 //START_TIMER
2522             int ret = ff_h264_decode_mb_cabac(h);
2523             int eos;
2524 //STOP_TIMER("decode_mb_cabac")
2525
2526             if(ret>=0) ff_h264_hl_decode_mb(h);
2527
                 /* MBAFF: decode the bottom macroblock of the pair right away. */
2528             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
2529                 s->mb_y++;
2530
2531                 ret = ff_h264_decode_mb_cabac(h);
2532
2533                 if(ret>=0) ff_h264_hl_decode_mb(h);
2534                 s->mb_y--;
2535             }
2536             eos = get_cabac_terminate( &h->cabac );
2537
                 /* With the truncated-stream workaround an overread is treated as a
                  * normal end of slice instead of an error. */
2538             if((s->workaround_bugs & FF_BUG_TRUNCATED) && h->cabac.bytestream > h->cabac.bytestream_end + 2){
2539                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2540                 return 0;
2541             }
                 /* Macroblock error, or the decoder read more than 2 bytes past the
                  * end of the data. */
2542             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
2543                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
2544                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
2545                 return -1;
2546             }
2547
                 /* End of a row: deblock it, output it, and move to the next row
                  * (two rows further for field or MBAFF pictures). */
2548             if( ++s->mb_x >= s->mb_width ) {
2549                 s->mb_x = 0;
2550                 loop_filter(h);
2551                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
2552                 ++s->mb_y;
2553                 if(FIELD_OR_MBAFF_PICTURE) {
2554                     ++s->mb_y;
2555                     if(FRAME_MBAFF && s->mb_y < s->mb_height)
2556                         predict_field_decoding_flag(h);
2557                 }
2558             }
2559
                 /* The slice ends on the CABAC terminate bit or after the last row. */
2560             if( eos || s->mb_y >= s->mb_height ) {
2561                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
2562                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2563                 return 0;
2564             }
2565         }
2566
2567     } else {
2568         for(;;){
2569             int ret = ff_h264_decode_mb_cavlc(h);
2570
2571             if(ret>=0) ff_h264_hl_decode_mb(h);
2572
                 /* MBAFF: decode the bottom macroblock of the pair right away. */
2573             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
2574                 s->mb_y++;
2575                 ret = ff_h264_decode_mb_cavlc(h);
2576
2577                 if(ret>=0) ff_h264_hl_decode_mb(h);
2578                 s->mb_y--;
2579             }
2580
2581             if(ret<0){
2582                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
2583                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
2584
2585                 return -1;
2586             }
2587
                 /* End of a row: same handling as in the CABAC loop above. */
2588             if(++s->mb_x >= s->mb_width){
2589                 s->mb_x=0;
2590                 loop_filter(h);
2591                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
2592                 ++s->mb_y;
2593                 if(FIELD_OR_MBAFF_PICTURE) {
2594                     ++s->mb_y;
2595                     if(FRAME_MBAFF && s->mb_y < s->mb_height)
2596                         predict_field_decoding_flag(h);
2597                 }
                     /* Past the last row: success only if every bit of the slice was
                      * consumed exactly. */
2598                 if(s->mb_y >= s->mb_height){
2599                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
2600
2601                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
2602                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2603
2604                         return 0;
2605                     }else{
2606                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2607
2608                         return -1;
2609                     }
2610                 }
2611             }
2612
                 /* Out of bits with no skip run pending: the slice is over. Reading
                  * past the end counts as an error. */
2613             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
2614                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
2615                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
2616                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2617
2618                     return 0;
2619                 }else{
2620                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
2621
2622                     return -1;
2623                 }
2624             }
2625         }
2626     }
2627
     /* The block below is compiled out (#if 0). */
2628 #if 0
2629     for(;s->mb_y < s->mb_height; s->mb_y++){
2630         for(;s->mb_x < s->mb_width; s->mb_x++){
2631             int ret= decode_mb(h);
2632
2633             ff_h264_hl_decode_mb(h);
2634
2635             if(ret<0){
2636                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
2637                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
2638
2639                 return -1;
2640             }
2641
2642             if(++s->mb_x >= s->mb_width){
2643                 s->mb_x=0;
2644                 if(++s->mb_y >= s->mb_height){
2645                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
2646                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2647
2648                         return 0;
2649                     }else{
2650                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2651
2652                         return -1;
2653                     }
2654                 }
2655             }
2656
2657             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
2658                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
2659                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
2660
2661                     return 0;
2662                 }else{
2663                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
2664
2665                     return -1;
2666                 }
2667             }
2668         }
2669         s->mb_x=0;
2670         ff_draw_horiz_band(s, 16*s->mb_y, 16);
2671     }
2672 #endif
2673     return -1; //not reached
2674 }
2675
2676 /**
2677  * Call decode_slice() for each context.
2678  *
      * Does nothing when a hwaccel is attached to the codec context or the
      * codec has the VDPAU capability flag. A single context is decoded
      * directly on the master context; several contexts are dispatched
      * through avctx->execute(). The return value of decode_slice() is not
      * examined here.
      *
2679  * @param h h264 master context
2680  * @param context_count number of contexts to execute
2681  */
2682 static void execute_decode_slices(H264Context *h, int context_count){
2683     MpegEncContext * const s = &h->s;
2684     AVCodecContext * const avctx= s->avctx;
2685     H264Context *hx;
2686     int i;
2687
2688     if (s->avctx->hwaccel)
2689         return;
2690     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2691         return;
2692     if(context_count == 1) {
2693         decode_slice(avctx, &h);
2694     } else {
             /* Reset the error state of the extra contexts (index 1 and up)
              * before they run. */
2695         for(i = 1; i < context_count; i++) {
2696             hx = h->thread_context[i];
2697             hx->s.error_recognition = avctx->error_recognition;
2698             hx->s.error_count = 0;
2699         }
2700
             /* One job per context; each job receives a pointer to its entry of
              * h->thread_context. */
2701         avctx->execute(avctx, (void *)decode_slice,
2702                        h->thread_context, NULL, context_count, sizeof(void*));
2703
2704         /* pull back stuff from slices to master context */
2705         hx = h->thread_context[context_count - 1];
2706         s->mb_x = hx->s.mb_x;
2707         s->mb_y = hx->s.mb_y;
2708         s->dropable = hx->s.dropable;
2709         s->picture_structure = hx->s.picture_structure;
             /* Accumulate the error counts of the extra contexts into the master. */
2710         for(i = 1; i < context_count; i++)
2711             h->s.error_count += h->thread_context[i]->s.error_count;
2712     }
2713 }
2714
2715
2716 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
         /* Split a packet into NAL units and process each of them.
          *
          * In AVC mode (h->is_avc) every NAL unit is preceded by a big-endian
          * length of h->nal_length_size bytes; otherwise NAL units are located
          * by searching for the 00 00 01 start code. Slice NAL units are parsed
          * into thread contexts and decoded in batches of h->max_contexts
          * through execute_decode_slices(); SEI/SPS/PPS are parsed on the
          * master context.
          *
          * @param h        master decoder context
          * @param buf      packet data
          * @param buf_size size of buf in bytes
          * @return the buffer position reached (buf_index), or -1 on an
          *         unrecoverable error
          */
2717     MpegEncContext * const s = &h->s;
2718     AVCodecContext * const avctx= s->avctx;
2719     int buf_index=0;
2720     H264Context *hx; ///< thread context
2721     int context_count = 0;
         /* In Annex-B mode next_avc == buf_size keeps the length-prefix branch
          * below from being taken until the buffer is exhausted. */
2722     int next_avc= h->is_avc ? 0 : buf_size;
2723
2724     h->max_contexts = avctx->thread_count;
2725 #if 0
2726     int i;
2727     for(i=0; i<50; i++){
2728         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
2729     }
2730 #endif
         /* Unless the caller feeds partial frames (CHUNKS), every packet starts
          * a new access unit. */
2731     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
2732         h->current_slice = 0;
2733         if (!s->first_field)
2734             s->current_picture_ptr= NULL;
2735         ff_h264_reset_sei(h);
2736     }
2737
2738     for(;;){
2739         int consumed;
2740         int dst_length;
2741         int bit_length;
2742         const uint8_t *ptr;
2743         int i, nalsize = 0;
2744         int err;
2745
2746         if(buf_index >= next_avc) {
                 /* Stop unless the whole length prefix plus at least one payload
                  * byte is left; this keeps the prefix read below inside buf. */
2747             if(buf_index >= buf_size - h->nal_length_size) break;
2748             nalsize = 0;
2749             for(i = 0; i < h->nal_length_size; i++)
2750                 nalsize = (nalsize << 8) | buf[buf_index++];
2751             if(nalsize <= 0 || nalsize > buf_size - buf_index){
2752                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
2753                 break;
2754             }
2755             next_avc= buf_index + nalsize;
2756         } else {
2757             // start code prefix search
2758             for(; buf_index + 3 < next_avc; buf_index++){
2759                 // This should always succeed in the first iteration.
2760                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
2761                     break;
2762             }
2763
2764             if(buf_index+3 >= buf_size) break;
2765
2766             buf_index+=3;
2767             if(buf_index >= next_avc) continue;
2768         }
2769
2770         hx = h->thread_context[context_count];
2771
2772         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
2773         if (ptr==NULL || dst_length < 0){
2774             return -1;
2775         }
             /* Auto-detect streams where a 00 00 01 E0 start code directly
              * follows the NAL unit and enable the truncation workaround. */
2776         i= buf_index + consumed;
2777         if((s->workaround_bugs & FF_BUG_AUTODETECT) && i+3<next_avc &&
2778            buf[i]==0x00 && buf[i+1]==0x00 && buf[i+2]==0x01 && buf[i+3]==0xE0)
2779             s->workaround_bugs |= FF_BUG_TRUNCATED;
2780
2781         if(!(s->workaround_bugs & FF_BUG_TRUNCATED)){
             /* Strip trailing zero bytes. The length is tested first so that
              * ptr[-1] is never read when the payload is empty or all zeros. */
2782         while(dst_length > 0 && ptr[dst_length - 1] == 0)
2783             dst_length--;
2784         }
2785         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
2786
2787         if(s->avctx->debug&FF_DEBUG_STARTCODE){
2788             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
2789         }
2790
2791         if (h->is_avc && (nalsize != consumed) && nalsize){
2792             av_log(h->s.avctx, AV_LOG_DEBUG, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
2793         }
2794
2795         buf_index += consumed;
2796
             /* Drop non-reference NAL units when the caller asked for that. */
2797         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
2798            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
2799             continue;
2800
2801       again:
2802         err = 0;
2803         switch(hx->nal_unit_type){
2804         case NAL_IDR_SLICE:
2805             if (h->nal_unit_type != NAL_IDR_SLICE) {
2806                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices\n");
2807                 return -1;
2808             }
2809             idr(h); //FIXME ensure we don't lose some frames if there is reordering
                 /* fall through: an IDR slice is then handled like any other slice */
2810         case NAL_SLICE:
2811             init_get_bits(&hx->s.gb, ptr, bit_length);
2812             hx->intra_gb_ptr=
2813             hx->inter_gb_ptr= &hx->s.gb;
2814             hx->s.data_partitioning = 0;
2815
2816             if((err = decode_slice_header(hx, h)))
2817                break;
2818
                 /* First slice of the picture: start the hardware frame, if any. */
2819             if (h->current_slice == 1) {
2820                 if (s->avctx->hwaccel && s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
2821                     return -1;
2822                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2823                     ff_vdpau_h264_picture_start(s);
2824             }
2825
2826             s->current_picture_ptr->key_frame |=
2827                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
2828                     (h->sei_recovery_frame_cnt >= 0);
                 /* Queue the slice for decoding unless it is redundant or filtered
                  * out by hurry_up / skip_frame. */
2829             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
2830                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
2831                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
2832                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
2833                && avctx->skip_frame < AVDISCARD_ALL){
2834                 if(avctx->hwaccel) {
2835                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
2836                         return -1;
2837                 }else
2838                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
2839                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
2840                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
2841                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
2842                 }else
2843                     context_count++;
2844             }
2845             break;
2846         case NAL_DPA:
                 /* Data partition A carries the slice header; partitions B and C
                  * supply the intra and inter bit readers below. */
2847             init_get_bits(&hx->s.gb, ptr, bit_length);
2848             hx->intra_gb_ptr=
2849             hx->inter_gb_ptr= NULL;
2850
2851             if ((err = decode_slice_header(hx, h)) < 0)
2852                 break;
2853
2854             hx->s.data_partitioning = 1;
2855
2856             break;
2857         case NAL_DPB:
2858             init_get_bits(&hx->intra_gb, ptr, bit_length);
2859             hx->intra_gb_ptr= &hx->intra_gb;
2860             break;
2861         case NAL_DPC:
2862             init_get_bits(&hx->inter_gb, ptr, bit_length);
2863             hx->inter_gb_ptr= &hx->inter_gb;
2864
                 /* A partitioned slice is queued once partition C arrives and the
                  * intra reader (partition B) has been set. */
2865             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
2866                && s->context_initialized
2867                && s->hurry_up < 5
2868                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
2869                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
2870                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
2871                && avctx->skip_frame < AVDISCARD_ALL)
2872                 context_count++;
2873             break;
2874         case NAL_SEI:
2875             init_get_bits(&s->gb, ptr, bit_length);
2876             ff_h264_decode_sei(h);
2877             break;
2878         case NAL_SPS:
2879             init_get_bits(&s->gb, ptr, bit_length);
2880             ff_h264_decode_seq_parameter_set(h);
2881
2882             if(s->flags& CODEC_FLAG_LOW_DELAY)
2883                 s->low_delay=1;
2884
2885             if(avctx->has_b_frames < 2)
2886                 avctx->has_b_frames= !s->low_delay;
2887             break;
2888         case NAL_PPS:
2889             init_get_bits(&s->gb, ptr, bit_length);
2890
2891             ff_h264_decode_picture_parameter_set(h, bit_length);
2892
2893             break;
2894         case NAL_AUD:
2895         case NAL_END_SEQUENCE:
2896         case NAL_END_STREAM:
2897         case NAL_FILLER_DATA:
2898         case NAL_SPS_EXT:
2899         case NAL_AUXILIARY_SLICE:
2900             break;
2901         default:
2902             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", hx->nal_unit_type, bit_length);
2903         }
2904
             /* All contexts are filled: decode the queued slices now. */
2905         if(context_count == h->max_contexts) {
2906             execute_decode_slices(h, context_count);
2907             context_count = 0;
2908         }
2909
2910         if (err < 0)
2911             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
2912         else if(err == 1) {
2913             /* Slice could not be decoded in parallel mode, copy down
2914              * NAL unit stuff to context 0 and restart. Note that
2915              * rbsp_buffer is not transferred, but since we no longer
2916              * run in parallel mode this should not be an issue. */
2917             h->nal_unit_type = hx->nal_unit_type;
2918             h->nal_ref_idc   = hx->nal_ref_idc;
2919             hx = h;
2920             goto again;
2921         }
2922     }
         /* Decode whatever is still queued at the end of the packet. */
2923     if(context_count)
2924         execute_decode_slices(h, context_count);
2925     return buf_index;
2926 }
2927
2928 /**
2929  * returns the number of bytes consumed for building the current frame
2930  */
2931 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
2932         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
2933         if(pos+10>buf_size) pos=buf_size; // oops ;)
2934
2935         return pos;
2936 }
2937
/**
 * Decode one packet and, if one is ready, hand out a picture in display order.
 *
 * An empty packet (avpkt->size == 0) drains one picture from h->delayed_pic.
 * Otherwise the packet is passed to decode_nal_units(); once a picture is
 * complete its interlacing/repeat flags are derived, it is appended to
 * h->delayed_pic, and the entry with the lowest poc is considered for output.
 *
 * @param avctx     codec context; avctx->priv_data is the H264Context
 * @param data      destination AVFrame, written only when a picture is output
 * @param data_size set to sizeof(AVFrame) when *data was written, set to 0
 *                  while waiting for the second field, left untouched on the
 *                  other paths
 * @param avpkt     input packet
 * @return 0 when draining or when a missing frame is tolerated because of
 *         skip_frame/hurry_up, -1 on error, otherwise the value of
 *         get_consumed_bytes()
 */
2938 static int decode_frame(AVCodecContext *avctx,
2939                              void *data, int *data_size,
2940                              AVPacket *avpkt)
2941 {
2942     const uint8_t *buf = avpkt->data;
2943     int buf_size = avpkt->size;
2944     H264Context *h = avctx->priv_data;
2945     MpegEncContext *s = &h->s;
2946     AVFrame *pict = data;
2947     int buf_index;
2948
2949     s->flags= avctx->flags;
2950     s->flags2= avctx->flags2;
2951
2952    /* end of stream, output what is still in the buffers */
2953  out:
2954     if (buf_size == 0) {
2955         Picture *out;
2956         int i, out_idx;
2957
2958 //FIXME factorize this with the output code below
             /* Pick the delayed picture with the lowest poc; the scan stops at
              * the end of the list or at the first later entry that is a key
              * frame or carries mmco_reset. */
2959         out = h->delayed_pic[0];
2960         out_idx = 0;
2961         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
2962             if(h->delayed_pic[i]->poc < out->poc){
2963                 out = h->delayed_pic[i];
2964                 out_idx = i;
2965             }
2966
             /* Remove the chosen entry by shifting the rest of the
              * NULL-terminated list down by one. */
2967         for(i=out_idx; h->delayed_pic[i]; i++)
2968             h->delayed_pic[i] = h->delayed_pic[i+1];
2969
2970         if(out){
2971             *data_size = sizeof(AVFrame);
2972             *pict= *(AVFrame*)out;
2973         }
2974
2975         return 0;
2976     }
2977
2978     buf_index=decode_nal_units(h, buf, buf_size);
2979     if(buf_index < 0)
2980         return -1;
2981
         /* No picture was started and the last NAL was an end-of-sequence:
          * jump back and take the drain path above. */
2982     if (!s->current_picture_ptr && h->nal_unit_type == NAL_END_SEQUENCE) {
2983         buf_size = 0;
2984         goto out;
2985     }
2986
2987     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
             /* A missing frame is not an error when frames are being skipped. */
2988         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
2989         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
2990         return -1;
2991     }
2992
         /* In CHUNKS mode the picture is only finished once all macroblock
          * rows have been decoded; otherwise it is finished for every packet. */
2993     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
2994         Picture *out = s->current_picture_ptr;
2995         Picture *cur = s->current_picture_ptr;
2996         int i, pics, out_of_order, out_idx;
2997
2998         field_end(h);
2999
3000         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
3001             /* Wait for second field. */
3002             *data_size = 0;
3003
3004         } else {
3005             cur->interlaced_frame = 0;
3006             cur->repeat_pict = 0;
3007
3008             /* Signal interlacing information externally. */
3009             /* Prioritize picture timing SEI information over used decoding process if it exists. */
3010
3011             if(h->sps.pic_struct_present_flag){
3012                 switch (h->sei_pic_struct)
3013                 {
3014                 case SEI_PIC_STRUCT_FRAME:
3015                     break;
3016                 case SEI_PIC_STRUCT_TOP_FIELD:
3017                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
3018                     cur->interlaced_frame = 1;
3019                     break;
3020                 case SEI_PIC_STRUCT_TOP_BOTTOM:
3021                 case SEI_PIC_STRUCT_BOTTOM_TOP:
3022                     if (FIELD_OR_MBAFF_PICTURE)
3023                         cur->interlaced_frame = 1;
3024                     else
3025                         // try to flag soft telecine progressive
3026                         cur->interlaced_frame = h->prev_interlaced_frame;
3027                     break;
3028                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
3029                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
3030                     // Signal the possibility of telecined film externally (pic_struct 5,6)
3031                     // From these hints, let the applications decide if they apply deinterlacing.
3032                     cur->repeat_pict = 1;
3033                     break;
3034                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
3035                     // Force progressive here, as doubling interlaced frame is a bad idea.
3036                     cur->repeat_pict = 2;
3037                     break;
3038                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
3039                     cur->repeat_pict = 4;
3040                     break;
3041                 }
3042
3043                 if ((h->sei_ct_type & 3) && h->sei_pic_struct <= SEI_PIC_STRUCT_BOTTOM_TOP)
3044                     cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
3045             }else{
3046                 /* Derive interlacing flag from used decoding process. */
3047                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
3048             }
                 /* Remembered for the soft-telecine case of the next picture. */
3049             h->prev_interlaced_frame = cur->interlaced_frame;
3050
3051             if (cur->field_poc[0] != cur->field_poc[1]){
3052                 /* Derive top_field_first from field pocs. */
3053                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
3054             }else{
3055                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
3056                     /* Use picture timing SEI information. Even if it is information from a past frame, it is better than nothing. */
3057                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
3058                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
3059                         cur->top_field_first = 1;
3060                     else
3061                         cur->top_field_first = 0;
3062                 }else{
3063                     /* Most likely progressive */
3064                     cur->top_field_first = 0;
3065                 }
3066             }
3067
3068         //FIXME do something with unavailable reference frames
3069
3070             /* Sort B-frames into display order */
3071
                 /* Raise the output delay to what the SPS announces, if it is larger. */
3072             if(h->sps.bitstream_restriction_flag
3073                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
3074                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
3075                 s->low_delay = 0;
3076             }
3077
                 /* Strict compliance without a signalled reorder depth: use the
                  * maximum delay. */
3078             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
3079                && !h->sps.bitstream_restriction_flag){
3080                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
3081                 s->low_delay= 0;
3082             }
3083
                 /* Count the pictures already queued (the list is NULL-terminated). */
3084             pics = 0;
3085             while(h->delayed_pic[pics]) pics++;
3086
3087             assert(pics <= MAX_DELAYED_PIC_COUNT);
3088
                 /* Queue the current picture; if it is not a reference it is
                  * marked DELAYED_PIC_REF (the flag is cleared again when the
                  * picture leaves the queue below). */
3089             h->delayed_pic[pics++] = cur;
3090             if(cur->reference == 0)
3091                 cur->reference = DELAYED_PIC_REF;
3092
                 /* Same lowest-poc selection as in the drain path above. */
3093             out = h->delayed_pic[0];
3094             out_idx = 0;
3095             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
3096                 if(h->delayed_pic[i]->poc < out->poc){
3097                     out = h->delayed_pic[i];
3098                     out_idx = i;
3099                 }
3100             if(s->avctx->has_b_frames == 0 && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset))
3101                 h->outputed_poc= INT_MIN;
                 /* The candidate precedes the picture that was output last. */
3102             out_of_order = out->poc < h->outputed_poc;
3103
                 /* Delay already covers the signalled reorder depth: leave it alone. */
3104             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
3105                 { }
                 /* Otherwise grow the delay by one when the current setting
                  * turns out to be too small. */
3106             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
3107                || (s->low_delay &&
3108                 ((h->outputed_poc != INT_MIN && out->poc > h->outputed_poc + 2)
3109                  || cur->pict_type == FF_B_TYPE)))
3110             {
3111                 s->low_delay = 0;
3112                 s->avctx->has_b_frames++;
3113             }
3114
                 /* Drop the candidate from the queue if it is late or the queue
                  * is longer than the delay. */
3115             if(out_of_order || pics > s->avctx->has_b_frames){
3116                 out->reference &= ~DELAYED_PIC_REF;
3117                 for(i=out_idx; h->delayed_pic[i]; i++)
3118                     h->delayed_pic[i] = h->delayed_pic[i+1];
3119             }
                 /* Only an in-order candidate is actually returned to the caller. */
3120             if(!out_of_order && pics > s->avctx->has_b_frames){
3121                 *data_size = sizeof(AVFrame);
3122
3123                 if(out_idx==0 && h->delayed_pic[0] && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset)) {
3124                     h->outputed_poc = INT_MIN;
3125                 } else
3126                     h->outputed_poc = out->poc;
3127                 *pict= *(AVFrame*)out;
3128             }else{
3129                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
3130             }
3131         }
3132     }
3133
3134     assert(pict->data[0] || !*data_size);
3135     ff_print_debug_info(s, pict);
3136 //printf("out %d\n", (int)pict->data[0]);
3137
3138     return get_consumed_bytes(s, buf_index, buf_size);
3139 }
#if 0
/**
 * Disabled helper: fill h->mb_avail[] for the current macroblock.
 * Entries 0..2 describe the row above (left, centre, right) and entry 3 the
 * left neighbour; a neighbour counts as available only if it lies inside the
 * picture and its slice_table entry equals the current slice number.
 */
static inline void fill_mb_avail(H264Context *h)
{
    MpegEncContext *const s = &h->s;
    const int cur = s->mb_x + s->mb_y * s->mb_stride;

    if (!s->mb_y) {
        /* first row: nothing above */
        h->mb_avail[2] = 0;
        h->mb_avail[1] = 0;
        h->mb_avail[0] = 0;
    } else {
        const int above = cur - s->mb_stride;

        h->mb_avail[0] = s->mb_x && h->slice_table[above - 1] == h->slice_num;
        h->mb_avail[1] = h->slice_table[above] == h->slice_num;
        h->mb_avail[2] = s->mb_x + 1 < s->mb_width && h->slice_table[above + 1] == h->slice_num;
    }

    h->mb_avail[3] = s->mb_x && h->slice_table[cur - 1] == h->slice_num;
    h->mb_avail[4] = 1; //FIXME move out
    h->mb_avail[5] = 0; //FIXME move out
}
#endif
3159
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self-test entry point, only built when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a scratch
 * buffer, reads them back and prints a line for every value that does not
 * round-trip. Mismatches are only reported; they do not change the exit
 * status. The (I)DCT, quantizer and NAL tests are disabled with #if 0.
 *
 * @return 0 (the disabled NAL test would return -1 on failure)
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    /* zero-initialized so that dsputil_init() never reads indeterminate fields */
    AVCodecContext avctx = {0};

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* values are centred around zero: -COUNT/2 .. COUNT/2-1 */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        h->h264dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
3334
3335
3336 av_cold void ff_h264_free_context(H264Context *h)
3337 {
3338     int i;
3339
3340     free_tables(h, 1); //FIXME cleanup init stuff perhaps
3341
3342     for(i = 0; i < MAX_SPS_COUNT; i++)
3343         av_freep(h->sps_buffers + i);
3344
3345     for(i = 0; i < MAX_PPS_COUNT; i++)
3346         av_freep(h->pps_buffers + i);
3347 }
3348
3349 av_cold int ff_h264_decode_end(AVCodecContext *avctx)
3350 {
3351     H264Context *h = avctx->priv_data;
3352     MpegEncContext *s = &h->s;
3353
3354     ff_h264_free_context(h);
3355
3356     MPV_common_end(s);
3357
3358 //    memset(h, 0, sizeof(H264Context));
3359
3360     return 0;
3361 }
3362
3363
/*
 * Codec table entry for the "h264" decoder.
 * The leading initializers are positional, so their meaning depends on the
 * member order of AVCodec, which is declared outside this file. The per-line
 * notes marked "presumably" are assumptions -- TODO confirm against the
 * AVCodec declaration.
 */
3364 AVCodec h264_decoder = {
3365     "h264",                  /* presumably the codec name */
3366     AVMEDIA_TYPE_VIDEO,
3367     CODEC_ID_H264,
3368     sizeof(H264Context),     /* presumably the size of avctx->priv_data */
3369     ff_h264_decode_init,     /* presumably the init callback */
3370     NULL,                    /* presumably the (unused) encode callback */
3371     ff_h264_decode_end,      /* presumably the close callback */
3372     decode_frame,            /* presumably the decode callback */
3373     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
3374     .flush= flush_dpb,
3375     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
3376 };
3377
3377
/*
 * Codec table entry for the "h264_vdpau" decoder, compiled only when
 * CONFIG_H264_VDPAU_DECODER is set. It uses the same functions as
 * h264_decoder; it differs in its name, in the additional
 * CODEC_CAP_HWACCEL_VDPAU flag and in restricting pix_fmts to
 * PIX_FMT_VDPAU_H264.
 * The leading initializers are positional; their meaning depends on the
 * member order of AVCodec, declared outside this file -- TODO confirm.
 */
3378 #if CONFIG_H264_VDPAU_DECODER
3379 AVCodec h264_vdpau_decoder = {
3380     "h264_vdpau",
3381     AVMEDIA_TYPE_VIDEO,
3382     CODEC_ID_H264,
3383     sizeof(H264Context),
3384     ff_h264_decode_init,
3385     NULL,
3386     ff_h264_decode_end,
3387     decode_frame,
3388     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
3389     .flush= flush_dpb,
3390     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
         /* list terminated by PIX_FMT_NONE */
3391     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_VDPAU_H264, PIX_FMT_NONE},
3392 };
3393 #endif