git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  64 static const int total_zeros_vlc_tables_size = 512;
  65
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2];
  72 static const int run_vlc_tables_size = 8;
  73
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #if HAVE_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values where changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
/**
 * Parses the one-byte NAL unit header and returns the payload with the
 * escape bytes removed.
 *
 * The header byte fills h->nal_ref_idc (src[0]>>5) and h->nal_unit_type
 * (src[0]&0x1F). The payload is then scanned for a 00 00 xx sequence with
 * xx <= 3: xx == 3 is an escape that has to be removed, any other value is
 * treated as the next start code and ends the unit there.
 *
 * @param h          decoder context; receives the header fields and owns the
 *                   rbsp_buffer[] used for the unescaped copy
 * @param src        start of the NAL unit, header byte included
 * @param dst_length receives the number of payload bytes returned
 * @param consumed   receives the number of input bytes used, header included
 * @param length     number of bytes available at src, header included
 * @return src+1 when nothing had to be unescaped (no copy is made), otherwise
 *         h->rbsp_buffer[bufidx] holding the unescaped payload followed by
 *         FF_INPUT_BUFFER_PADDING_SIZE zero bytes, or NULL when that buffer
 *         could not be allocated
 *
 * NOTE(review): the word-wide loads and the unbounded "while(src[i]) i++"
 * below only check i+1<length, so they can touch bytes past length --
 * presumably the caller guarantees padding after the input; confirm.
 */
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1366     int i, si, di;
1367     uint8_t *dst;
1368     int bufidx;
1369
1370 //    src[0]&0x80;                //forbidden bit
1371     h->nal_ref_idc= src[0]>>5;
1372     h->nal_unit_type= src[0]&0x1F;
1373
         /* from here on src/length describe the payload only */
1374     src++; length--;
1375 #if 0
1376     for(i=0; i<length; i++)
1377         printf("%2X ", src[i]);
1378 #endif
1379
         /* Scan for the first 00 00 0x candidate. Three variants share the loop
          * body after the #endif; RS is chosen so that "i-= RS" followed by the
          * loop increment always advances i by 2 in total.
          * NOTE(review): the masked expression looks like the usual
          * "has a zero byte" bit trick applied to some byte lanes -- confirm. */
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1382 #   define RS 7
1383     for(i=0; i+1<length; i+=9){
1384         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1385 # else
1386 #   define RS 3
1387     for(i=0; i+1<length; i+=5){
1388         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1389 # endif
1390             continue;
             /* step back onto a zero byte at the start of the word, then walk
              * forward to the first zero byte */
1391         if(i>0 && !src[i]) i--;
1392         while(src[i]) i++;
1393 #else
1394 #   define RS 0
1395     for(i=0; i+1<length; i+=2){
1396         if(src[i]) continue;
1397         if(i>0 && src[i-1]==0) i--;
1398 #endif
1399         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1400             if(src[i+2]!=3){
1401                 /* startcode, so we must be past the end */
1402                 length=i;
1403             }
1404             break;
1405         }
1406         i-= RS;
1407     }
1408
         /* scan ran to the end (or length was cut at a start code): the input
          * can be used directly without copying */
1409     if(i>=length-1){ //no escaped 0
1410         *dst_length= length;
1411         *consumed= length+1; //+1 for the header
1412         return src;
1413     }
1414
1415     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416     av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417     dst= h->rbsp_buffer[bufidx];
1418
1419     if (dst == NULL){
1420         return NULL;
1421     }
1422
1423 //printf("decoding esc\n");
         /* the first i bytes are known to contain no escape */
1424     memcpy(dst, src, i);
1425     si=di=i;
1426     while(si+2<length){
1427         //remove escapes (very rare 1:2^22)
1428         if(src[si+2]>3){
                 /* a third byte above 3 rules out 00 00 0x starting at si, si+1
                  * or si+2, so three bytes are copied: two here and one by the
                  * statement at the end of the loop body */
1429             dst[di++]= src[si++];
1430             dst[di++]= src[si++];
1431         }else if(src[si]==0 && src[si+1]==0){
1432             if(src[si+2]==3){ //escape
                     /* keep the two zero bytes, drop the 03 */
1433                 dst[di++]= 0;
1434                 dst[di++]= 0;
1435                 si+=3;
1436                 continue;
1437             }else //next start code
1438                 goto nsc;
1439         }
1440
1441         dst[di++]= src[si++];
1442     }
         /* fewer than three bytes left: no escape can start here */
1443     while(si<length)
1444         dst[di++]= src[si++];
1445 nsc:
1446
1447     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1448
1449     *dst_length= di;
1450     *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1452     return dst;
1453 }
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
/* Compiled out (#if 0): forward counterpart of the luma DC transform.
 * The body uses `stride` without declaring it, so it relies on a `stride`
 * macro being defined at this point if the code is ever re-enabled. */
1508 #if 0
1509 /**
1510  * DCT transforms the 16 dc values.
1511  * @param qp quantization parameter ??? FIXME
1512  */
1513 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1514 //    const int qmul= dequant_coeff[qp][0];
1515     int i;
1516     int temp[16]; //FIXME check if this is a good idea
1517     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1518     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1519
         /* first butterfly pass: block -> temp */
1520     for(i=0; i<4; i++){
1521         const int offset= y_offset[i];
1522         const int z0= block[offset+stride*0] + block[offset+stride*4];
1523         const int z1= block[offset+stride*0] - block[offset+stride*4];
1524         const int z2= block[offset+stride*1] - block[offset+stride*5];
1525         const int z3= block[offset+stride*1] + block[offset+stride*5];
1526
1527         temp[4*i+0]= z0+z3;
1528         temp[4*i+1]= z1+z2;
1529         temp[4*i+2]= z1-z2;
1530         temp[4*i+3]= z0-z3;
1531     }
1532
         /* second butterfly pass: temp -> block, results halved */
1533     for(i=0; i<4; i++){
1534         const int offset= x_offset[i];
1535         const int z0= temp[4*0+i] + temp[4*2+i];
1536         const int z1= temp[4*0+i] - temp[4*2+i];
1537         const int z2= temp[4*1+i] - temp[4*3+i];
1538         const int z3= temp[4*1+i] + temp[4*3+i];
1539
1540         block[stride*0 +offset]= (z0 + z3)>>1;
1541         block[stride*2 +offset]= (z1 + z2)>>1;
1542         block[stride*8 +offset]= (z1 - z2)>>1;
1543         block[stride*10+offset]= (z0 - z3)>>1;
1544     }
1545 }
1546 #endif
1547
1548 #undef xStride
1549 #undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
/* Compiled out (#if 0): 2x2 butterfly over the four chroma DC values,
 * written back in place without any scaling. */
1572 #if 0
1573 static void chroma_dc_dct_c(DCTELEM *block){
1574     const int stride= 16*2;
1575     const int xStride= 16;
1576     int a,b,c,d,e;
1577
1578     a= block[stride*0 + xStride*0];
1579     b= block[stride*0 + xStride*1];
1580     c= block[stride*1 + xStride*0];
1581     d= block[stride*1 + xStride*1];
1582
         /* after this step: a/e = sum/difference of the first row,
          * c/b = sum/difference of the second row */
1583     e= a-b;
1584     a= a+b;
1585     b= c-d;
1586     c= c+d;
1587
1588     block[stride*0 + xStride*0]= (a+c);
1589     block[stride*0 + xStride*1]= (e+b);
1590     block[stride*1 + xStride*0]= (a-c);
1591     block[stride*1 + xStride*1]= (e-b);
1592 }
1593 #endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604                            int src_x_offset, int src_y_offset,
1605                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606     MpegEncContext * const s = &h->s;
1607     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609     const int luma_xy= (mx&3) + ((my&3)<<2);
1610     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611     uint8_t * src_cb, * src_cr;
1612     int extra_width= h->emu_edge_width;
1613     int extra_height= h->emu_edge_height;
1614     int emu=0;
1615     const int full_mx= mx>>2;
1616     const int full_my= my>>2;
1617     const int pic_width  = 16*s->mb_width;
1618     const int pic_height = 16*s->mb_height >> MB_FIELD;
1619
1620     if(mx&7) extra_width -= 3;
1621     if(my&7) extra_height -= 3;
1622
1623     if(   full_mx < 0-extra_width
1624        || full_my < 0-extra_height
1625        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1629         emu=1;
1630     }
1631
1632     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1633     if(!square){
1634         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1635     }
1636
1637     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1638
1639     if(MB_FIELD){
1640         // chroma offset when predicting from a field of opposite parity
1641         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1643     }
1644     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1646
1647     if(emu){
1648         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649             src_cb= s->edge_emu_buffer;
1650     }
1651     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1652
1653     if(emu){
1654         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655             src_cr= s->edge_emu_buffer;
1656     }
1657     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1658 }
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799     MpegEncContext * const s = &h->s;
1800     const int mb_xy= h->mb_xy;
1801     const int mb_type= s->current_picture.mb_type[mb_xy];
1802
1803     assert(IS_INTER(mb_type));
1804
1805     prefetch_motion(h, 0);
1806
1807     if(IS_16X16(mb_type)){
1808         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810                 &weight_op[0], &weight_avg[0],
1811                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812     }else if(IS_16X8(mb_type)){
1813         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815                 &weight_op[1], &weight_avg[1],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819                 &weight_op[1], &weight_avg[1],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else if(IS_8X16(mb_type)){
1822         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824                 &weight_op[2], &weight_avg[2],
1825                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828                 &weight_op[2], &weight_avg[2],
1829                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830     }else{
1831         int i;
1832
1833         assert(IS_8X8(mb_type));
1834
1835         for(i=0; i<4; i++){
1836             const int sub_mb_type= h->sub_mb_type[i];
1837             const int n= 4*i;
1838             int x_offset= (i&1)<<2;
1839             int y_offset= (i&2)<<1;
1840
1841             if(IS_SUB_8X8(sub_mb_type)){
1842                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844                     &weight_op[3], &weight_avg[3],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_8X4(sub_mb_type)){
1847                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849                     &weight_op[4], &weight_avg[4],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853                     &weight_op[4], &weight_avg[4],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else if(IS_SUB_4X8(sub_mb_type)){
1856                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858                     &weight_op[5], &weight_avg[5],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                     &weight_op[5], &weight_avg[5],
1863                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864             }else{
1865                 int j;
1866                 assert(IS_SUB_4X4(sub_mb_type));
1867                 for(j=0; j<4; j++){
1868                     int sub_x_offset= x_offset + 2*(j&1);
1869                     int sub_y_offset= y_offset +   (j&2);
1870                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872                         &weight_op[6], &weight_avg[6],
1873                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1874                 }
1875             }
1876         }
1877     }
1878
1879     prefetch_motion(h, 1);
1880 }
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000         av_freep(&hx->rbsp_buffer[1]);
2001         av_freep(&hx->rbsp_buffer[0]);
2002         if (i) av_freep(&h->thread_context[i]);
2003     }
2004 }
2005
2006 static void init_dequant8_coeff_table(H264Context *h){
2007     int i,q,x;
2008     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2009     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2010     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2011
2012     for(i=0; i<2; i++ ){
2013         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2014             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2015             break;
2016         }
2017
2018         for(q=0; q<52; q++){
2019             int shift = div6[q];
2020             int idx = rem6[q];
2021             for(x=0; x<64; x++)
2022                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2023                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2024                     h->pps.scaling_matrix8[i][x]) << shift;
2025         }
2026     }
2027 }
2028
2029 static void init_dequant4_coeff_table(H264Context *h){
2030     int i,j,q,x;
2031     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2032     for(i=0; i<6; i++ ){
2033         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2034         for(j=0; j<i; j++){
2035             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2036                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2037                 break;
2038             }
2039         }
2040         if(j<i)
2041             continue;
2042
2043         for(q=0; q<52; q++){
2044             int shift = div6[q] + 2;
2045             int idx = rem6[q];
2046             for(x=0; x<16; x++)
2047                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2048                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2049                     h->pps.scaling_matrix4[i][x]) << shift;
2050         }
2051     }
2052 }
2053
2054 static void init_dequant_tables(H264Context *h){
2055     int i,x;
2056     init_dequant4_coeff_table(h);
2057     if(h->pps.transform_8x8_mode)
2058         init_dequant8_coeff_table(h);
2059     if(h->sps.transform_bypass){
2060         for(i=0; i<6; i++)
2061             for(x=0; x<16; x++)
2062                 h->dequant4_coeff[i][0][x] = 1<<6;
2063         if(h->pps.transform_8x8_mode)
2064             for(i=0; i<2; i++)
2065                 for(x=0; x<64; x++)
2066                     h->dequant8_coeff[i][0][x] = 1<<6;
2067     }
2068 }
2069
2070
2071 /**
2072  * allocates tables.
2073  * needs width/height
2074  */
2075 static int alloc_tables(H264Context *h){
2076     MpegEncContext * const s = &h->s;
2077     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2078     int x,y;
2079
2080     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t), fail)
2081
2082     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t), fail)
2083     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
2084     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
2085
2086     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
2087     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
2088     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
2089     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
2090
2091     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2092     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2093
2094     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy  , big_mb_num * sizeof(uint32_t), fail);
2095     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
2096     for(y=0; y<s->mb_height; y++){
2097         for(x=0; x<s->mb_width; x++){
2098             const int mb_xy= x + y*s->mb_stride;
2099             const int b_xy = 4*x + 4*y*h->b_stride;
2100             const int b8_xy= 2*x + 2*y*h->b8_stride;
2101
2102             h->mb2b_xy [mb_xy]= b_xy;
2103             h->mb2b8_xy[mb_xy]= b8_xy;
2104         }
2105     }
2106
2107     s->obmc_scratchpad = NULL;
2108
2109     if(!h->dequant4_coeff[0])
2110         init_dequant_tables(h);
2111
2112     return 0;
2113 fail:
2114     free_tables(h);
2115     return -1;
2116 }
2117
2118 /**
2119  * Mimic alloc_tables(), but for every context thread.
2120  */
2121 static void clone_tables(H264Context *dst, H264Context *src){
2122     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2123     dst->non_zero_count           = src->non_zero_count;
2124     dst->slice_table              = src->slice_table;
2125     dst->cbp_table                = src->cbp_table;
2126     dst->mb2b_xy                  = src->mb2b_xy;
2127     dst->mb2b8_xy                 = src->mb2b8_xy;
2128     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2129     dst->mvd_table[0]             = src->mvd_table[0];
2130     dst->mvd_table[1]             = src->mvd_table[1];
2131     dst->direct_table             = src->direct_table;
2132
2133     dst->s.obmc_scratchpad = NULL;
2134     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2135 }
2136
2137 /**
2138  * Init context
2139  * Allocate buffers which are not shared amongst multiple threads.
2140  */
2141 static int context_init(H264Context *h){
2142     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2143     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2144
2145     return 0;
2146 fail:
2147     return -1; // free_tables will clean up for us
2148 }
2149
2150 static av_cold void common_init(H264Context *h){
2151     MpegEncContext * const s = &h->s;
2152
2153     s->width = s->avctx->width;
2154     s->height = s->avctx->height;
2155     s->codec_id= s->avctx->codec->id;
2156
2157     ff_h264_pred_init(&h->hpc, s->codec_id);
2158
2159     h->dequant_coeff_pps= -1;
2160     s->unrestricted_mv=1;
2161     s->decode=1; //FIXME
2162
2163     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2164
2165     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2166     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2167 }
2168
2169 /**
2170  * Reset SEI values at the beginning of the frame.
2171  *
2172  * @param h H.264 context.
2173  */
2174 static void reset_sei(H264Context *h) {
2175     h->sei_recovery_frame_cnt       = -1;
2176     h->sei_dpb_output_delay         =  0;
2177     h->sei_cpb_removal_delay        = -1;
2178     h->sei_buffering_period_present =  0;
2179 }
2180
2181 static av_cold int decode_init(AVCodecContext *avctx){
2182     H264Context *h= avctx->priv_data;
2183     MpegEncContext * const s = &h->s;
2184
2185     MPV_decode_defaults(s);
2186
2187     s->avctx = avctx;
2188     common_init(h);
2189
2190     s->out_format = FMT_H264;
2191     s->workaround_bugs= avctx->workaround_bugs;
2192
2193     // set defaults
2194 //    s->decode_mb= ff_h263_decode_mb;
2195     s->quarter_sample = 1;
2196     if(!avctx->has_b_frames)
2197     s->low_delay= 1;
2198
2199     avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201     avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
2202
2203     decode_init_vlc();
2204
2205     if(avctx->extradata_size > 0 && avctx->extradata &&
2206        *(char *)avctx->extradata == 1){
2207         h->is_avc = 1;
2208         h->got_avcC = 0;
2209     } else {
2210         h->is_avc = 0;
2211     }
2212
2213     h->thread_context[0] = h;
2214     h->outputed_poc = INT_MIN;
2215     h->prev_poc_msb= 1<<16;
2216     reset_sei(h);
2217     if(avctx->codec_id == CODEC_ID_H264){
2218         if(avctx->ticks_per_frame == 1){
2219             s->avctx->time_base.den *=2;
2220         }
2221         avctx->ticks_per_frame = 2;
2222     }
2223     return 0;
2224 }
2225
2226 static int frame_start(H264Context *h){
2227     MpegEncContext * const s = &h->s;
2228     int i;
2229
2230     if(MPV_frame_start(s, s->avctx) < 0)
2231         return -1;
2232     ff_er_frame_start(s);
2233     /*
2234      * MPV_frame_start uses pict_type to derive key_frame.
2235      * This is incorrect for H.264; IDR markings must be used.
2236      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2237      * See decode_nal_units().
2238      */
2239     s->current_picture_ptr->key_frame= 0;
2240     s->current_picture_ptr->mmco_reset= 0;
2241
2242     assert(s->linesize && s->uvlinesize);
2243
2244     for(i=0; i<16; i++){
2245         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2246         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2247     }
2248     for(i=0; i<4; i++){
2249         h->block_offset[16+i]=
2250         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251         h->block_offset[24+16+i]=
2252         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2253     }
2254
2255     /* can't be in alloc_tables because linesize isn't known there.
2256      * FIXME: redo bipred weight to not require extra buffer? */
2257     for(i = 0; i < s->avctx->thread_count; i++)
2258         if(!h->thread_context[i]->s.obmc_scratchpad)
2259             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2260
2261     /* some macroblocks will be accessed before they're available */
2262     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2263         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2264
2265 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2266
2267     // We mark the current picture as non-reference after allocating it, so
2268     // that if we break out due to an error it can be released automatically
2269     // in the next MPV_frame_start().
2270     // SVQ3 as well as most other codecs have only last/next/current and thus
2271     // get released even with set reference, besides SVQ3 and others do not
2272     // mark frames as reference later "naturally".
2273     if(s->codec_id != CODEC_ID_SVQ3)
2274         s->current_picture_ptr->reference= 0;
2275
2276     s->current_picture_ptr->field_poc[0]=
2277     s->current_picture_ptr->field_poc[1]= INT_MAX;
2278     assert(s->current_picture_ptr->long_ref==0);
2279
2280     return 0;
2281 }
2282
2283 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2284     MpegEncContext * const s = &h->s;
2285     int i;
2286     int step    = 1;
2287     int offset  = 1;
2288     int uvoffset= 1;
2289     int top_idx = 1;
2290     int skiplast= 0;
2291
2292     src_y  -=   linesize;
2293     src_cb -= uvlinesize;
2294     src_cr -= uvlinesize;
2295
2296     if(!simple && FRAME_MBAFF){
2297         if(s->mb_y&1){
2298             offset  = MB_MBAFF ? 1 : 17;
2299             uvoffset= MB_MBAFF ? 1 : 9;
2300             if(!MB_MBAFF){
2301                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2302                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2303                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2304                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2305                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2306                 }
2307             }
2308         }else{
2309             if(!MB_MBAFF){
2310                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2311                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2312                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2313                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2314                 }
2315                 skiplast= 1;
2316             }
2317             offset  =
2318             uvoffset=
2319             top_idx = MB_MBAFF ? 0 : 1;
2320         }
2321         step= MB_MBAFF ? 2 : 1;
2322     }
2323
2324     // There are two lines saved, the line above the the top macroblock of a pair,
2325     // and the line above the bottom macroblock
2326     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2327     for(i=1; i<17 - skiplast; i++){
2328         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2329     }
2330
2331     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2332     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2333
2334     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2335         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2336         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2337         for(i=1; i<9 - skiplast; i++){
2338             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2339             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2340         }
2341         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2342         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2343     }
2344 }
2345
2346 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2347     MpegEncContext * const s = &h->s;
2348     int temp8, i;
2349     uint64_t temp64;
2350     int deblock_left;
2351     int deblock_top;
2352     int mb_xy;
2353     int step    = 1;
2354     int offset  = 1;
2355     int uvoffset= 1;
2356     int top_idx = 1;
2357
2358     if(!simple && FRAME_MBAFF){
2359         if(s->mb_y&1){
2360             offset  = MB_MBAFF ? 1 : 17;
2361             uvoffset= MB_MBAFF ? 1 : 9;
2362         }else{
2363             offset  =
2364             uvoffset=
2365             top_idx = MB_MBAFF ? 0 : 1;
2366         }
2367         step= MB_MBAFF ? 2 : 1;
2368     }
2369
2370     if(h->deblocking_filter == 2) {
2371         mb_xy = h->mb_xy;
2372         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2373         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2374     } else {
2375         deblock_left = (s->mb_x > 0);
2376         deblock_top =  (s->mb_y > !!MB_FIELD);
2377     }
2378
2379     src_y  -=   linesize + 1;
2380     src_cb -= uvlinesize + 1;
2381     src_cr -= uvlinesize + 1;
2382
2383 #define XCHG(a,b,t,xchg)\
2384 t= a;\
2385 if(xchg)\
2386     a= b;\
2387 b= t;
2388
2389     if(deblock_left){
2390         for(i = !deblock_top; i<16; i++){
2391             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2392         }
2393         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2394     }
2395
2396     if(deblock_top){
2397         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2398         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2399         if(s->mb_x+1 < s->mb_width){
2400             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2401         }
2402     }
2403
2404     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2405         if(deblock_left){
2406             for(i = !deblock_top; i<8; i++){
2407                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2408                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2409             }
2410             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2411             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2412         }
2413         if(deblock_top){
2414             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2415             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2416         }
2417     }
2418 }
2419
/**
 * Reconstruct the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture.
 *
 * Steps, in order: pick destination pointers and line strides (doubled for
 * field macroblocks), then either copy raw PCM samples or run intra
 * prediction / motion compensation and add the residual, clear the
 * coefficient buffer h->mb, and finally - if deblocking is enabled - back up
 * the macroblock borders and run the loop filter.
 *
 * @param h      H.264 context
 * @param simple non-zero selects the fast variant: every test written as
 *               "!simple && ..." (field/MBAFF handling, intra PCM, transform
 *               bypass) is false, and the gray-flag and SVQ3 checks are
 *               short-circuited so the macroblock is handled as H.264 with
 *               chroma. Callers pass a literal so the branches can be
 *               resolved when this always-inline function is expanded.
 */
2420 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2421     MpegEncContext * const s = &h->s;
2422     const int mb_x= s->mb_x;
2423     const int mb_y= s->mb_y;
2424     const int mb_xy= h->mb_xy;
2425     const int mb_type= s->current_picture.mb_type[mb_xy];
2426     uint8_t  *dest_y, *dest_cb, *dest_cr;
2427     int linesize, uvlinesize /*dct_offset*/;
2428     int i;
2429     int *block_offset = &h->block_offset[0];
2430     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2431     /* is_h264 should always be true if SVQ3 is disabled. */
2432     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2433     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2434     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2435
         /* top-left sample of this macroblock: 16x16 luma, 8x8 per chroma plane */
2436     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2437     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2438     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2439
2440     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2441     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2442
2443     if (!simple && MB_FIELD) {
             /* field macroblock: twice the line stride, and the second set of
              * block offsets (index 24 and up), which frame_start() computes
              * with the doubled stride */
2444         linesize   = h->mb_linesize   = s->linesize * 2;
2445         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2446         block_offset = &h->block_offset[24];
             /* odd macroblock row: move back 15 luma / 7 chroma lines
              * NOTE(review): presumably so that the bottom macroblock of a
              * pair starts on the second line of the pair - confirm */
2447         if(mb_y&1){ //FIXME move out of this function?
2448             dest_y -= s->linesize*15;
2449             dest_cb-= s->uvlinesize*7;
2450             dest_cr-= s->uvlinesize*7;
2451         }
2452         if(FRAME_MBAFF) {
2453             int list;
                 /* rewrite the cached reference indices of every used list to
                  * (16+ref)^(mb_y&1); in the non-16x16 case only non-negative
                  * entries are touched */
2454             for(list=0; list<h->list_count; list++){
2455                 if(!USES_LIST(mb_type, list))
2456                     continue;
2457                 if(IS_16X16(mb_type)){
2458                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2459                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2460                 }else{
2461                     for(i=0; i<16; i+=4){
2462                         int ref = h->ref_cache[list][scan8[i]];
2463                         if(ref >= 0)
2464                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2465                     }
2466                 }
2467             }
2468         }
2469     } else {
2470         linesize   = h->mb_linesize   = s->linesize;
2471         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2472 //        dct_offset = s->linesize * 16;
2473     }
2474
2475     if (!simple && IS_INTRA_PCM(mb_type)) {
             /* PCM macroblock: the samples are copied row by row from h->mb,
              * 16 bytes per luma row and 8 bytes per chroma row */
2476         for (i=0; i<16; i++) {
2477             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2478         }
2479         for (i=0; i<8; i++) {
2480             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2481             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2482         }
2483     } else {
2484         if(IS_INTRA(mb_type)){
                 /* with deblocking enabled, exchange the neighbouring samples
                  * with the saved border copies for the duration of the intra
                  * prediction; the matching call with xchg=0 follows below */
2485             if(h->deblocking_filter)
2486                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2487
2488             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2489                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2490                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2491             }
2492
2493             if(IS_INTRA4x4(mb_type)){
2494                 if(simple || !s->encoding){
2495                     if(IS_8x8DCT(mb_type)){
                             /* four 8x8 blocks: predict, then add the residual */
2496                         if(transform_bypass){
2497                             idct_dc_add =
2498                             idct_add    = s->dsp.add_pixels8;
2499                         }else{
2500                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2501                             idct_add    = s->dsp.h264_idct8_add;
2502                         }
2503                         for(i=0; i<16; i+=4){
2504                             uint8_t * const ptr= dest_y + block_offset[i];
2505                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2506                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2507                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2508                             }else{
2509                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2510                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2511                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2512                                 if(nnz){
                                         /* a single coefficient that is the first
                                          * one takes the DC-only routine */
2513                                     if(nnz == 1 && h->mb[i*16])
2514                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2515                                     else
2516                                         idct_add   (ptr, h->mb + i*16, linesize);
2517                                 }
2518                             }
2519                         }
2520                     }else{
                             /* sixteen 4x4 blocks: predict, then add the residual */
2521                         if(transform_bypass){
2522                             idct_dc_add =
2523                             idct_add    = s->dsp.add_pixels4;
2524                         }else{
2525                             idct_dc_add = s->dsp.h264_idct_dc_add;
2526                             idct_add    = s->dsp.h264_idct_add;
2527                         }
2528                         for(i=0; i<16; i++){
2529                             uint8_t * const ptr= dest_y + block_offset[i];
2530                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2531
2532                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2533                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2534                             }else{
2535                                 uint8_t *topright;
2536                                 int nnz, tr;
                                     /* only these two modes get a top-right pointer;
                                      * when those samples are unavailable, the last
                                      * sample of the row above is replicated four
                                      * times into tr instead */
2537                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2538                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2539                                     assert(mb_y || linesize <= block_offset[i]);
2540                                     if(!topright_avail){
2541                                         tr= ptr[3 - linesize]*0x01010101;
2542                                         topright= (uint8_t*) &tr;
2543                                     }else
2544                                         topright= ptr + 4 - linesize;
2545                                 }else
2546                                     topright= NULL;
2547
2548                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2549                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2550                                 if(nnz){
2551                                     if(is_h264){
2552                                         if(nnz == 1 && h->mb[i*16])
2553                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2554                                         else
2555                                             idct_add   (ptr, h->mb + i*16, linesize);
2556                                     }else
2557                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2558                                 }
2559                             }
2560                         }
2561                     }
2562                 }
2563             }else{
                     /* other intra macroblocks: one 16x16 prediction, then the
                      * luma DC coefficients are dequantized and transformed;
                      * the per-block residual is added further down */
2564                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2565                 if(is_h264){
2566                     if(!transform_bypass)
2567                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2568                 }else
2569                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2570             }
2571             if(h->deblocking_filter)
2572                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2573         }else if(is_h264){
                 /* inter macroblock: motion compensation */
2574             hl_motion(h, dest_y, dest_cb, dest_cr,
2575                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2576                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2577                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2578         }
2579
2580
             /* luma residual for everything except intra 4x4, whose residual
              * was already added block by block above */
2581         if(!IS_INTRA4x4(mb_type)){
2582             if(is_h264){
2583                 if(IS_INTRA16x16(mb_type)){
2584                     if(transform_bypass){
2585                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2586                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2587                         }else{
2588                             for(i=0; i<16; i++){
2589                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2590                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2591                             }
2592                         }
2593                     }else{
2594                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2595                     }
2596                 }else if(h->cbp&15){
                         /* the low four cbp bits being non-zero gates the luma
                          * residual here */
2597                     if(transform_bypass){
2598                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2599                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2600                         for(i=0; i<16; i+=di){
2601                             if(h->non_zero_count_cache[ scan8[i] ]){
2602                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2603                             }
2604                         }
2605                     }else{
2606                         if(IS_8x8DCT(mb_type)){
2607                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2608                         }else{
2609                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2610                         }
2611                     }
2612                 }
2613             }else{
2614                 for(i=0; i<16; i++){
2615                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2616                         uint8_t * const ptr= dest_y + block_offset[i];
2617                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2618                     }
2619                 }
2620             }
2621         }
2622
             /* chroma residual: blocks 16..19 go to dest[0] (Cb) and 20..23 to
              * dest[1] (Cr), selected through (i&4)>>2 */
2623         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2624             uint8_t *dest[2] = {dest_cb, dest_cr};
2625             if(transform_bypass){
2626                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2627                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2628                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2629                 }else{
2630                     idct_add = s->dsp.add_pixels4;
2631                     for(i=16; i<16+8; i++){
2632                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2633                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2634                     }
2635                 }
2636             }else{
                     /* dequant table 1/2 is used for intra and 4/5 for other
                      * macroblocks, for the two chroma planes respectively */
2637                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2638                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2639                 if(is_h264){
2640                     idct_add = s->dsp.h264_idct_add;
2641                     idct_dc_add = s->dsp.h264_idct_dc_add;
2642                     for(i=16; i<16+8; i++){
2643                         if(h->non_zero_count_cache[ scan8[i] ])
2644                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2645                         else if(h->mb[i*16])
2646                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2647                     }
2648                 }else{
2649                     for(i=16; i<16+8; i++){
2650                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2651                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2652                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2653                         }
2654                     }
2655                 }
2656             }
2657         }
2658     }
         /* the coefficient buffer is cleared when cbp is non-zero or the
          * macroblock is intra
          * NOTE(review): presumably it is still all zero in the remaining
          * case - confirm */
2659     if(h->cbp || IS_INTRA(mb_type))
2660         s->dsp.clear_blocks(h->mb);
2661
2662     if(h->deblocking_filter) {
             /* save the border samples before they are filtered, refill the
              * caches for the loop filter, and recompute both chroma QPs from
              * this macroblock's entry in the qscale table */
2663         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2664         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2665         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2666         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2667         if (!simple && FRAME_MBAFF) {
2668             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2669         } else {
2670             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2671         }
2672     }
2673 }
2674
2675 /**
2676  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2677  */
2678 static void hl_decode_mb_simple(H264Context *h){
2679     hl_decode_mb_internal(h, 1);
2680 }
2681
2682 /**
2683  * Process a macroblock; this handles edge cases, such as interlacing.
2684  */
2685 static void av_noinline hl_decode_mb_complex(H264Context *h){
2686     hl_decode_mb_internal(h, 0);
2687 }
2688
2689 static void hl_decode_mb(H264Context *h){
2690     MpegEncContext * const s = &h->s;
2691     const int mb_xy= h->mb_xy;
2692     const int mb_type= s->current_picture.mb_type[mb_xy];
2693     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2694
2695     if (is_complex)
2696         hl_decode_mb_complex(h);
2697     else hl_decode_mb_simple(h);
2698 }
2699
2700 static void pic_as_field(Picture *pic, const int parity){
2701     int i;
2702     for (i = 0; i < 4; ++i) {
2703         if (parity == PICT_BOTTOM_FIELD)
2704             pic->data[i] += pic->linesize[i];
2705         pic->reference = parity;
2706         pic->linesize[i] *= 2;
2707     }
2708     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2709 }
2710
2711 static int split_field_copy(Picture *dest, Picture *src,
2712                             int parity, int id_add){
2713     int match = !!(src->reference & parity);
2714
2715     if (match) {
2716         *dest = *src;
2717         if(parity != PICT_FRAME){
2718             pic_as_field(dest, parity);
2719             dest->pic_id *= 2;
2720             dest->pic_id += id_add;
2721         }
2722     }
2723
2724     return match;
2725 }
2726
2727 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2728     int i[2]={0};
2729     int index=0;
2730
2731     while(i[0]<len || i[1]<len){
2732         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2733             i[0]++;
2734         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2735             i[1]++;
2736         if(i[0] < len){
2737             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2738             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2739         }
2740         if(i[1] < len){
2741             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2742             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2743         }
2744     }
2745
2746     return index;
2747 }
2748
2749 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2750     int i, best_poc;
2751     int out_i= 0;
2752
2753     for(;;){
2754         best_poc= dir ? INT_MIN : INT_MAX;
2755
2756         for(i=0; i<len; i++){
2757             const int poc= src[i]->poc;
2758             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2759                 best_poc= poc;
2760                 sorted[out_i]= src[i];
2761             }
2762         }
2763         if(best_poc == (dir ? INT_MIN : INT_MAX))
2764             break;
2765         limit= sorted[out_i++]->poc - dir;
2766     }
2767     return out_i;
2768 }
2769
2770 /**
2771  * fills the default_ref_list.
2772  */
2773 static int fill_default_ref_list(H264Context *h){
2774     MpegEncContext * const s = &h->s;
2775     int i, len;
2776
2777     if(h->slice_type_nos==FF_B_TYPE){
2778         Picture *sorted[32];
2779         int cur_poc, list;
2780         int lens[2];
2781
2782         if(FIELD_PICTURE)
2783             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2784         else
2785             cur_poc= s->current_picture_ptr->poc;
2786
2787         for(list= 0; list<2; list++){
2788             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2789             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2790             assert(len<=32);
2791             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2792             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2793             assert(len<=32);
2794
2795             if(len < h->ref_count[list])
2796                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2797             lens[list]= len;
2798         }
2799
2800         if(lens[0] == lens[1] && lens[1] > 1){
2801             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2802             if(i == lens[0])
2803                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2804         }
2805     }else{
2806         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2807         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2808         assert(len <= 32);
2809         if(len < h->ref_count[0])
2810             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2811     }
2812 #ifdef TRACE
2813     for (i=0; i<h->ref_count[0]; i++) {
2814         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2815     }
2816     if(h->slice_type_nos==FF_B_TYPE){
2817         for (i=0; i<h->ref_count[1]; i++) {
2818             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2819         }
2820     }
2821 #endif
2822     return 0;
2823 }
2824
2825 static void print_short_term(H264Context *h);
2826 static void print_long_term(H264Context *h);
2827
2828 /**
2829  * Extract structure information about the picture described by pic_num in
2830  * the current decoding context (frame or field). Note that pic_num is
2831  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2832  * @param pic_num picture number for which to extract structure information
2833  * @param structure one of PICT_XXX describing structure of picture
2834  *                      with pic_num
2835  * @return frame number (short term) or long term index of picture
2836  *         described by pic_num
2837  */
2838 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2839     MpegEncContext * const s = &h->s;
2840
2841     *structure = s->picture_structure;
2842     if(FIELD_PICTURE){
2843         if (!(pic_num & 1))
2844             /* opposite field */
2845             *structure ^= PICT_FRAME;
2846         pic_num >>= 1;
2847     }
2848
2849     return pic_num;
2850 }
2851
2852 static int decode_ref_pic_list_reordering(H264Context *h){
2853     MpegEncContext * const s = &h->s;
2854     int list, index, pic_structure;
2855
2856     print_short_term(h);
2857     print_long_term(h);
2858
2859     for(list=0; list<h->list_count; list++){
2860         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2861
2862         if(get_bits1(&s->gb)){
2863             int pred= h->curr_pic_num;
2864
2865             for(index=0; ; index++){
2866                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2867                 unsigned int pic_id;
2868                 int i;
2869                 Picture *ref = NULL;
2870
2871                 if(reordering_of_pic_nums_idc==3)
2872                     break;
2873
2874                 if(index >= h->ref_count[list]){
2875                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2876                     return -1;
2877                 }
2878
2879                 if(reordering_of_pic_nums_idc<3){
2880                     if(reordering_of_pic_nums_idc<2){
2881                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2882                         int frame_num;
2883
2884                         if(abs_diff_pic_num > h->max_pic_num){
2885                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2886                             return -1;
2887                         }
2888
2889                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2890                         else                                pred+= abs_diff_pic_num;
2891                         pred &= h->max_pic_num - 1;
2892
2893                         frame_num = pic_num_extract(h, pred, &pic_structure);
2894
2895                         for(i= h->short_ref_count-1; i>=0; i--){
2896                             ref = h->short_ref[i];
2897                             assert(ref->reference);
2898                             assert(!ref->long_ref);
2899                             if(
2900                                    ref->frame_num == frame_num &&
2901                                    (ref->reference & pic_structure)
2902                               )
2903                                 break;
2904                         }
2905                         if(i>=0)
2906                             ref->pic_id= pred;
2907                     }else{
2908                         int long_idx;
2909                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2910
2911                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2912
2913                         if(long_idx>31){
2914                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2915                             return -1;
2916                         }
2917                         ref = h->long_ref[long_idx];
2918                         assert(!(ref && !ref->reference));
2919                         if(ref && (ref->reference & pic_structure)){
2920                             ref->pic_id= pic_id;
2921                             assert(ref->long_ref);
2922                             i=0;
2923                         }else{
2924                             i=-1;
2925                         }
2926                     }
2927
2928                     if (i < 0) {
2929                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2930                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2931                     } else {
2932                         for(i=index; i+1<h->ref_count[list]; i++){
2933                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2934                                 break;
2935                         }
2936                         for(; i > index; i--){
2937                             h->ref_list[list][i]= h->ref_list[list][i-1];
2938                         }
2939                         h->ref_list[list][index]= *ref;
2940                         if (FIELD_PICTURE){
2941                             pic_as_field(&h->ref_list[list][index], pic_structure);
2942                         }
2943                     }
2944                 }else{
2945                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2946                     return -1;
2947                 }
2948             }
2949         }
2950     }
2951     for(list=0; list<h->list_count; list++){
2952         for(index= 0; index < h->ref_count[list]; index++){
2953             if(!h->ref_list[list][index].data[0]){
2954                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2955                 if(h->default_ref_list[list][0].data[0])
2956                     h->ref_list[list][index]= h->default_ref_list[list][0];
2957                 else
2958                     return -1;
2959             }
2960         }
2961     }
2962
2963     return 0;
2964 }
2965
2966 static void fill_mbaff_ref_list(H264Context *h){
2967     int list, i, j;
2968     for(list=0; list<2; list++){ //FIXME try list_count
2969         for(i=0; i<h->ref_count[list]; i++){
2970             Picture *frame = &h->ref_list[list][i];
2971             Picture *field = &h->ref_list[list][16+2*i];
2972             field[0] = *frame;
2973             for(j=0; j<3; j++)
2974                 field[0].linesize[j] <<= 1;
2975             field[0].reference = PICT_TOP_FIELD;
2976             field[0].poc= field[0].field_poc[0];
2977             field[1] = field[0];
2978             for(j=0; j<3; j++)
2979                 field[1].data[j] += frame->linesize[j];
2980             field[1].reference = PICT_BOTTOM_FIELD;
2981             field[1].poc= field[1].field_poc[1];
2982
2983             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2984             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2985             for(j=0; j<2; j++){
2986                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2987                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2988             }
2989         }
2990     }
2991     for(j=0; j<h->ref_count[1]; j++){
2992         for(i=0; i<h->ref_count[0]; i++)
2993             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2994         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2995         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2996     }
2997 }
2998
2999 static int pred_weight_table(H264Context *h){
3000     MpegEncContext * const s = &h->s;
3001     int list, i;
3002     int luma_def, chroma_def;
3003
3004     h->use_weight= 0;
3005     h->use_weight_chroma= 0;
3006     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3007     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3008     luma_def = 1<<h->luma_log2_weight_denom;
3009     chroma_def = 1<<h->chroma_log2_weight_denom;
3010
3011     for(list=0; list<2; list++){
3012         h->luma_weight_flag[list]   = 0;
3013         h->chroma_weight_flag[list] = 0;
3014         for(i=0; i<h->ref_count[list]; i++){
3015             int luma_weight_flag, chroma_weight_flag;
3016
3017             luma_weight_flag= get_bits1(&s->gb);
3018             if(luma_weight_flag){
3019                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3020                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3021                 if(   h->luma_weight[list][i] != luma_def
3022                    || h->luma_offset[list][i] != 0) {
3023                     h->use_weight= 1;
3024                     h->luma_weight_flag[list]= 1;
3025                 }
3026             }else{
3027                 h->luma_weight[list][i]= luma_def;
3028                 h->luma_offset[list][i]= 0;
3029             }
3030
3031             if(CHROMA){
3032                 chroma_weight_flag= get_bits1(&s->gb);
3033                 if(chroma_weight_flag){
3034                     int j;
3035                     for(j=0; j<2; j++){
3036                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3037                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3038                         if(   h->chroma_weight[list][i][j] != chroma_def
3039                            || h->chroma_offset[list][i][j] != 0) {
3040                             h->use_weight_chroma= 1;
3041                             h->chroma_weight_flag[list]= 1;
3042                         }
3043                     }
3044                 }else{
3045                     int j;
3046                     for(j=0; j<2; j++){
3047                         h->chroma_weight[list][i][j]= chroma_def;
3048                         h->chroma_offset[list][i][j]= 0;
3049                     }
3050                 }
3051             }
3052         }
3053         if(h->slice_type_nos != FF_B_TYPE) break;
3054     }
3055     h->use_weight= h->use_weight || h->use_weight_chroma;
3056     return 0;
3057 }
3058
3059 static void implicit_weight_table(H264Context *h){
3060     MpegEncContext * const s = &h->s;
3061     int ref0, ref1, i;
3062     int cur_poc = s->current_picture_ptr->poc;
3063
3064     for (i = 0; i < 2; i++) {
3065         h->luma_weight_flag[i]   = 0;
3066         h->chroma_weight_flag[i] = 0;
3067     }
3068
3069     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3070        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3071         h->use_weight= 0;
3072         h->use_weight_chroma= 0;
3073         return;
3074     }
3075
3076     h->use_weight= 2;
3077     h->use_weight_chroma= 2;
3078     h->luma_log2_weight_denom= 5;
3079     h->chroma_log2_weight_denom= 5;
3080
3081     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3082         int poc0 = h->ref_list[0][ref0].poc;
3083         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3084             int poc1 = h->ref_list[1][ref1].poc;
3085             int td = av_clip(poc1 - poc0, -128, 127);
3086             if(td){
3087                 int tb = av_clip(cur_poc - poc0, -128, 127);
3088                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3089                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3090                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3091                     h->implicit_weight[ref0][ref1] = 32;
3092                 else
3093                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3094             }else
3095                 h->implicit_weight[ref0][ref1] = 32;
3096         }
3097     }
3098 }
3099
3100 /**
3101  * Mark a picture as no longer needed for reference. The refmask
3102  * argument allows unreferencing of individual fields or the whole frame.
3103  * If the picture becomes entirely unreferenced, but is being held for
3104  * display purposes, it is marked as such.
3105  * @param refmask mask of fields to unreference; the mask is bitwise
3106  *                anded with the reference marking of pic
3107  * @return non-zero if pic becomes entirely unreferenced (except possibly
3108  *         for display purposes) zero if one of the fields remains in
3109  *         reference
3110  */
3111 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3112     int i;
3113     if (pic->reference &= refmask) {
3114         return 0;
3115     } else {
3116         for(i = 0; h->delayed_pic[i]; i++)
3117             if(pic == h->delayed_pic[i]){
3118                 pic->reference=DELAYED_PIC_REF;
3119                 break;
3120             }
3121         return 1;
3122     }
3123 }
3124
3125 /**
3126  * instantaneous decoder refresh.
3127  */
3128 static void idr(H264Context *h){
3129     int i;
3130
3131     for(i=0; i<16; i++){
3132         remove_long(h, i, 0);
3133     }
3134     assert(h->long_ref_count==0);
3135
3136     for(i=0; i<h->short_ref_count; i++){
3137         unreference_pic(h, h->short_ref[i], 0);
3138         h->short_ref[i]= NULL;
3139     }
3140     h->short_ref_count=0;
3141     h->prev_frame_num= 0;
3142     h->prev_frame_num_offset= 0;
3143     h->prev_poc_msb=
3144     h->prev_poc_lsb= 0;
3145 }
3146
3147 /* forget old pics after a seek */
3148 static void flush_dpb(AVCodecContext *avctx){
3149     H264Context *h= avctx->priv_data;
3150     int i;
3151     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3152         if(h->delayed_pic[i])
3153             h->delayed_pic[i]->reference= 0;
3154         h->delayed_pic[i]= NULL;
3155     }
3156     h->outputed_poc= INT_MIN;
3157     h->prev_interlaced_frame = 1;
3158     idr(h);
3159     if(h->s.current_picture_ptr)
3160         h->s.current_picture_ptr->reference= 0;
3161     h->s.first_field= 0;
3162     reset_sei(h);
3163     ff_mpeg_flush(avctx);
3164 }
3165
3166 /**
3167  * Find a Picture in the short term reference list by frame number.
3168  * @param frame_num frame number to search for
3169  * @param idx the index into h->short_ref where returned picture is found
3170  *            undefined if no picture found.
3171  * @return pointer to the found picture, or NULL if no pic with the provided
3172  *                 frame number is found
3173  */
3174 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3175     MpegEncContext * const s = &h->s;
3176     int i;
3177
3178     for(i=0; i<h->short_ref_count; i++){
3179         Picture *pic= h->short_ref[i];
3180         if(s->avctx->debug&FF_DEBUG_MMCO)
3181             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3182         if(pic->frame_num == frame_num) {
3183             *idx = i;
3184             return pic;
3185         }
3186     }
3187     return NULL;
3188 }
3189
3190 /**
3191  * Remove a picture from the short term reference list by its index in
3192  * that list.  This does no checking on the provided index; it is assumed
3193  * to be valid. Other list entries are shifted down.
3194  * @param i index into h->short_ref of picture to remove.
3195  */
3196 static void remove_short_at_index(H264Context *h, int i){
3197     assert(i >= 0 && i < h->short_ref_count);
3198     h->short_ref[i]= NULL;
3199     if (--h->short_ref_count)
3200         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3201 }
3202
3203 /**
3204  *
3205  * @return the removed picture or NULL if an error occurs
3206  */
3207 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3208     MpegEncContext * const s = &h->s;
3209     Picture *pic;
3210     int i;
3211
3212     if(s->avctx->debug&FF_DEBUG_MMCO)
3213         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3214
3215     pic = find_short(h, frame_num, &i);
3216     if (pic){
3217         if(unreference_pic(h, pic, ref_mask))
3218         remove_short_at_index(h, i);
3219     }
3220
3221     return pic;
3222 }
3223
3224 /**
3225  * Remove a picture from the long term reference list by its index in
3226  * that list.
3227  * @return the removed picture or NULL if an error occurs
3228  */
3229 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3230     Picture *pic;
3231
3232     pic= h->long_ref[i];
3233     if (pic){
3234         if(unreference_pic(h, pic, ref_mask)){
3235             assert(h->long_ref[i]->long_ref == 1);
3236             h->long_ref[i]->long_ref= 0;
3237             h->long_ref[i]= NULL;
3238             h->long_ref_count--;
3239         }
3240     }
3241
3242     return pic;
3243 }
3244
3245 /**
3246  * print short term list
3247  */
3248 static void print_short_term(H264Context *h) {
3249     uint32_t i;
3250     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3251         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3252         for(i=0; i<h->short_ref_count; i++){
3253             Picture *pic= h->short_ref[i];
3254             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3255         }
3256     }
3257 }
3258
3259 /**
3260  * print long term list
3261  */
3262 static void print_long_term(H264Context *h) {
3263     uint32_t i;
3264     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3265         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3266         for(i = 0; i < 16; i++){
3267             Picture *pic= h->long_ref[i];
3268             if (pic) {
3269                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3270             }
3271         }
3272     }
3273 }
3274
3275 /**
3276  * Executes the reference picture marking (memory management control operations).
3277  */
3278 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3279     MpegEncContext * const s = &h->s;
3280     int i, av_uninit(j);
3281     int current_ref_assigned=0;
3282     Picture *av_uninit(pic);
3283
3284     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3285         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3286
3287     for(i=0; i<mmco_count; i++){
3288         int av_uninit(structure), av_uninit(frame_num);
3289         if(s->avctx->debug&FF_DEBUG_MMCO)
3290             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3291
3292         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3293            || mmco[i].opcode == MMCO_SHORT2LONG){
3294             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3295             pic = find_short(h, frame_num, &j);
3296             if(!pic){
3297                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3298                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3299                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3300                 continue;
3301             }
3302         }
3303
3304         switch(mmco[i].opcode){
3305         case MMCO_SHORT2UNUSED:
3306             if(s->avctx->debug&FF_DEBUG_MMCO)
3307                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3308             remove_short(h, frame_num, structure ^ PICT_FRAME);
3309             break;
3310         case MMCO_SHORT2LONG:
3311                 if (h->long_ref[mmco[i].long_arg] != pic)
3312                     remove_long(h, mmco[i].long_arg, 0);
3313
3314                 remove_short_at_index(h, j);
3315                 h->long_ref[ mmco[i].long_arg ]= pic;
3316                 if (h->long_ref[ mmco[i].long_arg ]){
3317                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3318                     h->long_ref_count++;
3319                 }
3320             break;
3321         case MMCO_LONG2UNUSED:
3322             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3323             pic = h->long_ref[j];
3324             if (pic) {
3325                 remove_long(h, j, structure ^ PICT_FRAME);
3326             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3327                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3328             break;
3329         case MMCO_LONG:
3330                     // Comment below left from previous code as it is an interresting note.
3331                     /* First field in pair is in short term list or
3332                      * at a different long term index.
3333                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3334                      * Report the problem and keep the pair where it is,
3335                      * and mark this field valid.
3336                      */
3337
3338             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3339                 remove_long(h, mmco[i].long_arg, 0);
3340
3341                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3342                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3343                 h->long_ref_count++;
3344             }
3345
3346             s->current_picture_ptr->reference |= s->picture_structure;
3347             current_ref_assigned=1;
3348             break;
3349         case MMCO_SET_MAX_LONG:
3350             assert(mmco[i].long_arg <= 16);
3351             // just remove the long term which index is greater than new max
3352             for(j = mmco[i].long_arg; j<16; j++){
3353                 remove_long(h, j, 0);
3354             }
3355             break;
3356         case MMCO_RESET:
3357             while(h->short_ref_count){
3358                 remove_short(h, h->short_ref[0]->frame_num, 0);
3359             }
3360             for(j = 0; j < 16; j++) {
3361                 remove_long(h, j, 0);
3362             }
3363             s->current_picture_ptr->poc=
3364             s->current_picture_ptr->field_poc[0]=
3365             s->current_picture_ptr->field_poc[1]=
3366             h->poc_lsb=
3367             h->poc_msb=
3368             h->frame_num=
3369             s->current_picture_ptr->frame_num= 0;
3370             s->current_picture_ptr->mmco_reset=1;
3371             break;
3372         default: assert(0);
3373         }
3374     }
3375
3376     if (!current_ref_assigned) {
3377         /* Second field of complementary field pair; the first field of
3378          * which is already referenced. If short referenced, it
3379          * should be first entry in short_ref. If not, it must exist
3380          * in long_ref; trying to put it on the short list here is an
3381          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3382          */
3383         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3384             /* Just mark the second field valid */
3385             s->current_picture_ptr->reference = PICT_FRAME;
3386         } else if (s->current_picture_ptr->long_ref) {
3387             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3388                                              "assignment for second field "
3389                                              "in complementary field pair "
3390                                              "(first field is long term)\n");
3391         } else {
3392             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3393             if(pic){
3394                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3395             }
3396
3397             if(h->short_ref_count)
3398                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3399
3400             h->short_ref[0]= s->current_picture_ptr;
3401             h->short_ref_count++;
3402             s->current_picture_ptr->reference |= s->picture_structure;
3403         }
3404     }
3405
3406     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3407
3408         /* We have too many reference frames, probably due to corrupted
3409          * stream. Need to discard one frame. Prevents overrun of the
3410          * short_ref and long_ref buffers.
3411          */
3412         av_log(h->s.avctx, AV_LOG_ERROR,
3413                "number of reference frames exceeds max (probably "
3414                "corrupt input), discarding one\n");
3415
3416         if (h->long_ref_count && !h->short_ref_count) {
3417             for (i = 0; i < 16; ++i)
3418                 if (h->long_ref[i])
3419                     break;
3420
3421             assert(i < 16);
3422             remove_long(h, i, 0);
3423         } else {
3424             pic = h->short_ref[h->short_ref_count - 1];
3425             remove_short(h, pic->frame_num, 0);
3426         }
3427     }
3428
3429     print_short_term(h);
3430     print_long_term(h);
3431     return 0;
3432 }
3433
3434 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3435     MpegEncContext * const s = &h->s;
3436     int i;
3437
3438     h->mmco_index= 0;
3439     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3440         s->broken_link= get_bits1(gb) -1;
3441         if(get_bits1(gb)){
3442             h->mmco[0].opcode= MMCO_LONG;
3443             h->mmco[0].long_arg= 0;
3444             h->mmco_index= 1;
3445         }
3446     }else{
3447         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3448             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3449                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3450
3451                 h->mmco[i].opcode= opcode;
3452                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3453                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3454 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3455                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3456                         return -1;
3457                     }*/
3458                 }
3459                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3460                     unsigned int long_arg= get_ue_golomb_31(gb);
3461                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3462                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3463                         return -1;
3464                     }
3465                     h->mmco[i].long_arg= long_arg;
3466                 }
3467
3468                 if(opcode > (unsigned)MMCO_LONG){
3469                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3470                     return -1;
3471                 }
3472                 if(opcode == MMCO_END)
3473                     break;
3474             }
3475             h->mmco_index= i;
3476         }else{
3477             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3478
3479             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3480                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3481                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3482                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3483                 h->mmco_index= 1;
3484                 if (FIELD_PICTURE) {
3485                     h->mmco[0].short_pic_num *= 2;
3486                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3487                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3488                     h->mmco_index= 2;
3489                 }
3490             }
3491         }
3492     }
3493
3494     return 0;
3495 }
3496
3497 static int init_poc(H264Context *h){
3498     MpegEncContext * const s = &h->s;
3499     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3500     int field_poc[2];
3501     Picture *cur = s->current_picture_ptr;
3502
3503     h->frame_num_offset= h->prev_frame_num_offset;
3504     if(h->frame_num < h->prev_frame_num)
3505         h->frame_num_offset += max_frame_num;
3506
3507     if(h->sps.poc_type==0){
3508         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3509
3510         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3511             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3512         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3513             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3514         else
3515             h->poc_msb = h->prev_poc_msb;
3516 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3517         field_poc[0] =
3518         field_poc[1] = h->poc_msb + h->poc_lsb;
3519         if(s->picture_structure == PICT_FRAME)
3520             field_poc[1] += h->delta_poc_bottom;
3521     }else if(h->sps.poc_type==1){
3522         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3523         int i;
3524
3525         if(h->sps.poc_cycle_length != 0)
3526             abs_frame_num = h->frame_num_offset + h->frame_num;
3527         else
3528             abs_frame_num = 0;
3529
3530         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3531             abs_frame_num--;
3532
3533         expected_delta_per_poc_cycle = 0;
3534         for(i=0; i < h->sps.poc_cycle_length; i++)
3535             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3536
3537         if(abs_frame_num > 0){
3538             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3539             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3540
3541             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3542             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3543                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3544         } else
3545             expectedpoc = 0;
3546
3547         if(h->nal_ref_idc == 0)
3548             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3549
3550         field_poc[0] = expectedpoc + h->delta_poc[0];
3551         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3552
3553         if(s->picture_structure == PICT_FRAME)
3554             field_poc[1] += h->delta_poc[1];
3555     }else{
3556         int poc= 2*(h->frame_num_offset + h->frame_num);
3557
3558         if(!h->nal_ref_idc)
3559             poc--;
3560
3561         field_poc[0]= poc;
3562         field_poc[1]= poc;
3563     }
3564
3565     if(s->picture_structure != PICT_BOTTOM_FIELD)
3566         s->current_picture_ptr->field_poc[0]= field_poc[0];
3567     if(s->picture_structure != PICT_TOP_FIELD)
3568         s->current_picture_ptr->field_poc[1]= field_poc[1];
3569     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3570
3571     return 0;
3572 }
3573
3574
3575 /**
3576  * initialize scan tables
3577  */
3578 static void init_scan_tables(H264Context *h){
3579     MpegEncContext * const s = &h->s;
3580     int i;
3581     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3582         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3583         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3584     }else{
3585         for(i=0; i<16; i++){
3586 #define T(x) (x>>2) | ((x<<2) & 0xF)
3587             h->zigzag_scan[i] = T(zigzag_scan[i]);
3588             h-> field_scan[i] = T( field_scan[i]);
3589 #undef T
3590         }
3591     }
3592     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3593         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3594         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3595         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3596         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3597     }else{
3598         for(i=0; i<64; i++){
3599 #define T(x) (x>>3) | ((x&7)<<3)
3600             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3601             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3602             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3603             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3604 #undef T
3605         }
3606     }
3607     if(h->sps.transform_bypass){ //FIXME same ugly
3608         h->zigzag_scan_q0          = zigzag_scan;
3609         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3610         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3611         h->field_scan_q0           = field_scan;
3612         h->field_scan8x8_q0        = field_scan8x8;
3613         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3614     }else{
3615         h->zigzag_scan_q0          = h->zigzag_scan;
3616         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3617         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3618         h->field_scan_q0           = h->field_scan;
3619         h->field_scan8x8_q0        = h->field_scan8x8;
3620         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3621     }
3622 }
3623
3624 static void field_end(H264Context *h){
3625     MpegEncContext * const s = &h->s;
3626     AVCodecContext * const avctx= s->avctx;
3627     s->mb_y= 0;
3628
3629     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3630     s->current_picture_ptr->pict_type= s->pict_type;
3631
3632     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3633         ff_vdpau_h264_set_reference_frames(s);
3634
3635     if(!s->dropable) {
3636         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3637         h->prev_poc_msb= h->poc_msb;
3638         h->prev_poc_lsb= h->poc_lsb;
3639     }
3640     h->prev_frame_num_offset= h->frame_num_offset;
3641     h->prev_frame_num= h->frame_num;
3642
3643     if (avctx->hwaccel) {
3644         if (avctx->hwaccel->end_frame(avctx) < 0)
3645             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3646     }
3647
3648     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3649         ff_vdpau_h264_picture_complete(s);
3650
3651     /*
3652      * FIXME: Error handling code does not seem to support interlaced
3653      * when slices span multiple rows
3654      * The ff_er_add_slice calls don't work right for bottom
3655      * fields; they cause massive erroneous error concealing
3656      * Error marking covers both fields (top and bottom).
3657      * This causes a mismatched s->error_count
3658      * and a bad error table. Further, the error count goes to
3659      * INT_MAX when called for bottom field, because mb_y is
3660      * past end by one (callers fault) and resync_mb_y != 0
3661      * causes problems for the first MB line, too.
3662      */
3663     if (!FIELD_PICTURE)
3664         ff_er_frame_end(s);
3665
3666     MPV_frame_end(s);
3667
3668     h->current_slice=0;
3669 }
3670
3671 /**
3672  * Replicates H264 "master" context to thread contexts.
3673  */
3674 static void clone_slice(H264Context *dst, H264Context *src)
3675 {
3676     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3677     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3678     dst->s.current_picture      = src->s.current_picture;
3679     dst->s.linesize             = src->s.linesize;
3680     dst->s.uvlinesize           = src->s.uvlinesize;
3681     dst->s.first_field          = src->s.first_field;
3682
3683     dst->prev_poc_msb           = src->prev_poc_msb;
3684     dst->prev_poc_lsb           = src->prev_poc_lsb;
3685     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3686     dst->prev_frame_num         = src->prev_frame_num;
3687     dst->short_ref_count        = src->short_ref_count;
3688
3689     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3690     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3691     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3692     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3693
3694     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3695     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3696 }
3697
3698 /**
3699  * decodes a slice header.
3700  * This will also call MPV_common_init() and frame_start() as needed.
3701  *
3702  * @param h h264context
3703  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3704  *
3705  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3706  */
3707 static int decode_slice_header(H264Context *h, H264Context *h0){
3708     MpegEncContext * const s = &h->s;
3709     MpegEncContext * const s0 = &h0->s;
3710     unsigned int first_mb_in_slice;
3711     unsigned int pps_id;
3712     int num_ref_idx_active_override_flag;
3713     unsigned int slice_type, tmp, i, j;
3714     int default_ref_list_done = 0;
3715     int last_pic_structure;
3716
3717     s->dropable= h->nal_ref_idc == 0;
3718
3719     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3720         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3721         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3722     }else{
3723         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3724         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3725     }
3726
3727     first_mb_in_slice= get_ue_golomb(&s->gb);
3728
3729     if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3730         if(h0->current_slice && FIELD_PICTURE){
3731             field_end(h);
3732         }
3733
3734         h0->current_slice = 0;
3735         if (!s0->first_field)
3736             s->current_picture_ptr= NULL;
3737     }
3738
3739     slice_type= get_ue_golomb_31(&s->gb);
3740     if(slice_type > 9){
3741         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3742         return -1;
3743     }
3744     if(slice_type > 4){
3745         slice_type -= 5;
3746         h->slice_type_fixed=1;
3747     }else
3748         h->slice_type_fixed=0;
3749
3750     slice_type= golomb_to_pict_type[ slice_type ];
3751     if (slice_type == FF_I_TYPE
3752         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3753         default_ref_list_done = 1;
3754     }
3755     h->slice_type= slice_type;
3756     h->slice_type_nos= slice_type & 3;
3757
3758     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3759     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3760         av_log(h->s.avctx, AV_LOG_ERROR,
3761                "B picture before any references, skipping\n");
3762         return -1;
3763     }
3764
3765     pps_id= get_ue_golomb(&s->gb);
3766     if(pps_id>=MAX_PPS_COUNT){
3767         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3768         return -1;
3769     }
3770     if(!h0->pps_buffers[pps_id]) {
3771         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3772         return -1;
3773     }
3774     h->pps= *h0->pps_buffers[pps_id];
3775
3776     if(!h0->sps_buffers[h->pps.sps_id]) {
3777         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3778         return -1;
3779     }
3780     h->sps = *h0->sps_buffers[h->pps.sps_id];
3781
3782     if(h == h0 && h->dequant_coeff_pps != pps_id){
3783         h->dequant_coeff_pps = pps_id;
3784         init_dequant_tables(h);
3785     }
3786
3787     s->mb_width= h->sps.mb_width;
3788     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3789
3790     h->b_stride=  s->mb_width*4;
3791     h->b8_stride= s->mb_width*2;
3792
3793     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3794     if(h->sps.frame_mbs_only_flag)
3795         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3796     else
3797         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3798
3799     if (s->context_initialized
3800         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3801         if(h != h0)
3802             return -1;   // width / height changed during parallelized decoding
3803         free_tables(h);
3804         flush_dpb(s->avctx);
3805         MPV_common_end(s);
3806     }
3807     if (!s->context_initialized) {
3808         if(h != h0)
3809             return -1;  // we cant (re-)initialize context during parallel decoding
3810         if (MPV_common_init(s) < 0)
3811             return -1;
3812         s->first_field = 0;
3813         h->prev_interlaced_frame = 1;
3814
3815         init_scan_tables(h);
3816         alloc_tables(h);
3817
3818         for(i = 1; i < s->avctx->thread_count; i++) {
3819             H264Context *c;
3820             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3821             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3822             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3823             c->sps = h->sps;
3824             c->pps = h->pps;
3825             init_scan_tables(c);
3826             clone_tables(c, h);
3827         }
3828
3829         for(i = 0; i < s->avctx->thread_count; i++)
3830             if(context_init(h->thread_context[i]) < 0)
3831                 return -1;
3832
3833         s->avctx->width = s->width;
3834         s->avctx->height = s->height;
3835         s->avctx->sample_aspect_ratio= h->sps.sar;
3836         if(!s->avctx->sample_aspect_ratio.den)
3837             s->avctx->sample_aspect_ratio.den = 1;
3838
3839         if(h->sps.timing_info_present_flag){
3840             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3841             if(h->x264_build > 0 && h->x264_build < 44)
3842                 s->avctx->time_base.den *= 2;
3843             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3844                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3845         }
3846     }
3847
3848     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3849
3850     h->mb_mbaff = 0;
3851     h->mb_aff_frame = 0;
3852     last_pic_structure = s0->picture_structure;
3853     if(h->sps.frame_mbs_only_flag){
3854         s->picture_structure= PICT_FRAME;
3855     }else{
3856         if(get_bits1(&s->gb)) { //field_pic_flag
3857             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3858         } else {
3859             s->picture_structure= PICT_FRAME;
3860             h->mb_aff_frame = h->sps.mb_aff;
3861         }
3862     }
3863     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3864
3865     if(h0->current_slice == 0){
3866         while(h->frame_num !=  h->prev_frame_num &&
3867               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3868             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3869             if (frame_start(h) < 0)
3870                 return -1;
3871             h->prev_frame_num++;
3872             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3873             s->current_picture_ptr->frame_num= h->prev_frame_num;
3874             execute_ref_pic_marking(h, NULL, 0);
3875         }
3876
3877         /* See if we have a decoded first field looking for a pair... */
3878         if (s0->first_field) {
3879             assert(s0->current_picture_ptr);
3880             assert(s0->current_picture_ptr->data[0]);
3881             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3882
3883             /* figure out if we have a complementary field pair */
3884             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3885                 /*
3886                  * Previous field is unmatched. Don't display it, but let it
3887                  * remain for reference if marked as such.
3888                  */
3889                 s0->current_picture_ptr = NULL;
3890                 s0->first_field = FIELD_PICTURE;
3891
3892             } else {
3893                 if (h->nal_ref_idc &&
3894                         s0->current_picture_ptr->reference &&
3895                         s0->current_picture_ptr->frame_num != h->frame_num) {
3896                     /*
3897                      * This and previous field were reference, but had
3898                      * different frame_nums. Consider this field first in
3899                      * pair. Throw away previous field except for reference
3900                      * purposes.
3901                      */
3902                     s0->first_field = 1;
3903                     s0->current_picture_ptr = NULL;
3904
3905                 } else {
3906                     /* Second field in complementary pair */
3907                     s0->first_field = 0;
3908                 }
3909             }
3910
3911         } else {
3912             /* Frame or first field in a potentially complementary pair */
3913             assert(!s0->current_picture_ptr);
3914             s0->first_field = FIELD_PICTURE;
3915         }
3916
3917         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3918             s0->first_field = 0;
3919             return -1;
3920         }
3921     }
3922     if(h != h0)
3923         clone_slice(h, h0);
3924
3925     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3926
3927     assert(s->mb_num == s->mb_width * s->mb_height);
3928     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3929        first_mb_in_slice                    >= s->mb_num){
3930         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3931         return -1;
3932     }
3933     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3934     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3935     if (s->picture_structure == PICT_BOTTOM_FIELD)
3936         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3937     assert(s->mb_y < s->mb_height);
3938
3939     if(s->picture_structure==PICT_FRAME){
3940         h->curr_pic_num=   h->frame_num;
3941         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3942     }else{
3943         h->curr_pic_num= 2*h->frame_num + 1;
3944         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3945     }
3946
3947     if(h->nal_unit_type == NAL_IDR_SLICE){
3948         get_ue_golomb(&s->gb); /* idr_pic_id */
3949     }
3950
3951     if(h->sps.poc_type==0){
3952         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3953
3954         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3955             h->delta_poc_bottom= get_se_golomb(&s->gb);
3956         }
3957     }
3958
3959     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3960         h->delta_poc[0]= get_se_golomb(&s->gb);
3961
3962         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3963             h->delta_poc[1]= get_se_golomb(&s->gb);
3964     }
3965
3966     init_poc(h);
3967
3968     if(h->pps.redundant_pic_cnt_present){
3969         h->redundant_pic_count= get_ue_golomb(&s->gb);
3970     }
3971
3972     //set defaults, might be overridden a few lines later
3973     h->ref_count[0]= h->pps.ref_count[0];
3974     h->ref_count[1]= h->pps.ref_count[1];
3975
3976     if(h->slice_type_nos != FF_I_TYPE){
3977         if(h->slice_type_nos == FF_B_TYPE){
3978             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3979         }
3980         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3981
3982         if(num_ref_idx_active_override_flag){
3983             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3984             if(h->slice_type_nos==FF_B_TYPE)
3985                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3986
3987             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3988                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3989                 h->ref_count[0]= h->ref_count[1]= 1;
3990                 return -1;
3991             }
3992         }
3993         if(h->slice_type_nos == FF_B_TYPE)
3994             h->list_count= 2;
3995         else
3996             h->list_count= 1;
3997     }else
3998         h->list_count= 0;
3999
4000     if(!default_ref_list_done){
4001         fill_default_ref_list(h);
4002     }
4003
4004     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4005         return -1;
4006
4007     if(h->slice_type_nos!=FF_I_TYPE){
4008         s->last_picture_ptr= &h->ref_list[0][0];
4009         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4010     }
4011     if(h->slice_type_nos==FF_B_TYPE){
4012         s->next_picture_ptr= &h->ref_list[1][0];
4013         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4014     }
4015
4016     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4017        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4018         pred_weight_table(h);
4019     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4020         implicit_weight_table(h);
4021     else {
4022         h->use_weight = 0;
4023         for (i = 0; i < 2; i++) {
4024             h->luma_weight_flag[i]   = 0;
4025             h->chroma_weight_flag[i] = 0;
4026         }
4027     }
4028
4029     if(h->nal_ref_idc)
4030         decode_ref_pic_marking(h0, &s->gb);
4031
4032     if(FRAME_MBAFF)
4033         fill_mbaff_ref_list(h);
4034
4035     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4036         direct_dist_scale_factor(h);
4037     direct_ref_list_init(h);
4038
4039     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4040         tmp = get_ue_golomb_31(&s->gb);
4041         if(tmp > 2){
4042             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4043             return -1;
4044         }
4045         h->cabac_init_idc= tmp;
4046     }
4047
4048     h->last_qscale_diff = 0;
4049     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4050     if(tmp>51){
4051         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4052         return -1;
4053     }
4054     s->qscale= tmp;
4055     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4056     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4057     //FIXME qscale / qp ... stuff
4058     if(h->slice_type == FF_SP_TYPE){
4059         get_bits1(&s->gb); /* sp_for_switch_flag */
4060     }
4061     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4062         get_se_golomb(&s->gb); /* slice_qs_delta */
4063     }
4064
4065     h->deblocking_filter = 1;
4066     h->slice_alpha_c0_offset = 0;
4067     h->slice_beta_offset = 0;
4068     if( h->pps.deblocking_filter_parameters_present ) {
4069         tmp= get_ue_golomb_31(&s->gb);
4070         if(tmp > 2){
4071             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4072             return -1;
4073         }
4074         h->deblocking_filter= tmp;
4075         if(h->deblocking_filter < 2)
4076             h->deblocking_filter^= 1; // 1<->0
4077
4078         if( h->deblocking_filter ) {
4079             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4080             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4081         }
4082     }
4083
4084     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4085        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4086        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4087        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4088         h->deblocking_filter= 0;
4089
4090     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4091         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4092             /* Cheat slightly for speed:
4093                Do not bother to deblock across slices. */
4094             h->deblocking_filter = 2;
4095         } else {
4096             h0->max_contexts = 1;
4097             if(!h0->single_decode_warning) {
4098                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4099                 h0->single_decode_warning = 1;
4100             }
4101             if(h != h0)
4102                 return 1; // deblocking switched inside frame
4103         }
4104     }
4105
4106 #if 0 //FMO
4107     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4108         slice_group_change_cycle= get_bits(&s->gb, ?);
4109 #endif
4110
4111     h0->last_slice_type = slice_type;
4112     h->slice_num = ++h0->current_slice;
4113     if(h->slice_num >= MAX_SLICES){
4114         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4115     }
4116
4117     for(j=0; j<2; j++){
4118         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4119         ref2frm[0]=
4120         ref2frm[1]= -1;
4121         for(i=0; i<16; i++)
4122             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4123                           +(h->ref_list[j][i].reference&3);
4124         ref2frm[18+0]=
4125         ref2frm[18+1]= -1;
4126         for(i=16; i<48; i++)
4127             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4128                           +(h->ref_list[j][i].reference&3);
4129     }
4130
4131     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4132     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4133
4134     s->avctx->refs= h->sps.ref_frame_count;
4135
4136     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4137         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4138                h->slice_num,
4139                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4140                first_mb_in_slice,
4141                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4142                pps_id, h->frame_num,
4143                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4144                h->ref_count[0], h->ref_count[1],
4145                s->qscale,
4146                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4147                h->use_weight,
4148                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4149                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4150                );
4151     }
4152
4153     return 0;
4154 }
4155
4156 /**
4157  *
4158  */
4159 static inline int get_level_prefix(GetBitContext *gb){
4160     unsigned int buf;
4161     int log;
4162
4163     OPEN_READER(re, gb);
4164     UPDATE_CACHE(re, gb);
4165     buf=GET_CACHE(re, gb);
4166
4167     log= 32 - av_log2(buf);
4168 #ifdef TRACE
4169     print_bin(buf>>(32-log), log);
4170     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4171 #endif
4172
4173     LAST_SKIP_BITS(re, gb, log);
4174     CLOSE_READER(re, gb);
4175
4176     return log-1;
4177 }
4178
4179 static inline int get_dct8x8_allowed(H264Context *h){
4180     if(h->sps.direct_8x8_inference_flag)
4181         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4182     else
4183         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4184 }
4185
4186 /**
4187  * decodes a residual block.
4188  * @param n block index
4189  * @param scantable scantable
4190  * @param max_coeff number of coefficients in the block
4191  * @return <0 if an error occurred
4192  */
4193 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4194     MpegEncContext * const s = &h->s;
4195     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4196     int level[16];
4197     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4198
4199     //FIXME put trailing_onex into the context
4200
4201     if(n == CHROMA_DC_BLOCK_INDEX){
4202         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4203         total_coeff= coeff_token>>2;
4204     }else{
4205         if(n == LUMA_DC_BLOCK_INDEX){
4206             total_coeff= pred_non_zero_count(h, 0);
4207             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4208             total_coeff= coeff_token>>2;
4209         }else{
4210             total_coeff= pred_non_zero_count(h, n);
4211             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4212             total_coeff= coeff_token>>2;
4213             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4214         }
4215     }
4216
4217     //FIXME set last_non_zero?
4218
4219     if(total_coeff==0)
4220         return 0;
4221     if(total_coeff > (unsigned)max_coeff) {
4222         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4223         return -1;
4224     }
4225
4226     trailing_ones= coeff_token&3;
4227     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4228     assert(total_coeff<=16);
4229
4230     i = show_bits(gb, 3);
4231     skip_bits(gb, trailing_ones);
4232     level[0] = 1-((i&4)>>1);
4233     level[1] = 1-((i&2)   );
4234     level[2] = 1-((i&1)<<1);
4235
4236     if(trailing_ones<total_coeff) {
4237         int mask, prefix;
4238         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4239         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4240         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4241
4242         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4243         if(level_code >= 100){
4244             prefix= level_code - 100;
4245             if(prefix == LEVEL_TAB_BITS)
4246                 prefix += get_level_prefix(gb);
4247
4248             //first coefficient has suffix_length equal to 0 or 1
4249             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4250                 if(suffix_length)
4251                     level_code= (prefix<<1) + get_bits1(gb); //part
4252                 else
4253                     level_code= prefix; //part
4254             }else if(prefix==14){
4255                 if(suffix_length)
4256                     level_code= (prefix<<1) + get_bits1(gb); //part
4257                 else
4258                     level_code= prefix + get_bits(gb, 4); //part
4259             }else{
4260                 level_code= 30 + get_bits(gb, prefix-3); //part
4261                 if(prefix>=16)
4262                     level_code += (1<<(prefix-3))-4096;
4263             }
4264
4265             if(trailing_ones < 3) level_code += 2;
4266
4267             suffix_length = 2;
4268             mask= -(level_code&1);
4269             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4270         }else{
4271             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4272
4273             suffix_length = 1;
4274             if(level_code + 3U > 6U)
4275                 suffix_length++;
4276             level[trailing_ones]= level_code;
4277         }
4278
4279         //remaining coefficients have suffix_length > 0
4280         for(i=trailing_ones+1;i<total_coeff;i++) {
4281             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4282             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4283             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4284
4285             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4286             if(level_code >= 100){
4287                 prefix= level_code - 100;
4288                 if(prefix == LEVEL_TAB_BITS){
4289                     prefix += get_level_prefix(gb);
4290                 }
4291                 if(prefix<15){
4292                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4293                 }else{
4294                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4295                     if(prefix>=16)
4296                         level_code += (1<<(prefix-3))-4096;
4297                 }
4298                 mask= -(level_code&1);
4299                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4300             }
4301             level[i]= level_code;
4302
4303             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4304                 suffix_length++;
4305         }
4306     }
4307
4308     if(total_coeff == max_coeff)
4309         zeros_left=0;
4310     else{
4311         if(n == CHROMA_DC_BLOCK_INDEX)
4312             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4313         else
4314             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4315     }
4316
4317     coeff_num = zeros_left + total_coeff - 1;
4318     j = scantable[coeff_num];
4319     if(n > 24){
4320         block[j] = level[0];
4321         for(i=1;i<total_coeff;i++) {
4322             if(zeros_left <= 0)
4323                 run_before = 0;
4324             else if(zeros_left < 7){
4325                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4326             }else{
4327                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4328             }
4329             zeros_left -= run_before;
4330             coeff_num -= 1 + run_before;
4331             j= scantable[ coeff_num ];
4332
4333             block[j]= level[i];
4334         }
4335     }else{
4336         block[j] = (level[0] * qmul[j] + 32)>>6;
4337         for(i=1;i<total_coeff;i++) {
4338             if(zeros_left <= 0)
4339                 run_before = 0;
4340             else if(zeros_left < 7){
4341                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4342             }else{
4343                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4344             }
4345             zeros_left -= run_before;
4346             coeff_num -= 1 + run_before;
4347             j= scantable[ coeff_num ];
4348
4349             block[j]= (level[i] * qmul[j] + 32)>>6;
4350         }
4351     }
4352
4353     if(zeros_left<0){
4354         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4355         return -1;
4356     }
4357
4358     return 0;
4359 }
4360
4361 static void predict_field_decoding_flag(H264Context *h){
4362     MpegEncContext * const s = &h->s;
4363     const int mb_xy= h->mb_xy;
4364     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4365                 ? s->current_picture.mb_type[mb_xy-1]
4366                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4367                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4368                 : 0;
4369     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4370 }
4371
4372 /**
4373  * decodes a P_SKIP or B_SKIP macroblock
4374  */
4375 static void decode_mb_skip(H264Context *h){
4376     MpegEncContext * const s = &h->s;
4377     const int mb_xy= h->mb_xy;
4378     int mb_type=0;
4379
4380     memset(h->non_zero_count[mb_xy], 0, 16);
4381     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4382
4383     if(MB_FIELD)
4384         mb_type|= MB_TYPE_INTERLACED;
4385
4386     if( h->slice_type_nos == FF_B_TYPE )
4387     {
4388         // just for fill_caches. pred_direct_motion will set the real mb_type
4389         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4390
4391         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4392         pred_direct_motion(h, &mb_type);
4393         mb_type|= MB_TYPE_SKIP;
4394     }
4395     else
4396     {
4397         int mx, my;
4398         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4399
4400         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4401         pred_pskip_motion(h, &mx, &my);
4402         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4403         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4404     }
4405
4406     write_back_motion(h, mb_type);
4407     s->current_picture.mb_type[mb_xy]= mb_type;
4408     s->current_picture.qscale_table[mb_xy]= s->qscale;
4409     h->slice_table[ mb_xy ]= h->slice_num;
4410     h->prev_mb_skipped= 1;
4411 }
4412
4413 /**
4414  * decodes a macroblock
4415  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4416  */
4417 static int decode_mb_cavlc(H264Context *h){
4418     MpegEncContext * const s = &h->s;
4419     int mb_xy;
4420     int partition_count;
4421     unsigned int mb_type, cbp;
4422     int dct8x8_allowed= h->pps.transform_8x8_mode;
4423
4424     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4425
4426     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4427     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4428                 down the code */
4429     if(h->slice_type_nos != FF_I_TYPE){
4430         if(s->mb_skip_run==-1)
4431             s->mb_skip_run= get_ue_golomb(&s->gb);
4432
4433         if (s->mb_skip_run--) {
4434             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4435                 if(s->mb_skip_run==0)
4436                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4437                 else
4438                     predict_field_decoding_flag(h);
4439             }
4440             decode_mb_skip(h);
4441             return 0;
4442         }
4443     }
4444     if(FRAME_MBAFF){
4445         if( (s->mb_y&1) == 0 )
4446             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4447     }
4448
4449     h->prev_mb_skipped= 0;
4450
4451     mb_type= get_ue_golomb(&s->gb);
4452     if(h->slice_type_nos == FF_B_TYPE){
4453         if(mb_type < 23){
4454             partition_count= b_mb_type_info[mb_type].partition_count;
4455             mb_type=         b_mb_type_info[mb_type].type;
4456         }else{
4457             mb_type -= 23;
4458             goto decode_intra_mb;
4459         }
4460     }else if(h->slice_type_nos == FF_P_TYPE){
4461         if(mb_type < 5){
4462             partition_count= p_mb_type_info[mb_type].partition_count;
4463             mb_type=         p_mb_type_info[mb_type].type;
4464         }else{
4465             mb_type -= 5;
4466             goto decode_intra_mb;
4467         }
4468     }else{
4469        assert(h->slice_type_nos == FF_I_TYPE);
4470         if(h->slice_type == FF_SI_TYPE && mb_type)
4471             mb_type--;
4472 decode_intra_mb:
4473         if(mb_type > 25){
4474             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4475             return -1;
4476         }
4477         partition_count=0;
4478         cbp= i_mb_type_info[mb_type].cbp;
4479         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4480         mb_type= i_mb_type_info[mb_type].type;
4481     }
4482
4483     if(MB_FIELD)
4484         mb_type |= MB_TYPE_INTERLACED;
4485
4486     h->slice_table[ mb_xy ]= h->slice_num;
4487
4488     if(IS_INTRA_PCM(mb_type)){
4489         unsigned int x;
4490
4491         // We assume these blocks are very rare so we do not optimize it.
4492         align_get_bits(&s->gb);
4493
4494         // The pixels are stored in the same order as levels in h->mb array.
4495         for(x=0; x < (CHROMA ? 384 : 256); x++){
4496             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4497         }
4498
4499         // In deblocking, the quantizer is 0
4500         s->current_picture.qscale_table[mb_xy]= 0;
4501         // All coeffs are present
4502         memset(h->non_zero_count[mb_xy], 16, 16);
4503
4504         s->current_picture.mb_type[mb_xy]= mb_type;
4505         return 0;
4506     }
4507
4508     if(MB_MBAFF){
4509         h->ref_count[0] <<= 1;
4510         h->ref_count[1] <<= 1;
4511     }
4512
4513     fill_caches(h, mb_type, 0);
4514
4515     //mb_pred
4516     if(IS_INTRA(mb_type)){
4517         int pred_mode;
4518 //            init_top_left_availability(h);
4519         if(IS_INTRA4x4(mb_type)){
4520             int i;
4521             int di = 1;
4522             if(dct8x8_allowed && get_bits1(&s->gb)){
4523                 mb_type |= MB_TYPE_8x8DCT;
4524                 di = 4;
4525             }
4526
4527 //                fill_intra4x4_pred_table(h);
4528             for(i=0; i<16; i+=di){
4529                 int mode= pred_intra_mode(h, i);
4530
4531                 if(!get_bits1(&s->gb)){
4532                     const int rem_mode= get_bits(&s->gb, 3);
4533                     mode = rem_mode + (rem_mode >= mode);
4534                 }
4535
4536                 if(di==4)
4537                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4538                 else
4539                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4540             }
4541             write_back_intra_pred_mode(h);
4542             if( check_intra4x4_pred_mode(h) < 0)
4543                 return -1;
4544         }else{
4545             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4546             if(h->intra16x16_pred_mode < 0)
4547                 return -1;
4548         }
4549         if(CHROMA){
4550             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4551             if(pred_mode < 0)
4552                 return -1;
4553             h->chroma_pred_mode= pred_mode;
4554         }
4555     }else if(partition_count==4){
4556         int i, j, sub_partition_count[4], list, ref[2][4];
4557
4558         if(h->slice_type_nos == FF_B_TYPE){
4559             for(i=0; i<4; i++){
4560                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4561                 if(h->sub_mb_type[i] >=13){
4562                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4563                     return -1;
4564                 }
4565                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4566                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4567             }
4568             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4569                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4570                 pred_direct_motion(h, &mb_type);
4571                 h->ref_cache[0][scan8[4]] =
4572                 h->ref_cache[1][scan8[4]] =
4573                 h->ref_cache[0][scan8[12]] =
4574                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4575             }
4576         }else{
4577             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4578             for(i=0; i<4; i++){
4579                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4580                 if(h->sub_mb_type[i] >=4){
4581                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4582                     return -1;
4583                 }
4584                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4585                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4586             }
4587         }
4588
4589         for(list=0; list<h->list_count; list++){
4590             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4591             for(i=0; i<4; i++){
4592                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4593                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4594                     unsigned int tmp;
4595                     if(ref_count == 1){
4596                         tmp= 0;
4597                     }else if(ref_count == 2){
4598                         tmp= get_bits1(&s->gb)^1;
4599                     }else{
4600                         tmp= get_ue_golomb_31(&s->gb);
4601                         if(tmp>=ref_count){
4602                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4603                             return -1;
4604                         }
4605                     }
4606                     ref[list][i]= tmp;
4607                 }else{
4608                  //FIXME
4609                     ref[list][i] = -1;
4610                 }
4611             }
4612         }
4613
4614         if(dct8x8_allowed)
4615             dct8x8_allowed = get_dct8x8_allowed(h);
4616
4617         for(list=0; list<h->list_count; list++){
4618             for(i=0; i<4; i++){
4619                 if(IS_DIRECT(h->sub_mb_type[i])) {
4620                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4621                     continue;
4622                 }
4623                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4624                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4625
4626                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4627                     const int sub_mb_type= h->sub_mb_type[i];
4628                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4629                     for(j=0; j<sub_partition_count[i]; j++){
4630                         int mx, my;
4631                         const int index= 4*i + block_width*j;
4632                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4633                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4634                         mx += get_se_golomb(&s->gb);
4635                         my += get_se_golomb(&s->gb);
4636                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4637
4638                         if(IS_SUB_8X8(sub_mb_type)){
4639                             mv_cache[ 1 ][0]=
4640                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4641                             mv_cache[ 1 ][1]=
4642                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4643                         }else if(IS_SUB_8X4(sub_mb_type)){
4644                             mv_cache[ 1 ][0]= mx;
4645                             mv_cache[ 1 ][1]= my;
4646                         }else if(IS_SUB_4X8(sub_mb_type)){
4647                             mv_cache[ 8 ][0]= mx;
4648                             mv_cache[ 8 ][1]= my;
4649                         }
4650                         mv_cache[ 0 ][0]= mx;
4651                         mv_cache[ 0 ][1]= my;
4652                     }
4653                 }else{
4654                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4655                     p[0] = p[1]=
4656                     p[8] = p[9]= 0;
4657                 }
4658             }
4659         }
4660     }else if(IS_DIRECT(mb_type)){
4661         pred_direct_motion(h, &mb_type);
4662         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4663     }else{
4664         int list, mx, my, i;
4665          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4666         if(IS_16X16(mb_type)){
4667             for(list=0; list<h->list_count; list++){
4668                     unsigned int val;
4669                     if(IS_DIR(mb_type, 0, list)){
4670                         if(h->ref_count[list]==1){
4671                             val= 0;
4672                         }else if(h->ref_count[list]==2){
4673                             val= get_bits1(&s->gb)^1;
4674                         }else{
4675                             val= get_ue_golomb_31(&s->gb);
4676                             if(val >= h->ref_count[list]){
4677                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4678                                 return -1;
4679                             }
4680                         }
4681                     }else
4682                         val= LIST_NOT_USED&0xFF;
4683                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4684             }
4685             for(list=0; list<h->list_count; list++){
4686                 unsigned int val;
4687                 if(IS_DIR(mb_type, 0, list)){
4688                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4689                     mx += get_se_golomb(&s->gb);
4690                     my += get_se_golomb(&s->gb);
4691                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4692
4693                     val= pack16to32(mx,my);
4694                 }else
4695                     val=0;
4696                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4697             }
4698         }
4699         else if(IS_16X8(mb_type)){
4700             for(list=0; list<h->list_count; list++){
4701                     for(i=0; i<2; i++){
4702                         unsigned int val;
4703                         if(IS_DIR(mb_type, i, list)){
4704                             if(h->ref_count[list] == 1){
4705                                 val= 0;
4706                             }else if(h->ref_count[list] == 2){
4707                                 val= get_bits1(&s->gb)^1;
4708                             }else{
4709                                 val= get_ue_golomb_31(&s->gb);
4710                                 if(val >= h->ref_count[list]){
4711                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4712                                     return -1;
4713                                 }
4714                             }
4715                         }else
4716                             val= LIST_NOT_USED&0xFF;
4717                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4718                     }
4719             }
4720             for(list=0; list<h->list_count; list++){
4721                 for(i=0; i<2; i++){
4722                     unsigned int val;
4723                     if(IS_DIR(mb_type, i, list)){
4724                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4725                         mx += get_se_golomb(&s->gb);
4726                         my += get_se_golomb(&s->gb);
4727                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4728
4729                         val= pack16to32(mx,my);
4730                     }else
4731                         val=0;
4732                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4733                 }
4734             }
4735         }else{
4736             assert(IS_8X16(mb_type));
4737             for(list=0; list<h->list_count; list++){
4738                     for(i=0; i<2; i++){
4739                         unsigned int val;
4740                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4741                             if(h->ref_count[list]==1){
4742                                 val= 0;
4743                             }else if(h->ref_count[list]==2){
4744                                 val= get_bits1(&s->gb)^1;
4745                             }else{
4746                                 val= get_ue_golomb_31(&s->gb);
4747                                 if(val >= h->ref_count[list]){
4748                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4749                                     return -1;
4750                                 }
4751                             }
4752                         }else
4753                             val= LIST_NOT_USED&0xFF;
4754                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4755                     }
4756             }
4757             for(list=0; list<h->list_count; list++){
4758                 for(i=0; i<2; i++){
4759                     unsigned int val;
4760                     if(IS_DIR(mb_type, i, list)){
4761                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4762                         mx += get_se_golomb(&s->gb);
4763                         my += get_se_golomb(&s->gb);
4764                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4765
4766                         val= pack16to32(mx,my);
4767                     }else
4768                         val=0;
4769                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4770                 }
4771             }
4772         }
4773     }
4774
4775     if(IS_INTER(mb_type))
4776         write_back_motion(h, mb_type);
4777
4778     if(!IS_INTRA16x16(mb_type)){
4779         cbp= get_ue_golomb(&s->gb);
4780         if(cbp > 47){
4781             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4782             return -1;
4783         }
4784
4785         if(CHROMA){
4786             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4787             else                     cbp= golomb_to_inter_cbp   [cbp];
4788         }else{
4789             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4790             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4791         }
4792     }
4793     h->cbp = cbp;
4794
4795     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4796         if(get_bits1(&s->gb)){
4797             mb_type |= MB_TYPE_8x8DCT;
4798             h->cbp_table[mb_xy]= cbp;
4799         }
4800     }
4801     s->current_picture.mb_type[mb_xy]= mb_type;
4802
4803     if(cbp || IS_INTRA16x16(mb_type)){
4804         int i8x8, i4x4, chroma_idx;
4805         int dquant;
4806         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4807         const uint8_t *scan, *scan8x8, *dc_scan;
4808
4809 //        fill_non_zero_count_cache(h);
4810
4811         if(IS_INTERLACED(mb_type)){
4812             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4813             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4814             dc_scan= luma_dc_field_scan;
4815         }else{
4816             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4817             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4818             dc_scan= luma_dc_zigzag_scan;
4819         }
4820
4821         dquant= get_se_golomb(&s->gb);
4822
4823         if( dquant > 25 || dquant < -26 ){
4824             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4825             return -1;
4826         }
4827
4828         s->qscale += dquant;
4829         if(((unsigned)s->qscale) > 51){
4830             if(s->qscale<0) s->qscale+= 52;
4831             else            s->qscale-= 52;
4832         }
4833
4834         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4835         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4836         if(IS_INTRA16x16(mb_type)){
4837             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4838                 return -1; //FIXME continue if partitioned and other return -1 too
4839             }
4840
4841             assert((cbp&15) == 0 || (cbp&15) == 15);
4842
4843             if(cbp&15){
4844                 for(i8x8=0; i8x8<4; i8x8++){
4845                     for(i4x4=0; i4x4<4; i4x4++){
4846                         const int index= i4x4 + 4*i8x8;
4847                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4848                             return -1;
4849                         }
4850                     }
4851                 }
4852             }else{
4853                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4854             }
4855         }else{
4856             for(i8x8=0; i8x8<4; i8x8++){
4857                 if(cbp & (1<<i8x8)){
4858                     if(IS_8x8DCT(mb_type)){
4859                         DCTELEM *buf = &h->mb[64*i8x8];
4860                         uint8_t *nnz;
4861                         for(i4x4=0; i4x4<4; i4x4++){
4862                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4863                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4864                                 return -1;
4865                         }
4866                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4867                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4868                     }else{
4869                         for(i4x4=0; i4x4<4; i4x4++){
4870                             const int index= i4x4 + 4*i8x8;
4871
4872                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4873                                 return -1;
4874                             }
4875                         }
4876                     }
4877                 }else{
4878                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4879                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4880                 }
4881             }
4882         }
4883
4884         if(cbp&0x30){
4885             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4886                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4887                     return -1;
4888                 }
4889         }
4890
4891         if(cbp&0x20){
4892             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4893                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4894                 for(i4x4=0; i4x4<4; i4x4++){
4895                     const int index= 16 + 4*chroma_idx + i4x4;
4896                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4897                         return -1;
4898                     }
4899                 }
4900             }
4901         }else{
4902             uint8_t * const nnz= &h->non_zero_count_cache[0];
4903             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4904             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4905         }
4906     }else{
4907         uint8_t * const nnz= &h->non_zero_count_cache[0];
4908         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4909         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4910         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4911     }
4912     s->current_picture.qscale_table[mb_xy]= s->qscale;
4913     write_back_non_zero_count(h);
4914
4915     if(MB_MBAFF){
4916         h->ref_count[0] >>= 1;
4917         h->ref_count[1] >>= 1;
4918     }
4919
4920     return 0;
4921 }
4922
4923 static int decode_cabac_field_decoding_flag(H264Context *h) {
4924     MpegEncContext * const s = &h->s;
4925     const int mb_x = s->mb_x;
4926     const int mb_y = s->mb_y & ~1;
4927     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4928     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4929
4930     unsigned int ctx = 0;
4931
4932     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4933         ctx += 1;
4934     }
4935     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4936         ctx += 1;
4937     }
4938
4939     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4940 }
4941
4942 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4943     uint8_t *state= &h->cabac_state[ctx_base];
4944     int mb_type;
4945
4946     if(intra_slice){
4947         MpegEncContext * const s = &h->s;
4948         const int mba_xy = h->left_mb_xy[0];
4949         const int mbb_xy = h->top_mb_xy;
4950         int ctx=0;
4951         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4952             ctx++;
4953         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4954             ctx++;
4955         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4956             return 0;   /* I4x4 */
4957         state += 2;
4958     }else{
4959         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4960             return 0;   /* I4x4 */
4961     }
4962
4963     if( get_cabac_terminate( &h->cabac ) )
4964         return 25;  /* PCM */
4965
4966     mb_type = 1; /* I16x16 */
4967     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4968     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4969         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4970     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4971     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4972     return mb_type;
4973 }
4974
4975 static int decode_cabac_mb_type_b( H264Context *h ) {
4976     MpegEncContext * const s = &h->s;
4977
4978         const int mba_xy = h->left_mb_xy[0];
4979         const int mbb_xy = h->top_mb_xy;
4980         int ctx = 0;
4981         int bits;
4982         assert(h->slice_type_nos == FF_B_TYPE);
4983
4984         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4985             ctx++;
4986         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4987             ctx++;
4988
4989         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4990             return 0; /* B_Direct_16x16 */
4991
4992         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4993             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4994         }
4995
4996         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4997         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4998         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4999         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5000         if( bits < 8 )
5001             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5002         else if( bits == 13 ) {
5003             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5004         } else if( bits == 14 )
5005             return 11; /* B_L1_L0_8x16 */
5006         else if( bits == 15 )
5007             return 22; /* B_8x8 */
5008
5009         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5010         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5011 }
5012
5013 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5014     MpegEncContext * const s = &h->s;
5015     int mba_xy, mbb_xy;
5016     int ctx = 0;
5017
5018     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5019         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5020         mba_xy = mb_xy - 1;
5021         if( (mb_y&1)
5022             && h->slice_table[mba_xy] == h->slice_num
5023             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5024             mba_xy += s->mb_stride;
5025         if( MB_FIELD ){
5026             mbb_xy = mb_xy - s->mb_stride;
5027             if( !(mb_y&1)
5028                 && h->slice_table[mbb_xy] == h->slice_num
5029                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5030                 mbb_xy -= s->mb_stride;
5031         }else
5032             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5033     }else{
5034         int mb_xy = h->mb_xy;
5035         mba_xy = mb_xy - 1;
5036         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5037     }
5038
5039     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5040         ctx++;
5041     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5042         ctx++;
5043
5044     if( h->slice_type_nos == FF_B_TYPE )
5045         ctx += 13;
5046     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5047 }
5048
5049 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5050     int mode = 0;
5051
5052     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5053         return pred_mode;
5054
5055     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5056     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5057     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5058
5059     if( mode >= pred_mode )
5060         return mode + 1;
5061     else
5062         return mode;
5063 }
5064
5065 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5066     const int mba_xy = h->left_mb_xy[0];
5067     const int mbb_xy = h->top_mb_xy;
5068
5069     int ctx = 0;
5070
5071     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5072     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5073         ctx++;
5074
5075     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5076         ctx++;
5077
5078     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5079         return 0;
5080
5081     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5082         return 1;
5083     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5084         return 2;
5085     else
5086         return 3;
5087 }
5088
5089 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5090     int cbp_b, cbp_a, ctx, cbp = 0;
5091
5092     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5093     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5094
5095     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5096     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5097     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5098     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5099     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5100     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5101     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5102     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5103     return cbp;
5104 }
5105 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5106     int ctx;
5107     int cbp_a, cbp_b;
5108
5109     cbp_a = (h->left_cbp>>4)&0x03;
5110     cbp_b = (h-> top_cbp>>4)&0x03;
5111
5112     ctx = 0;
5113     if( cbp_a > 0 ) ctx++;
5114     if( cbp_b > 0 ) ctx += 2;
5115     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5116         return 0;
5117
5118     ctx = 4;
5119     if( cbp_a == 2 ) ctx++;
5120     if( cbp_b == 2 ) ctx += 2;
5121     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5122 }
5123 static int decode_cabac_mb_dqp( H264Context *h) {
5124     int   ctx= h->last_qscale_diff != 0;
5125     int   val = 0;
5126
5127     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5128         ctx= 2+(ctx>>1);
5129         val++;
5130         if(val > 102) //prevent infinite loop
5131             return INT_MIN;
5132     }
5133
5134     if( val&0x01 )
5135         return   (val + 1)>>1 ;
5136     else
5137         return -((val + 1)>>1);
5138 }
5139 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5140     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5141         return 0;   /* 8x8 */
5142     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5143         return 1;   /* 8x4 */
5144     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5145         return 2;   /* 4x8 */
5146     return 3;       /* 4x4 */
5147 }
5148 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5149     int type;
5150     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5151         return 0;   /* B_Direct_8x8 */
5152     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5153         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5154     type = 3;
5155     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5156         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5157             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5158         type += 4;
5159     }
5160     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5161     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5162     return type;
5163 }
5164
5165 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5166     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5167 }
5168
5169 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5170     int refa = h->ref_cache[list][scan8[n] - 1];
5171     int refb = h->ref_cache[list][scan8[n] - 8];
5172     int ref  = 0;
5173     int ctx  = 0;
5174
5175     if( h->slice_type_nos == FF_B_TYPE) {
5176         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5177             ctx++;
5178         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5179             ctx += 2;
5180     } else {
5181         if( refa > 0 )
5182             ctx++;
5183         if( refb > 0 )
5184             ctx += 2;
5185     }
5186
5187     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5188         ref++;
5189         ctx = (ctx>>2)+4;
5190         if(ref >= 32 /*h->ref_list[list]*/){
5191             return -1;
5192         }
5193     }
5194     return ref;
5195 }
5196
5197 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5198     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5199                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5200     int ctxbase = (l == 0) ? 40 : 47;
5201     int mvd;
5202     int ctx = (amvd>2) + (amvd>32);
5203
5204     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5205         return 0;
5206
5207     mvd= 1;
5208     ctx= 3;
5209     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5210         mvd++;
5211         if( ctx < 6 )
5212             ctx++;
5213     }
5214
5215     if( mvd >= 9 ) {
5216         int k = 3;
5217         while( get_cabac_bypass( &h->cabac ) ) {
5218             mvd += 1 << k;
5219             k++;
5220             if(k>24){
5221                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5222                 return INT_MIN;
5223             }
5224         }
5225         while( k-- ) {
5226             if( get_cabac_bypass( &h->cabac ) )
5227                 mvd += 1 << k;
5228         }
5229     }
5230     return get_cabac_bypass_sign( &h->cabac, -mvd );
5231 }
5232
5233 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5234     int nza, nzb;
5235     int ctx = 0;
5236
5237     if( is_dc ) {
5238         if( cat == 0 ) {
5239             nza = h->left_cbp&0x100;
5240             nzb = h-> top_cbp&0x100;
5241         } else {
5242             nza = (h->left_cbp>>(6+idx))&0x01;
5243             nzb = (h-> top_cbp>>(6+idx))&0x01;
5244         }
5245     } else {
5246         assert(cat == 1 || cat == 2 || cat == 4);
5247         nza = h->non_zero_count_cache[scan8[idx] - 1];
5248         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5249     }
5250
5251     if( nza > 0 )
5252         ctx++;
5253
5254     if( nzb > 0 )
5255         ctx += 2;
5256
5257     return ctx + 4 * cat;
5258 }
5259
5260 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5261     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5262     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5263     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5264     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5265 };
5266
5267 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5268     static const int significant_coeff_flag_offset[2][6] = {
5269       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5270       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5271     };
5272     static const int last_coeff_flag_offset[2][6] = {
5273       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5274       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5275     };
5276     static const int coeff_abs_level_m1_offset[6] = {
5277         227+0, 227+10, 227+20, 227+30, 227+39, 426
5278     };
5279     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5280       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5281         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5282         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5283        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5284       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5285         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5286         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5287         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5288     };
5289     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5290      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5291      * map node ctx => cabac ctx for level=1 */
5292     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5293     /* map node ctx => cabac ctx for level>1 */
5294     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5295     static const uint8_t coeff_abs_level_transition[2][8] = {
5296     /* update node ctx after decoding a level=1 */
5297         { 1, 2, 3, 3, 4, 5, 6, 7 },
5298     /* update node ctx after decoding a level>1 */
5299         { 4, 4, 4, 4, 5, 6, 7, 7 }
5300     };
5301
5302     int index[64];
5303
5304     int av_unused last;
5305     int coeff_count = 0;
5306     int node_ctx = 0;
5307
5308     uint8_t *significant_coeff_ctx_base;
5309     uint8_t *last_coeff_ctx_base;
5310     uint8_t *abs_level_m1_ctx_base;
5311
5312 #if !ARCH_X86
5313 #define CABAC_ON_STACK
5314 #endif
5315 #ifdef CABAC_ON_STACK
5316 #define CC &cc
5317     CABACContext cc;
5318     cc.range     = h->cabac.range;
5319     cc.low       = h->cabac.low;
5320     cc.bytestream= h->cabac.bytestream;
5321 #else
5322 #define CC &h->cabac
5323 #endif
5324
5325
5326     /* cat: 0-> DC 16x16  n = 0
5327      *      1-> AC 16x16  n = luma4x4idx
5328      *      2-> Luma4x4   n = luma4x4idx
5329      *      3-> DC Chroma n = iCbCr
5330      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5331      *      5-> Luma8x8   n = 4 * luma8x8idx
5332      */
5333
5334     /* read coded block flag */
5335     if( is_dc || cat != 5 ) {
5336         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5337             if( !is_dc )
5338                 h->non_zero_count_cache[scan8[n]] = 0;
5339
5340 #ifdef CABAC_ON_STACK
5341             h->cabac.range     = cc.range     ;
5342             h->cabac.low       = cc.low       ;
5343             h->cabac.bytestream= cc.bytestream;
5344 #endif
5345             return;
5346         }
5347     }
5348
5349     significant_coeff_ctx_base = h->cabac_state
5350         + significant_coeff_flag_offset[MB_FIELD][cat];
5351     last_coeff_ctx_base = h->cabac_state
5352         + last_coeff_flag_offset[MB_FIELD][cat];
5353     abs_level_m1_ctx_base = h->cabac_state
5354         + coeff_abs_level_m1_offset[cat];
5355
5356     if( !is_dc && cat == 5 ) {
5357 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5358         for(last= 0; last < coefs; last++) { \
5359             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5360             if( get_cabac( CC, sig_ctx )) { \
5361                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5362                 index[coeff_count++] = last; \
5363                 if( get_cabac( CC, last_ctx ) ) { \
5364                     last= max_coeff; \
5365                     break; \
5366                 } \
5367             } \
5368         }\
5369         if( last == max_coeff -1 ) {\
5370             index[coeff_count++] = last;\
5371         }
5372         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5373 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5374         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5375     } else {
5376         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5377 #else
5378         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5379     } else {
5380         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5381 #endif
5382     }
5383     assert(coeff_count > 0);
5384
5385     if( is_dc ) {
5386         if( cat == 0 )
5387             h->cbp_table[h->mb_xy] |= 0x100;
5388         else
5389             h->cbp_table[h->mb_xy] |= 0x40 << n;
5390     } else {
5391         if( cat == 5 )
5392             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5393         else {
5394             assert( cat == 1 || cat == 2 || cat == 4 );
5395             h->non_zero_count_cache[scan8[n]] = coeff_count;
5396         }
5397     }
5398
5399     do {
5400         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5401
5402         int j= scantable[index[--coeff_count]];
5403
5404         if( get_cabac( CC, ctx ) == 0 ) {
5405             node_ctx = coeff_abs_level_transition[0][node_ctx];
5406             if( is_dc ) {
5407                 block[j] = get_cabac_bypass_sign( CC, -1);
5408             }else{
5409                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5410             }
5411         } else {
5412             int coeff_abs = 2;
5413             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5414             node_ctx = coeff_abs_level_transition[1][node_ctx];
5415
5416             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5417                 coeff_abs++;
5418             }
5419
5420             if( coeff_abs >= 15 ) {
5421                 int j = 0;
5422                 while( get_cabac_bypass( CC ) ) {
5423                     j++;
5424                 }
5425
5426                 coeff_abs=1;
5427                 while( j-- ) {
5428                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5429                 }
5430                 coeff_abs+= 14;
5431             }
5432
5433             if( is_dc ) {
5434                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5435             }else{
5436                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5437             }
5438         }
5439     } while( coeff_count );
5440 #ifdef CABAC_ON_STACK
5441             h->cabac.range     = cc.range     ;
5442             h->cabac.low       = cc.low       ;
5443             h->cabac.bytestream= cc.bytestream;
5444 #endif
5445
5446 }
5447
5448 #if !CONFIG_SMALL
5449 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5450     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5451 }
5452
5453 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5454     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5455 }
5456 #endif
5457
5458 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5459 #if CONFIG_SMALL
5460     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5461 #else
5462     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5463     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5464 #endif
5465 }
5466
5467 static inline void compute_mb_neighbors(H264Context *h)
5468 {
5469     MpegEncContext * const s = &h->s;
5470     const int mb_xy  = h->mb_xy;
5471     h->top_mb_xy     = mb_xy - s->mb_stride;
5472     h->left_mb_xy[0] = mb_xy - 1;
5473     if(FRAME_MBAFF){
5474         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5475         const int top_pair_xy      = pair_xy     - s->mb_stride;
5476         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5477         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5478         const int curr_mb_field_flag = MB_FIELD;
5479         const int bottom = (s->mb_y & 1);
5480
5481         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5482             h->top_mb_xy -= s->mb_stride;
5483         }
5484         if (!left_mb_field_flag == curr_mb_field_flag) {
5485             h->left_mb_xy[0] = pair_xy - 1;
5486         }
5487     } else if (FIELD_PICTURE) {
5488         h->top_mb_xy -= s->mb_stride;
5489     }
5490     return;
5491 }
5492
5493 /**
5494  * decodes a macroblock
5495  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5496  */
5497 static int decode_mb_cabac(H264Context *h) {
5498     MpegEncContext * const s = &h->s;
5499     int mb_xy;
5500     int mb_type, partition_count, cbp = 0;
5501     int dct8x8_allowed= h->pps.transform_8x8_mode;
5502
5503     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5504
5505     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5506     if( h->slice_type_nos != FF_I_TYPE ) {
5507         int skip;
5508         /* a skipped mb needs the aff flag from the following mb */
5509         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5510             predict_field_decoding_flag(h);
5511         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5512             skip = h->next_mb_skipped;
5513         else
5514             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5515         /* read skip flags */
5516         if( skip ) {
5517             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5518                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5519                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5520                 if(!h->next_mb_skipped)
5521                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5522             }
5523
5524             decode_mb_skip(h);
5525
5526             h->cbp_table[mb_xy] = 0;
5527             h->chroma_pred_mode_table[mb_xy] = 0;
5528             h->last_qscale_diff = 0;
5529
5530             return 0;
5531
5532         }
5533     }
5534     if(FRAME_MBAFF){
5535         if( (s->mb_y&1) == 0 )
5536             h->mb_mbaff =
5537             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5538     }
5539
5540     h->prev_mb_skipped = 0;
5541
5542     compute_mb_neighbors(h);
5543
5544     if( h->slice_type_nos == FF_B_TYPE ) {
5545         mb_type = decode_cabac_mb_type_b( h );
5546         if( mb_type < 23 ){
5547             partition_count= b_mb_type_info[mb_type].partition_count;
5548             mb_type=         b_mb_type_info[mb_type].type;
5549         }else{
5550             mb_type -= 23;
5551             goto decode_intra_mb;
5552         }
5553     } else if( h->slice_type_nos == FF_P_TYPE ) {
5554         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5555             /* P-type */
5556             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5557                 /* P_L0_D16x16, P_8x8 */
5558                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5559             } else {
5560                 /* P_L0_D8x16, P_L0_D16x8 */
5561                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5562             }
5563             partition_count= p_mb_type_info[mb_type].partition_count;
5564             mb_type=         p_mb_type_info[mb_type].type;
5565         } else {
5566             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5567             goto decode_intra_mb;
5568         }
5569     } else {
5570         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5571         if(h->slice_type == FF_SI_TYPE && mb_type)
5572             mb_type--;
5573         assert(h->slice_type_nos == FF_I_TYPE);
5574 decode_intra_mb:
5575         partition_count = 0;
5576         cbp= i_mb_type_info[mb_type].cbp;
5577         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5578         mb_type= i_mb_type_info[mb_type].type;
5579     }
5580     if(MB_FIELD)
5581         mb_type |= MB_TYPE_INTERLACED;
5582
5583     h->slice_table[ mb_xy ]= h->slice_num;
5584
5585     if(IS_INTRA_PCM(mb_type)) {
5586         const uint8_t *ptr;
5587
5588         // We assume these blocks are very rare so we do not optimize it.
5589         // FIXME The two following lines get the bitstream position in the cabac
5590         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5591         ptr= h->cabac.bytestream;
5592         if(h->cabac.low&0x1) ptr--;
5593         if(CABAC_BITS==16){
5594             if(h->cabac.low&0x1FF) ptr--;
5595         }
5596
5597         // The pixels are stored in the same order as levels in h->mb array.
5598         memcpy(h->mb, ptr, 256); ptr+=256;
5599         if(CHROMA){
5600             memcpy(h->mb+128, ptr, 128); ptr+=128;
5601         }
5602
5603         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5604
5605         // All blocks are present
5606         h->cbp_table[mb_xy] = 0x1ef;
5607         h->chroma_pred_mode_table[mb_xy] = 0;
5608         // In deblocking, the quantizer is 0
5609         s->current_picture.qscale_table[mb_xy]= 0;
5610         // All coeffs are present
5611         memset(h->non_zero_count[mb_xy], 16, 16);
5612         s->current_picture.mb_type[mb_xy]= mb_type;
5613         h->last_qscale_diff = 0;
5614         return 0;
5615     }
5616
5617     if(MB_MBAFF){
5618         h->ref_count[0] <<= 1;
5619         h->ref_count[1] <<= 1;
5620     }
5621
5622     fill_caches(h, mb_type, 0);
5623
5624     if( IS_INTRA( mb_type ) ) {
5625         int i, pred_mode;
5626         if( IS_INTRA4x4( mb_type ) ) {
5627             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5628                 mb_type |= MB_TYPE_8x8DCT;
5629                 for( i = 0; i < 16; i+=4 ) {
5630                     int pred = pred_intra_mode( h, i );
5631                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5632                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5633                 }
5634             } else {
5635                 for( i = 0; i < 16; i++ ) {
5636                     int pred = pred_intra_mode( h, i );
5637                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5638
5639                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5640                 }
5641             }
5642             write_back_intra_pred_mode(h);
5643             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5644         } else {
5645             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5646             if( h->intra16x16_pred_mode < 0 ) return -1;
5647         }
5648         if(CHROMA){
5649             h->chroma_pred_mode_table[mb_xy] =
5650             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5651
5652             pred_mode= check_intra_pred_mode( h, pred_mode );
5653             if( pred_mode < 0 ) return -1;
5654             h->chroma_pred_mode= pred_mode;
5655         }
5656     } else if( partition_count == 4 ) {
5657         int i, j, sub_partition_count[4], list, ref[2][4];
5658
5659         if( h->slice_type_nos == FF_B_TYPE ) {
5660             for( i = 0; i < 4; i++ ) {
5661                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5662                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5663                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5664             }
5665             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5666                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5667                 pred_direct_motion(h, &mb_type);
5668                 h->ref_cache[0][scan8[4]] =
5669                 h->ref_cache[1][scan8[4]] =
5670                 h->ref_cache[0][scan8[12]] =
5671                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5672                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5673                     for( i = 0; i < 4; i++ )
5674                         if( IS_DIRECT(h->sub_mb_type[i]) )
5675                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5676                 }
5677             }
5678         } else {
5679             for( i = 0; i < 4; i++ ) {
5680                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5681                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5682                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5683             }
5684         }
5685
5686         for( list = 0; list < h->list_count; list++ ) {
5687                 for( i = 0; i < 4; i++ ) {
5688                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5689                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5690                         if( h->ref_count[list] > 1 ){
5691                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5692                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5693                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5694                                 return -1;
5695                             }
5696                         }else
5697                             ref[list][i] = 0;
5698                     } else {
5699                         ref[list][i] = -1;
5700                     }
5701                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5702                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5703                 }
5704         }
5705
5706         if(dct8x8_allowed)
5707             dct8x8_allowed = get_dct8x8_allowed(h);
5708
5709         for(list=0; list<h->list_count; list++){
5710             for(i=0; i<4; i++){
5711                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5712                 if(IS_DIRECT(h->sub_mb_type[i])){
5713                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5714                     continue;
5715                 }
5716
5717                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5718                     const int sub_mb_type= h->sub_mb_type[i];
5719                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5720                     for(j=0; j<sub_partition_count[i]; j++){
5721                         int mpx, mpy;
5722                         int mx, my;
5723                         const int index= 4*i + block_width*j;
5724                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5725                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5726                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5727
5728                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5729                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5730                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5731
5732                         if(IS_SUB_8X8(sub_mb_type)){
5733                             mv_cache[ 1 ][0]=
5734                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5735                             mv_cache[ 1 ][1]=
5736                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5737
5738                             mvd_cache[ 1 ][0]=
5739                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5740                             mvd_cache[ 1 ][1]=
5741                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5742                         }else if(IS_SUB_8X4(sub_mb_type)){
5743                             mv_cache[ 1 ][0]= mx;
5744                             mv_cache[ 1 ][1]= my;
5745
5746                             mvd_cache[ 1 ][0]= mx - mpx;
5747                             mvd_cache[ 1 ][1]= my - mpy;
5748                         }else if(IS_SUB_4X8(sub_mb_type)){
5749                             mv_cache[ 8 ][0]= mx;
5750                             mv_cache[ 8 ][1]= my;
5751
5752                             mvd_cache[ 8 ][0]= mx - mpx;
5753                             mvd_cache[ 8 ][1]= my - mpy;
5754                         }
5755                         mv_cache[ 0 ][0]= mx;
5756                         mv_cache[ 0 ][1]= my;
5757
5758                         mvd_cache[ 0 ][0]= mx - mpx;
5759                         mvd_cache[ 0 ][1]= my - mpy;
5760                     }
5761                 }else{
5762                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5763                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5764                     p[0] = p[1] = p[8] = p[9] = 0;
5765                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5766                 }
5767             }
5768         }
5769     } else if( IS_DIRECT(mb_type) ) {
5770         pred_direct_motion(h, &mb_type);
5771         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5772         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5773         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5774     } else {
5775         int list, mx, my, i, mpx, mpy;
5776         if(IS_16X16(mb_type)){
5777             for(list=0; list<h->list_count; list++){
5778                 if(IS_DIR(mb_type, 0, list)){
5779                     int ref;
5780                     if(h->ref_count[list] > 1){
5781                         ref= decode_cabac_mb_ref(h, list, 0);
5782                         if(ref >= (unsigned)h->ref_count[list]){
5783                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5784                             return -1;
5785                         }
5786                     }else
5787                         ref=0;
5788                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5789                 }else
5790                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5791             }
5792             for(list=0; list<h->list_count; list++){
5793                 if(IS_DIR(mb_type, 0, list)){
5794                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5795
5796                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5797                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5798                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5799
5800                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5801                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5802                 }else
5803                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5804             }
5805         }
5806         else if(IS_16X8(mb_type)){
5807             for(list=0; list<h->list_count; list++){
5808                     for(i=0; i<2; i++){
5809                         if(IS_DIR(mb_type, i, list)){
5810                             int ref;
5811                             if(h->ref_count[list] > 1){
5812                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5813                                 if(ref >= (unsigned)h->ref_count[list]){
5814                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5815                                     return -1;
5816                                 }
5817                             }else
5818                                 ref=0;
5819                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5820                         }else
5821                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5822                     }
5823             }
5824             for(list=0; list<h->list_count; list++){
5825                 for(i=0; i<2; i++){
5826                     if(IS_DIR(mb_type, i, list)){
5827                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5828                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5829                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5830                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5831
5832                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5833                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5834                     }else{
5835                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5836                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5837                     }
5838                 }
5839             }
5840         }else{
5841             assert(IS_8X16(mb_type));
5842             for(list=0; list<h->list_count; list++){
5843                     for(i=0; i<2; i++){
5844                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5845                             int ref;
5846                             if(h->ref_count[list] > 1){
5847                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5848                                 if(ref >= (unsigned)h->ref_count[list]){
5849                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5850                                     return -1;
5851                                 }
5852                             }else
5853                                 ref=0;
5854                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5855                         }else
5856                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5857                     }
5858             }
5859             for(list=0; list<h->list_count; list++){
5860                 for(i=0; i<2; i++){
5861                     if(IS_DIR(mb_type, i, list)){
5862                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5863                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5864                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5865
5866                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5867                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5868                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5869                     }else{
5870                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5871                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5872                     }
5873                 }
5874             }
5875         }
5876     }
5877
5878    if( IS_INTER( mb_type ) ) {
5879         h->chroma_pred_mode_table[mb_xy] = 0;
5880         write_back_motion( h, mb_type );
5881    }
5882
5883     if( !IS_INTRA16x16( mb_type ) ) {
5884         cbp  = decode_cabac_mb_cbp_luma( h );
5885         if(CHROMA)
5886             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5887     }
5888
5889     h->cbp_table[mb_xy] = h->cbp = cbp;
5890
5891     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5892         if( decode_cabac_mb_transform_size( h ) )
5893             mb_type |= MB_TYPE_8x8DCT;
5894     }
5895     s->current_picture.mb_type[mb_xy]= mb_type;
5896
5897     if( cbp || IS_INTRA16x16( mb_type ) ) {
5898         const uint8_t *scan, *scan8x8, *dc_scan;
5899         const uint32_t *qmul;
5900         int dqp;
5901
5902         if(IS_INTERLACED(mb_type)){
5903             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5904             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5905             dc_scan= luma_dc_field_scan;
5906         }else{
5907             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5908             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5909             dc_scan= luma_dc_zigzag_scan;
5910         }
5911
5912         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5913         if( dqp == INT_MIN ){
5914             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5915             return -1;
5916         }
5917         s->qscale += dqp;
5918         if(((unsigned)s->qscale) > 51){
5919             if(s->qscale<0) s->qscale+= 52;
5920             else            s->qscale-= 52;
5921         }
5922         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5923         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5924
5925         if( IS_INTRA16x16( mb_type ) ) {
5926             int i;
5927             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5928             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5929
5930             if( cbp&15 ) {
5931                 qmul = h->dequant4_coeff[0][s->qscale];
5932                 for( i = 0; i < 16; i++ ) {
5933                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5934                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5935                 }
5936             } else {
5937                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5938             }
5939         } else {
5940             int i8x8, i4x4;
5941             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5942                 if( cbp & (1<<i8x8) ) {
5943                     if( IS_8x8DCT(mb_type) ) {
5944                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5945                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5946                     } else {
5947                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5948                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5949                             const int index = 4*i8x8 + i4x4;
5950                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5951 //START_TIMER
5952                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5953 //STOP_TIMER("decode_residual")
5954                         }
5955                     }
5956                 } else {
5957                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5958                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5959                 }
5960             }
5961         }
5962
5963         if( cbp&0x30 ){
5964             int c;
5965             for( c = 0; c < 2; c++ ) {
5966                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5967                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5968             }
5969         }
5970
5971         if( cbp&0x20 ) {
5972             int c, i;
5973             for( c = 0; c < 2; c++ ) {
5974                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5975                 for( i = 0; i < 4; i++ ) {
5976                     const int index = 16 + 4 * c + i;
5977                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5978                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5979                 }
5980             }
5981         } else {
5982             uint8_t * const nnz= &h->non_zero_count_cache[0];
5983             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5984             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5985         }
5986     } else {
5987         uint8_t * const nnz= &h->non_zero_count_cache[0];
5988         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5989         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5990         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5991         h->last_qscale_diff = 0;
5992     }
5993
5994     s->current_picture.qscale_table[mb_xy]= s->qscale;
5995     write_back_non_zero_count(h);
5996
5997     if(MB_MBAFF){
5998         h->ref_count[0] >>= 1;
5999         h->ref_count[1] >>= 1;
6000     }
6001
6002     return 0;
6003 }
6004
6005
6006 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6007     const int index_a = qp + h->slice_alpha_c0_offset;
6008     const int alpha = (alpha_table+52)[index_a];
6009     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6010     if (alpha ==0 || beta == 0) return;
6011
6012     if( bS[0] < 4 ) {
6013         int8_t tc[4];
6014         tc[0] = (tc0_table+52)[index_a][bS[0]];
6015         tc[1] = (tc0_table+52)[index_a][bS[1]];
6016         tc[2] = (tc0_table+52)[index_a][bS[2]];
6017         tc[3] = (tc0_table+52)[index_a][bS[3]];
6018         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6019     } else {
6020         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6021     }
6022 }
6023 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6024     const int index_a = qp + h->slice_alpha_c0_offset;
6025     const int alpha = (alpha_table+52)[index_a];
6026     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6027     if (alpha ==0 || beta == 0) return;
6028
6029     if( bS[0] < 4 ) {
6030         int8_t tc[4];
6031         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6032         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6033         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6034         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6035         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6036     } else {
6037         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6038     }
6039 }
6040
6041 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6042     int i;
6043     for( i = 0; i < 16; i++, pix += stride) {
6044         int index_a;
6045         int alpha;
6046         int beta;
6047
6048         int qp_index;
6049         int bS_index = (i >> 1);
6050         if (!MB_FIELD) {
6051             bS_index &= ~1;
6052             bS_index |= (i & 1);
6053         }
6054
6055         if( bS[bS_index] == 0 ) {
6056             continue;
6057         }
6058
6059         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6060         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6061         alpha = (alpha_table+52)[index_a];
6062         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6063
6064         if( bS[bS_index] < 4 ) {
6065             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6066             const int p0 = pix[-1];
6067             const int p1 = pix[-2];
6068             const int p2 = pix[-3];
6069             const int q0 = pix[0];
6070             const int q1 = pix[1];
6071             const int q2 = pix[2];
6072
6073             if( FFABS( p0 - q0 ) < alpha &&
6074                 FFABS( p1 - p0 ) < beta &&
6075                 FFABS( q1 - q0 ) < beta ) {
6076                 int tc = tc0;
6077                 int i_delta;
6078
6079                 if( FFABS( p2 - p0 ) < beta ) {
6080                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6081                     tc++;
6082                 }
6083                 if( FFABS( q2 - q0 ) < beta ) {
6084                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6085                     tc++;
6086                 }
6087
6088                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6089                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6090                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6091                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6092             }
6093         }else{
6094             const int p0 = pix[-1];
6095             const int p1 = pix[-2];
6096             const int p2 = pix[-3];
6097
6098             const int q0 = pix[0];
6099             const int q1 = pix[1];
6100             const int q2 = pix[2];
6101
6102             if( FFABS( p0 - q0 ) < alpha &&
6103                 FFABS( p1 - p0 ) < beta &&
6104                 FFABS( q1 - q0 ) < beta ) {
6105
6106                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6107                     if( FFABS( p2 - p0 ) < beta)
6108                     {
6109                         const int p3 = pix[-4];
6110                         /* p0', p1', p2' */
6111                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6112                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6113                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6114                     } else {
6115                         /* p0' */
6116                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6117                     }
6118                     if( FFABS( q2 - q0 ) < beta)
6119                     {
6120                         const int q3 = pix[3];
6121                         /* q0', q1', q2' */
6122                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6123                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6124                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6125                     } else {
6126                         /* q0' */
6127                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6128                     }
6129                 }else{
6130                     /* p0', q0' */
6131                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6132                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6133                 }
6134                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6135             }
6136         }
6137     }
6138 }
6139 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6140     int i;
6141     for( i = 0; i < 8; i++, pix += stride) {
6142         int index_a;
6143         int alpha;
6144         int beta;
6145
6146         int qp_index;
6147         int bS_index = i;
6148
6149         if( bS[bS_index] == 0 ) {
6150             continue;
6151         }
6152
6153         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6154         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6155         alpha = (alpha_table+52)[index_a];
6156         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6157
6158         if( bS[bS_index] < 4 ) {
6159             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6160             const int p0 = pix[-1];
6161             const int p1 = pix[-2];
6162             const int q0 = pix[0];
6163             const int q1 = pix[1];
6164
6165             if( FFABS( p0 - q0 ) < alpha &&
6166                 FFABS( p1 - p0 ) < beta &&
6167                 FFABS( q1 - q0 ) < beta ) {
6168                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6169
6170                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6171                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6172                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6173             }
6174         }else{
6175             const int p0 = pix[-1];
6176             const int p1 = pix[-2];
6177             const int q0 = pix[0];
6178             const int q1 = pix[1];
6179
6180             if( FFABS( p0 - q0 ) < alpha &&
6181                 FFABS( p1 - p0 ) < beta &&
6182                 FFABS( q1 - q0 ) < beta ) {
6183
6184                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6185                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6186                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6187             }
6188         }
6189     }
6190 }
6191
6192 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6193     const int index_a = qp + h->slice_alpha_c0_offset;
6194     const int alpha = (alpha_table+52)[index_a];
6195     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6196     if (alpha ==0 || beta == 0) return;
6197
6198     if( bS[0] < 4 ) {
6199         int8_t tc[4];
6200         tc[0] = (tc0_table+52)[index_a][bS[0]];
6201         tc[1] = (tc0_table+52)[index_a][bS[1]];
6202         tc[2] = (tc0_table+52)[index_a][bS[2]];
6203         tc[3] = (tc0_table+52)[index_a][bS[3]];
6204         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6205     } else {
6206         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6207     }
6208 }
6209
6210 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6211     const int index_a = qp + h->slice_alpha_c0_offset;
6212     const int alpha = (alpha_table+52)[index_a];
6213     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6214     if (alpha ==0 || beta == 0) return;
6215
6216     if( bS[0] < 4 ) {
6217         int8_t tc[4];
6218         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6219         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6220         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6221         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6222         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6223     } else {
6224         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6225     }
6226 }
6227
6228 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6229     MpegEncContext * const s = &h->s;
6230     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6231     int mb_xy, mb_type;
6232     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6233
6234     mb_xy = h->mb_xy;
6235
6236     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6237         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6238        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6239                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6240         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6241         return;
6242     }
6243     assert(!FRAME_MBAFF);
6244
6245     mb_type = s->current_picture.mb_type[mb_xy];
6246     qp = s->current_picture.qscale_table[mb_xy];
6247     qp0 = s->current_picture.qscale_table[mb_xy-1];
6248     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6249     qpc = get_chroma_qp( h, 0, qp );
6250     qpc0 = get_chroma_qp( h, 0, qp0 );
6251     qpc1 = get_chroma_qp( h, 0, qp1 );
6252     qp0 = (qp + qp0 + 1) >> 1;
6253     qp1 = (qp + qp1 + 1) >> 1;
6254     qpc0 = (qpc + qpc0 + 1) >> 1;
6255     qpc1 = (qpc + qpc1 + 1) >> 1;
6256     qp_thresh = 15 - h->slice_alpha_c0_offset;
6257     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6258        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6259         return;
6260
6261     if( IS_INTRA(mb_type) ) {
6262         int16_t bS4[4] = {4,4,4,4};
6263         int16_t bS3[4] = {3,3,3,3};
6264         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6265         if( IS_8x8DCT(mb_type) ) {
6266             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6267             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6268             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6269             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6270         } else {
6271             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6272             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6273             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6274             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6275             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6276             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6277             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6278             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6279         }
6280         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6281         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6282         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6283         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6284         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6285         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6286         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6287         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6288         return;
6289     } else {
6290         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6291         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6292         int edges;
6293         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6294             edges = 4;
6295             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6296         } else {
6297             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6298                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6299             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6300                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6301                              ? 3 : 0;
6302             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6303             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6304             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6305                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6306         }
6307         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6308             bSv[0][0] = 0x0004000400040004ULL;
6309         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6310             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6311
6312 #define FILTER(hv,dir,edge)\
6313         if(bSv[dir][edge]) {\
6314             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6315             if(!(edge&1)) {\
6316                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6317                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6318             }\
6319         }
6320         if( edges == 1 ) {
6321             FILTER(v,0,0);
6322             FILTER(h,1,0);
6323         } else if( IS_8x8DCT(mb_type) ) {
6324             FILTER(v,0,0);
6325             FILTER(v,0,2);
6326             FILTER(h,1,0);
6327             FILTER(h,1,2);
6328         } else {
6329             FILTER(v,0,0);
6330             FILTER(v,0,1);
6331             FILTER(v,0,2);
6332             FILTER(v,0,3);
6333             FILTER(h,1,0);
6334             FILTER(h,1,1);
6335             FILTER(h,1,2);
6336             FILTER(h,1,3);
6337         }
6338 #undef FILTER
6339     }
6340 }
6341
6342
6343 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6344     MpegEncContext * const s = &h->s;
6345     int edge;
6346     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6347     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6348     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6349     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6350     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6351
6352     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6353                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6354     // how often to recheck mv-based bS when iterating between edges
6355     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6356                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6357     // how often to recheck mv-based bS when iterating along each edge
6358     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6359
6360     if (first_vertical_edge_done) {
6361         start = 1;
6362     }
6363
6364     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6365         start = 1;
6366
6367     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6368         && !IS_INTERLACED(mb_type)
6369         && IS_INTERLACED(mbm_type)
6370         ) {
6371         // This is a special case in the norm where the filtering must
6372         // be done twice (one each of the field) even if we are in a
6373         // frame macroblock.
6374         //
6375         static const int nnz_idx[4] = {4,5,6,3};
6376         unsigned int tmp_linesize   = 2 *   linesize;
6377         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6378         int mbn_xy = mb_xy - 2 * s->mb_stride;
6379         int qp;
6380         int i, j;
6381         int16_t bS[4];
6382
6383         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6384             if( IS_INTRA(mb_type) ||
6385                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6386                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6387             } else {
6388                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6389                 for( i = 0; i < 4; i++ ) {
6390                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6391                         mbn_nnz[nnz_idx[i]] != 0 )
6392                         bS[i] = 2;
6393                     else
6394                         bS[i] = 1;
6395                 }
6396             }
6397             // Do not use s->qscale as luma quantizer because it has not the same
6398             // value in IPCM macroblocks.
6399             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6400             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6401             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6402             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6403             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6404                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6405             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6406                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6407         }
6408
6409         start = 1;
6410     }
6411
6412     /* Calculate bS */
6413     for( edge = start; edge < edges; edge++ ) {
6414         /* mbn_xy: neighbor macroblock */
6415         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6416         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6417         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6418         int16_t bS[4];
6419         int qp;
6420
6421         if( (edge&1) && IS_8x8DCT(mb_type) )
6422             continue;
6423
6424         if( IS_INTRA(mb_type) ||
6425             IS_INTRA(mbn_type) ) {
6426             int value;
6427             if (edge == 0) {
6428                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6429                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6430                 ) {
6431                     value = 4;
6432                 } else {
6433                     value = 3;
6434                 }
6435             } else {
6436                 value = 3;
6437             }
6438             bS[0] = bS[1] = bS[2] = bS[3] = value;
6439         } else {
6440             int i, l;
6441             int mv_done;
6442
6443             if( edge & mask_edge ) {
6444                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6445                 mv_done = 1;
6446             }
6447             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6448                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6449                 mv_done = 1;
6450             }
6451             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6452                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6453                 int bn_idx= b_idx - (dir ? 8:1);
6454                 int v = 0;
6455
6456                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6457                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6458                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6459                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6460                 }
6461
6462                 if(h->slice_type_nos == FF_B_TYPE && v){
6463                     v=0;
6464                     for( l = 0; !v && l < 2; l++ ) {
6465                         int ln= 1-l;
6466                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6467                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6468                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6469                     }
6470                 }
6471
6472                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6473                 mv_done = 1;
6474             }
6475             else
6476                 mv_done = 0;
6477
6478             for( i = 0; i < 4; i++ ) {
6479                 int x = dir == 0 ? edge : i;
6480                 int y = dir == 0 ? i    : edge;
6481                 int b_idx= 8 + 4 + x + 8*y;
6482                 int bn_idx= b_idx - (dir ? 8:1);
6483
6484                 if( h->non_zero_count_cache[b_idx] |
6485                     h->non_zero_count_cache[bn_idx] ) {
6486                     bS[i] = 2;
6487                 }
6488                 else if(!mv_done)
6489                 {
6490                     bS[i] = 0;
6491                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6492                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6493                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6494                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6495                             bS[i] = 1;
6496                             break;
6497                         }
6498                     }
6499
6500                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6501                         bS[i] = 0;
6502                         for( l = 0; l < 2; l++ ) {
6503                             int ln= 1-l;
6504                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6505                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6506                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6507                                 bS[i] = 1;
6508                                 break;
6509                             }
6510                         }
6511                     }
6512                 }
6513             }
6514
6515             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6516                 continue;
6517         }
6518
6519         /* Filter edge */
6520         // Do not use s->qscale as luma quantizer because it has not the same
6521         // value in IPCM macroblocks.
6522         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6523         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6524         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6525         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6526         if( dir == 0 ) {
6527             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6528             if( (edge&1) == 0 ) {
6529                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6530                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6531                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6532                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6533             }
6534         } else {
6535             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6536             if( (edge&1) == 0 ) {
6537                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6538                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6539                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6540                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6541             }
6542         }
6543     }
6544 }
6545
6546 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6547     MpegEncContext * const s = &h->s;
6548     const int mb_xy= mb_x + mb_y*s->mb_stride;
6549     const int mb_type = s->current_picture.mb_type[mb_xy];
6550     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6551     int first_vertical_edge_done = 0;
6552     av_unused int dir;
6553
6554     //for sufficiently low qp, filtering wouldn't do anything
6555     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6556     if(!FRAME_MBAFF){
6557         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6558         int qp = s->current_picture.qscale_table[mb_xy];
6559         if(qp <= qp_thresh
6560            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6561            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6562             return;
6563         }
6564     }
6565
6566     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6567     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6568         int top_type, left_type[2];
6569         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6570         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6571         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6572
6573         if(IS_8x8DCT(top_type)){
6574             h->non_zero_count_cache[4+8*0]=
6575             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6576             h->non_zero_count_cache[6+8*0]=
6577             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6578         }
6579         if(IS_8x8DCT(left_type[0])){
6580             h->non_zero_count_cache[3+8*1]=
6581             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6582         }
6583         if(IS_8x8DCT(left_type[1])){
6584             h->non_zero_count_cache[3+8*3]=
6585             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6586         }
6587
6588         if(IS_8x8DCT(mb_type)){
6589             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6590             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6591
6592             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6593             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6594
6595             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6596             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6597
6598             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6599             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6600         }
6601     }
6602
6603     if (FRAME_MBAFF
6604             // left mb is in picture
6605             && h->slice_table[mb_xy-1] != 0xFFFF
6606             // and current and left pair do not have the same interlaced type
6607             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6608             // and left mb is in the same slice if deblocking_filter == 2
6609             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6610         /* First vertical edge is different in MBAFF frames
6611          * There are 8 different bS to compute and 2 different Qp
6612          */
6613         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6614         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6615         int16_t bS[8];
6616         int qp[2];
6617         int bqp[2];
6618         int rqp[2];
6619         int mb_qp, mbn0_qp, mbn1_qp;
6620         int i;
6621         first_vertical_edge_done = 1;
6622
6623         if( IS_INTRA(mb_type) )
6624             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6625         else {
6626             for( i = 0; i < 8; i++ ) {
6627                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6628
6629                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6630                     bS[i] = 4;
6631                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6632                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6633                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6634                                                                        :
6635                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6636                     bS[i] = 2;
6637                 else
6638                     bS[i] = 1;
6639             }
6640         }
6641
6642         mb_qp = s->current_picture.qscale_table[mb_xy];
6643         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6644         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6645         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6646         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6647                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6648         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6649                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6650         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6651         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6652                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6653         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6654                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6655
6656         /* Filter edge */
6657         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6658         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6659         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6660         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6661         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6662     }
6663
6664 #if CONFIG_SMALL
6665     for( dir = 0; dir < 2; dir++ )
6666         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6667 #else
6668     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6669     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6670 #endif
6671 }
6672
6673 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6674     H264Context *h = *(void**)arg;
6675     MpegEncContext * const s = &h->s;
6676     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6677
6678     s->mb_skip_run= -1;
6679
6680     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6681                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6682
6683     if( h->pps.cabac ) {
6684         int i;
6685
6686         /* realign */
6687         align_get_bits( &s->gb );
6688
6689         /* init cabac */
6690         ff_init_cabac_states( &h->cabac);
6691         ff_init_cabac_decoder( &h->cabac,
6692                                s->gb.buffer + get_bits_count(&s->gb)/8,
6693                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6694         /* calculate pre-state */
6695         for( i= 0; i < 460; i++ ) {
6696             int pre;
6697             if( h->slice_type_nos == FF_I_TYPE )
6698                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6699             else
6700                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6701
6702             if( pre <= 63 )
6703                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6704             else
6705                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6706         }
6707
6708         for(;;){
6709 //START_TIMER
6710             int ret = decode_mb_cabac(h);
6711             int eos;
6712 //STOP_TIMER("decode_mb_cabac")
6713
6714             if(ret>=0) hl_decode_mb(h);
6715
6716             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6717                 s->mb_y++;
6718
6719                 ret = decode_mb_cabac(h);
6720
6721                 if(ret>=0) hl_decode_mb(h);
6722                 s->mb_y--;
6723             }
6724             eos = get_cabac_terminate( &h->cabac );
6725
6726             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6727                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6728                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6729                 return -1;
6730             }
6731
6732             if( ++s->mb_x >= s->mb_width ) {
6733                 s->mb_x = 0;
6734                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6735                 ++s->mb_y;
6736                 if(FIELD_OR_MBAFF_PICTURE) {
6737                     ++s->mb_y;
6738                 }
6739             }
6740
6741             if( eos || s->mb_y >= s->mb_height ) {
6742                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6743                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6744                 return 0;
6745             }
6746         }
6747
6748     } else {
6749         for(;;){
6750             int ret = decode_mb_cavlc(h);
6751
6752             if(ret>=0) hl_decode_mb(h);
6753
6754             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6755                 s->mb_y++;
6756                 ret = decode_mb_cavlc(h);
6757
6758                 if(ret>=0) hl_decode_mb(h);
6759                 s->mb_y--;
6760             }
6761
6762             if(ret<0){
6763                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6764                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6765
6766                 return -1;
6767             }
6768
6769             if(++s->mb_x >= s->mb_width){
6770                 s->mb_x=0;
6771                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6772                 ++s->mb_y;
6773                 if(FIELD_OR_MBAFF_PICTURE) {
6774                     ++s->mb_y;
6775                 }
6776                 if(s->mb_y >= s->mb_height){
6777                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6778
6779                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6780                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6781
6782                         return 0;
6783                     }else{
6784                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6785
6786                         return -1;
6787                     }
6788                 }
6789             }
6790
6791             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6792                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6793                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6794                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6795
6796                     return 0;
6797                 }else{
6798                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6799
6800                     return -1;
6801                 }
6802             }
6803         }
6804     }
6805
6806 #if 0
6807     for(;s->mb_y < s->mb_height; s->mb_y++){
6808         for(;s->mb_x < s->mb_width; s->mb_x++){
6809             int ret= decode_mb(h);
6810
6811             hl_decode_mb(h);
6812
6813             if(ret<0){
6814                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6815                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6816
6817                 return -1;
6818             }
6819
6820             if(++s->mb_x >= s->mb_width){
6821                 s->mb_x=0;
6822                 if(++s->mb_y >= s->mb_height){
6823                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6824                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6825
6826                         return 0;
6827                     }else{
6828                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6829
6830                         return -1;
6831                     }
6832                 }
6833             }
6834
6835             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6836                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6837                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6838
6839                     return 0;
6840                 }else{
6841                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6842
6843                     return -1;
6844                 }
6845             }
6846         }
6847         s->mb_x=0;
6848         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6849     }
6850 #endif
6851     return -1; //not reached
6852 }
6853
6854 static int decode_picture_timing(H264Context *h){
6855     MpegEncContext * const s = &h->s;
6856     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6857         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6858         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6859     }
6860     if(h->sps.pic_struct_present_flag){
6861         unsigned int i, num_clock_ts;
6862         h->sei_pic_struct = get_bits(&s->gb, 4);
6863         h->sei_ct_type    = 0;
6864
6865         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6866             return -1;
6867
6868         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6869
6870         for (i = 0 ; i < num_clock_ts ; i++){
6871             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6872                 unsigned int full_timestamp_flag;
6873                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6874                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6875                 skip_bits(&s->gb, 5);                 /* counting_type */
6876                 full_timestamp_flag = get_bits(&s->gb, 1);
6877                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6878                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6879                 skip_bits(&s->gb, 8);                 /* n_frames */
6880                 if(full_timestamp_flag){
6881                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6882                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6883                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6884                 }else{
6885                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6886                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6887                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6888                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6889                             if(get_bits(&s->gb, 1))   /* hours_flag */
6890                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6891                         }
6892                     }
6893                 }
6894                 if(h->sps.time_offset_length > 0)
6895                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6896             }
6897         }
6898
6899         if(s->avctx->debug & FF_DEBUG_PICT_INFO)
6900             av_log(s->avctx, AV_LOG_DEBUG, "ct_type:%X pic_struct:%d\n", h->sei_ct_type, h->sei_pic_struct);
6901     }
6902     return 0;
6903 }
6904
6905 static int decode_unregistered_user_data(H264Context *h, int size){
6906     MpegEncContext * const s = &h->s;
6907     uint8_t user_data[16+256];
6908     int e, build, i;
6909
6910     if(size<16)
6911         return -1;
6912
6913     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6914         user_data[i]= get_bits(&s->gb, 8);
6915     }
6916
6917     user_data[i]= 0;
6918     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6919     if(e==1 && build>=0)
6920         h->x264_build= build;
6921
6922     if(s->avctx->debug & FF_DEBUG_BUGS)
6923         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6924
6925     for(; i<size; i++)
6926         skip_bits(&s->gb, 8);
6927
6928     return 0;
6929 }
6930
6931 static int decode_recovery_point(H264Context *h){
6932     MpegEncContext * const s = &h->s;
6933
6934     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6935     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6936
6937     return 0;
6938 }
6939
6940 static int decode_buffering_period(H264Context *h){
6941     MpegEncContext * const s = &h->s;
6942     unsigned int sps_id;
6943     int sched_sel_idx;
6944     SPS *sps;
6945
6946     sps_id = get_ue_golomb_31(&s->gb);
6947     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6948         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6949         return -1;
6950     }
6951     sps = h->sps_buffers[sps_id];
6952
6953     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6954     if (sps->nal_hrd_parameters_present_flag) {
6955         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6956             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6957             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6958         }
6959     }
6960     if (sps->vcl_hrd_parameters_present_flag) {
6961         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6962             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6963             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6964         }
6965     }
6966
6967     h->sei_buffering_period_present = 1;
6968     return 0;
6969 }
6970
6971 int ff_h264_decode_sei(H264Context *h){
6972     MpegEncContext * const s = &h->s;
6973
6974     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6975         int size, type;
6976
6977         type=0;
6978         do{
6979             type+= show_bits(&s->gb, 8);
6980         }while(get_bits(&s->gb, 8) == 255);
6981
6982         size=0;
6983         do{
6984             size+= show_bits(&s->gb, 8);
6985         }while(get_bits(&s->gb, 8) == 255);
6986
6987         switch(type){
6988         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6989             if(decode_picture_timing(h) < 0)
6990                 return -1;
6991             break;
6992         case SEI_TYPE_USER_DATA_UNREGISTERED:
6993             if(decode_unregistered_user_data(h, size) < 0)
6994                 return -1;
6995             break;
6996         case SEI_TYPE_RECOVERY_POINT:
6997             if(decode_recovery_point(h) < 0)
6998                 return -1;
6999             break;
7000         case SEI_BUFFERING_PERIOD:
7001             if(decode_buffering_period(h) < 0)
7002                 return -1;
7003             break;
7004         default:
7005             skip_bits(&s->gb, 8*size);
7006         }
7007
7008         //FIXME check bits here
7009         align_get_bits(&s->gb);
7010     }
7011
7012     return 0;
7013 }
7014
7015 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7016     MpegEncContext * const s = &h->s;
7017     int cpb_count, i;
7018     cpb_count = get_ue_golomb_31(&s->gb) + 1;
7019
7020     if(cpb_count > 32U){
7021         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7022         return -1;
7023     }
7024
7025     get_bits(&s->gb, 4); /* bit_rate_scale */
7026     get_bits(&s->gb, 4); /* cpb_size_scale */
7027     for(i=0; i<cpb_count; i++){
7028         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7029         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7030         get_bits1(&s->gb);     /* cbr_flag */
7031     }
7032     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7033     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7034     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7035     sps->time_offset_length = get_bits(&s->gb, 5);
7036     sps->cpb_cnt = cpb_count;
7037     return 0;
7038 }
7039
7040 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7041     MpegEncContext * const s = &h->s;
7042     int aspect_ratio_info_present_flag;
7043     unsigned int aspect_ratio_idc;
7044
7045     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7046
7047     if( aspect_ratio_info_present_flag ) {
7048         aspect_ratio_idc= get_bits(&s->gb, 8);
7049         if( aspect_ratio_idc == EXTENDED_SAR ) {
7050             sps->sar.num= get_bits(&s->gb, 16);
7051             sps->sar.den= get_bits(&s->gb, 16);
7052         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7053             sps->sar=  pixel_aspect[aspect_ratio_idc];
7054         }else{
7055             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7056             return -1;
7057         }
7058     }else{
7059         sps->sar.num=
7060         sps->sar.den= 0;
7061     }
7062 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7063
7064     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7065         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7066     }
7067
7068     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7069         get_bits(&s->gb, 3);    /* video_format */
7070         get_bits1(&s->gb);      /* video_full_range_flag */
7071         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7072             get_bits(&s->gb, 8); /* colour_primaries */
7073             get_bits(&s->gb, 8); /* transfer_characteristics */
7074             get_bits(&s->gb, 8); /* matrix_coefficients */
7075         }
7076     }
7077
7078     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7079         s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1;  /* chroma_sample_location_type_top_field */
7080         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7081     }
7082
7083     sps->timing_info_present_flag = get_bits1(&s->gb);
7084     if(sps->timing_info_present_flag){
7085         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7086         sps->time_scale = get_bits_long(&s->gb, 32);
7087         if(sps->num_units_in_tick-1 > 0x7FFFFFFEU || sps->time_scale-1 > 0x7FFFFFFEU){
7088             av_log(h->s.avctx, AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
7089             return -1;
7090         }
7091         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7092     }
7093
7094     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7095     if(sps->nal_hrd_parameters_present_flag)
7096         if(decode_hrd_parameters(h, sps) < 0)
7097             return -1;
7098     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7099     if(sps->vcl_hrd_parameters_present_flag)
7100         if(decode_hrd_parameters(h, sps) < 0)
7101             return -1;
7102     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7103         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7104     sps->pic_struct_present_flag = get_bits1(&s->gb);
7105
7106     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7107     if(sps->bitstream_restriction_flag){
7108         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7109         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7110         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7111         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7112         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7113         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7114         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7115
7116         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7117             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7118             return -1;
7119         }
7120     }
7121
7122     return 0;
7123 }
7124
7125 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7126                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7127     MpegEncContext * const s = &h->s;
7128     int i, last = 8, next = 8;
7129     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7130     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7131         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7132     else
7133     for(i=0;i<size;i++){
7134         if(next)
7135             next = (last + get_se_golomb(&s->gb)) & 0xff;
7136         if(!i && !next){ /* matrix not written, we use the preset one */
7137             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7138             break;
7139         }
7140         last = factors[scan[i]] = next ? next : last;
7141     }
7142 }
7143
7144 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7145                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7146     MpegEncContext * const s = &h->s;
7147     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7148     const uint8_t *fallback[4] = {
7149         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7150         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7151         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7152         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7153     };
7154     if(get_bits1(&s->gb)){
7155         sps->scaling_matrix_present |= is_sps;
7156         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7157         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7158         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7159         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7160         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7161         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7162         if(is_sps || pps->transform_8x8_mode){
7163             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7164             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7165         }
7166     }
7167 }
7168
7169 int ff_h264_decode_seq_parameter_set(H264Context *h){
7170     MpegEncContext * const s = &h->s;
7171     int profile_idc, level_idc;
7172     unsigned int sps_id;
7173     int i;
7174     SPS *sps;
7175
7176     profile_idc= get_bits(&s->gb, 8);
7177     get_bits1(&s->gb);   //constraint_set0_flag
7178     get_bits1(&s->gb);   //constraint_set1_flag
7179     get_bits1(&s->gb);   //constraint_set2_flag
7180     get_bits1(&s->gb);   //constraint_set3_flag
7181     get_bits(&s->gb, 4); // reserved
7182     level_idc= get_bits(&s->gb, 8);
7183     sps_id= get_ue_golomb_31(&s->gb);
7184
7185     if(sps_id >= MAX_SPS_COUNT) {
7186         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7187         return -1;
7188     }
7189     sps= av_mallocz(sizeof(SPS));
7190     if(sps == NULL)
7191         return -1;
7192
7193     sps->profile_idc= profile_idc;
7194     sps->level_idc= level_idc;
7195
7196     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7197     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7198     sps->scaling_matrix_present = 0;
7199
7200     if(sps->profile_idc >= 100){ //high profile
7201         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7202         if(sps->chroma_format_idc == 3)
7203             sps->residual_color_transform_flag = get_bits1(&s->gb);
7204         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7205         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7206         sps->transform_bypass = get_bits1(&s->gb);
7207         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7208     }else{
7209         sps->chroma_format_idc= 1;
7210     }
7211
7212     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7213     sps->poc_type= get_ue_golomb_31(&s->gb);
7214
7215     if(sps->poc_type == 0){ //FIXME #define
7216         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7217     } else if(sps->poc_type == 1){//FIXME #define
7218         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7219         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7220         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7221         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7222
7223         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7224             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7225             goto fail;
7226         }
7227
7228         for(i=0; i<sps->poc_cycle_length; i++)
7229             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7230     }else if(sps->poc_type != 2){
7231         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7232         goto fail;
7233     }
7234
7235     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7236     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7237         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7238         goto fail;
7239     }
7240     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7241     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7242     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7243     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7244        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7245         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7246         goto fail;
7247     }
7248
7249     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7250     if(!sps->frame_mbs_only_flag)
7251         sps->mb_aff= get_bits1(&s->gb);
7252     else
7253         sps->mb_aff= 0;
7254
7255     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7256
7257 #ifndef ALLOW_INTERLACE
7258     if(sps->mb_aff)
7259         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7260 #endif
7261     sps->crop= get_bits1(&s->gb);
7262     if(sps->crop){
7263         sps->crop_left  = get_ue_golomb(&s->gb);
7264         sps->crop_right = get_ue_golomb(&s->gb);
7265         sps->crop_top   = get_ue_golomb(&s->gb);
7266         sps->crop_bottom= get_ue_golomb(&s->gb);
7267         if(sps->crop_left || sps->crop_top){
7268             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7269         }
7270         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7271             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7272         }
7273     }else{
7274         sps->crop_left  =
7275         sps->crop_right =
7276         sps->crop_top   =
7277         sps->crop_bottom= 0;
7278     }
7279
7280     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7281     if( sps->vui_parameters_present_flag )
7282         if (decode_vui_parameters(h, sps) < 0)
7283             goto fail;
7284
7285     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7286         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7287                sps_id, sps->profile_idc, sps->level_idc,
7288                sps->poc_type,
7289                sps->ref_frame_count,
7290                sps->mb_width, sps->mb_height,
7291                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7292                sps->direct_8x8_inference_flag ? "8B8" : "",
7293                sps->crop_left, sps->crop_right,
7294                sps->crop_top, sps->crop_bottom,
7295                sps->vui_parameters_present_flag ? "VUI" : "",
7296                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7297                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7298                sps->timing_info_present_flag ? sps->time_scale : 0
7299                );
7300     }
7301
7302     av_free(h->sps_buffers[sps_id]);
7303     h->sps_buffers[sps_id]= sps;
7304     h->sps = *sps;
7305     return 0;
7306 fail:
7307     av_free(sps);
7308     return -1;
7309 }
7310
7311 static void
7312 build_qp_table(PPS *pps, int t, int index)
7313 {
7314     int i;
7315     for(i = 0; i < 52; i++)
7316         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7317 }
7318
7319 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7320     MpegEncContext * const s = &h->s;
7321     unsigned int pps_id= get_ue_golomb(&s->gb);
7322     PPS *pps;
7323
7324     if(pps_id >= MAX_PPS_COUNT) {
7325         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7326         return -1;
7327     }
7328
7329     pps= av_mallocz(sizeof(PPS));
7330     if(pps == NULL)
7331         return -1;
7332     pps->sps_id= get_ue_golomb_31(&s->gb);
7333     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7334         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7335         goto fail;
7336     }
7337
7338     pps->cabac= get_bits1(&s->gb);
7339     pps->pic_order_present= get_bits1(&s->gb);
7340     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7341     if(pps->slice_group_count > 1 ){
7342         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7343         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7344         switch(pps->mb_slice_group_map_type){
7345         case 0:
7346 #if 0
7347 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7348 |    run_length[ i ]                                |1  |ue(v)   |
7349 #endif
7350             break;
7351         case 2:
7352 #if 0
7353 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7354 |{                                                  |   |        |
7355 |    top_left_mb[ i ]                               |1  |ue(v)   |
7356 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7357 |   }                                               |   |        |
7358 #endif
7359             break;
7360         case 3:
7361         case 4:
7362         case 5:
7363 #if 0
7364 |   slice_group_change_direction_flag               |1  |u(1)    |
7365 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7366 #endif
7367             break;
7368         case 6:
7369 #if 0
7370 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7371 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7372 |)                                                  |   |        |
7373 |    slice_group_id[ i ]                            |1  |u(v)    |
7374 #endif
7375             break;
7376         }
7377     }
7378     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7379     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7380     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7381         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7382         goto fail;
7383     }
7384
7385     pps->weighted_pred= get_bits1(&s->gb);
7386     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7387     pps->init_qp= get_se_golomb(&s->gb) + 26;
7388     pps->init_qs= get_se_golomb(&s->gb) + 26;
7389     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7390     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7391     pps->constrained_intra_pred= get_bits1(&s->gb);
7392     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7393
7394     pps->transform_8x8_mode= 0;
7395     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7396     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7397     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7398
7399     if(get_bits_count(&s->gb) < bit_length){
7400         pps->transform_8x8_mode= get_bits1(&s->gb);
7401         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7402         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7403     } else {
7404         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7405     }
7406
7407     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7408     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7409     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7410         h->pps.chroma_qp_diff= 1;
7411
7412     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7413         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7414                pps_id, pps->sps_id,
7415                pps->cabac ? "CABAC" : "CAVLC",
7416                pps->slice_group_count,
7417                pps->ref_count[0], pps->ref_count[1],
7418                pps->weighted_pred ? "weighted" : "",
7419                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7420                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7421                pps->constrained_intra_pred ? "CONSTR" : "",
7422                pps->redundant_pic_cnt_present ? "REDU" : "",
7423                pps->transform_8x8_mode ? "8x8DCT" : ""
7424                );
7425     }
7426
7427     av_free(h->pps_buffers[pps_id]);
7428     h->pps_buffers[pps_id]= pps;
7429     return 0;
7430 fail:
7431     av_free(pps);
7432     return -1;
7433 }
7434
7435 /**
7436  * Call decode_slice() for each context.
7437  *
7438  * @param h h264 master context
7439  * @param context_count number of contexts to execute
7440  */
7441 static void execute_decode_slices(H264Context *h, int context_count){
7442     MpegEncContext * const s = &h->s;
7443     AVCodecContext * const avctx= s->avctx;
7444     H264Context *hx;
7445     int i;
7446
7447     if (s->avctx->hwaccel)
7448         return;
7449     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7450         return;
7451     if(context_count == 1) {
7452         decode_slice(avctx, &h);
7453     } else {
7454         for(i = 1; i < context_count; i++) {
7455             hx = h->thread_context[i];
7456             hx->s.error_recognition = avctx->error_recognition;
7457             hx->s.error_count = 0;
7458         }
7459
7460         avctx->execute(avctx, (void *)decode_slice,
7461                        h->thread_context, NULL, context_count, sizeof(void*));
7462
7463         /* pull back stuff from slices to master context */
7464         hx = h->thread_context[context_count - 1];
7465         s->mb_x = hx->s.mb_x;
7466         s->mb_y = hx->s.mb_y;
7467         s->dropable = hx->s.dropable;
7468         s->picture_structure = hx->s.picture_structure;
7469         for(i = 1; i < context_count; i++)
7470             h->s.error_count += h->thread_context[i]->s.error_count;
7471     }
7472 }
7473
7474
7475 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7476     MpegEncContext * const s = &h->s;
7477     AVCodecContext * const avctx= s->avctx;
7478     int buf_index=0;
7479     H264Context *hx; ///< thread context
7480     int context_count = 0;
7481     int next_avc= h->is_avc ? 0 : buf_size;
7482
7483     h->max_contexts = avctx->thread_count;
7484 #if 0
7485     int i;
7486     for(i=0; i<50; i++){
7487         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7488     }
7489 #endif
7490     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7491         h->current_slice = 0;
7492         if (!s->first_field)
7493             s->current_picture_ptr= NULL;
7494         reset_sei(h);
7495     }
7496
7497     for(;;){
7498         int consumed;
7499         int dst_length;
7500         int bit_length;
7501         const uint8_t *ptr;
7502         int i, nalsize = 0;
7503         int err;
7504
7505         if(buf_index >= next_avc) {
7506             if(buf_index >= buf_size) break;
7507             nalsize = 0;
7508             for(i = 0; i < h->nal_length_size; i++)
7509                 nalsize = (nalsize << 8) | buf[buf_index++];
7510             if(nalsize <= 1 || nalsize > buf_size - buf_index){
7511                 if(nalsize == 1){
7512                     buf_index++;
7513                     continue;
7514                 }else{
7515                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7516                     break;
7517                 }
7518             }
7519             next_avc= buf_index + nalsize;
7520         } else {
7521             // start code prefix search
7522             for(; buf_index + 3 < buf_size; buf_index++){
7523                 // This should always succeed in the first iteration.
7524                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7525                     break;
7526             }
7527
7528             if(buf_index+3 >= buf_size) break;
7529
7530             buf_index+=3;
7531         }
7532
7533         hx = h->thread_context[context_count];
7534
7535         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7536         if (ptr==NULL || dst_length < 0){
7537             return -1;
7538         }
7539         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7540             dst_length--;
7541         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7542
7543         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7544             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7545         }
7546
7547         if (h->is_avc && (nalsize != consumed) && nalsize){
7548             int i, debug_level = AV_LOG_DEBUG;
7549             for (i = consumed; i < nalsize; i++)
7550                 if (buf[buf_index+i])
7551                     debug_level = AV_LOG_ERROR;
7552             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7553         }
7554
7555         buf_index += consumed;
7556
7557         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7558            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7559             continue;
7560
7561       again:
7562         err = 0;
7563         switch(hx->nal_unit_type){
7564         case NAL_IDR_SLICE:
7565             if (h->nal_unit_type != NAL_IDR_SLICE) {
7566                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7567                 return -1;
7568             }
7569             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7570         case NAL_SLICE:
7571             init_get_bits(&hx->s.gb, ptr, bit_length);
7572             hx->intra_gb_ptr=
7573             hx->inter_gb_ptr= &hx->s.gb;
7574             hx->s.data_partitioning = 0;
7575
7576             if((err = decode_slice_header(hx, h)))
7577                break;
7578
7579             if (s->avctx->hwaccel && h->current_slice == 1) {
7580                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7581                     return -1;
7582             }
7583
7584             s->current_picture_ptr->key_frame |=
7585                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7586                     (h->sei_recovery_frame_cnt >= 0);
7587             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7588                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7589                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7590                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7591                && avctx->skip_frame < AVDISCARD_ALL){
7592                 if(avctx->hwaccel) {
7593                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7594                         return -1;
7595                 }else
7596                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7597                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7598                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7599                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7600                 }else
7601                     context_count++;
7602             }
7603             break;
7604         case NAL_DPA:
7605             init_get_bits(&hx->s.gb, ptr, bit_length);
7606             hx->intra_gb_ptr=
7607             hx->inter_gb_ptr= NULL;
7608
7609             if ((err = decode_slice_header(hx, h)) < 0)
7610                 break;
7611
7612             hx->s.data_partitioning = 1;
7613
7614             break;
7615         case NAL_DPB:
7616             init_get_bits(&hx->intra_gb, ptr, bit_length);
7617             hx->intra_gb_ptr= &hx->intra_gb;
7618             break;
7619         case NAL_DPC:
7620             init_get_bits(&hx->inter_gb, ptr, bit_length);
7621             hx->inter_gb_ptr= &hx->inter_gb;
7622
7623             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7624                && s->context_initialized
7625                && s->hurry_up < 5
7626                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7627                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7628                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7629                && avctx->skip_frame < AVDISCARD_ALL)
7630                 context_count++;
7631             break;
7632         case NAL_SEI:
7633             init_get_bits(&s->gb, ptr, bit_length);
7634             ff_h264_decode_sei(h);
7635             break;
7636         case NAL_SPS:
7637             init_get_bits(&s->gb, ptr, bit_length);
7638             ff_h264_decode_seq_parameter_set(h);
7639
7640             if(s->flags& CODEC_FLAG_LOW_DELAY)
7641                 s->low_delay=1;
7642
7643             if(avctx->has_b_frames < 2)
7644                 avctx->has_b_frames= !s->low_delay;
7645             break;
7646         case NAL_PPS:
7647             init_get_bits(&s->gb, ptr, bit_length);
7648
7649             ff_h264_decode_picture_parameter_set(h, bit_length);
7650
7651             break;
7652         case NAL_AUD:
7653         case NAL_END_SEQUENCE:
7654         case NAL_END_STREAM:
7655         case NAL_FILLER_DATA:
7656         case NAL_SPS_EXT:
7657         case NAL_AUXILIARY_SLICE:
7658             break;
7659         default:
7660             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7661         }
7662
7663         if(context_count == h->max_contexts) {
7664             execute_decode_slices(h, context_count);
7665             context_count = 0;
7666         }
7667
7668         if (err < 0)
7669             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7670         else if(err == 1) {
7671             /* Slice could not be decoded in parallel mode, copy down
7672              * NAL unit stuff to context 0 and restart. Note that
7673              * rbsp_buffer is not transferred, but since we no longer
7674              * run in parallel mode this should not be an issue. */
7675             h->nal_unit_type = hx->nal_unit_type;
7676             h->nal_ref_idc   = hx->nal_ref_idc;
7677             hx = h;
7678             goto again;
7679         }
7680     }
7681     if(context_count)
7682         execute_decode_slices(h, context_count);
7683     return buf_index;
7684 }
7685
7686 /**
7687  * returns the number of bytes consumed for building the current frame
7688  */
7689 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7690         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7691         if(pos+10>buf_size) pos=buf_size; // oops ;)
7692
7693         return pos;
7694 }
7695
7696 static int decode_frame(AVCodecContext *avctx,
7697                              void *data, int *data_size,
7698                              AVPacket *avpkt)
7699 {
7700     const uint8_t *buf = avpkt->data;
7701     int buf_size = avpkt->size;
7702     H264Context *h = avctx->priv_data;
7703     MpegEncContext *s = &h->s;
7704     AVFrame *pict = data;
7705     int buf_index;
7706
7707     s->flags= avctx->flags;
7708     s->flags2= avctx->flags2;
7709
7710    /* end of stream, output what is still in the buffers */
7711     if (buf_size == 0) {
7712         Picture *out;
7713         int i, out_idx;
7714
7715 //FIXME factorize this with the output code below
7716         out = h->delayed_pic[0];
7717         out_idx = 0;
7718         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7719             if(h->delayed_pic[i]->poc < out->poc){
7720                 out = h->delayed_pic[i];
7721                 out_idx = i;
7722             }
7723
7724         for(i=out_idx; h->delayed_pic[i]; i++)
7725             h->delayed_pic[i] = h->delayed_pic[i+1];
7726
7727         if(out){
7728             *data_size = sizeof(AVFrame);
7729             *pict= *(AVFrame*)out;
7730         }
7731
7732         return 0;
7733     }
7734
7735     if(h->is_avc && !h->got_avcC) {
7736         int i, cnt, nalsize;
7737         unsigned char *p = avctx->extradata;
7738         if(avctx->extradata_size < 7) {
7739             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7740             return -1;
7741         }
7742         if(*p != 1) {
7743             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7744             return -1;
7745         }
7746         /* sps and pps in the avcC always have length coded with 2 bytes,
7747            so put a fake nal_length_size = 2 while parsing them */
7748         h->nal_length_size = 2;
7749         // Decode sps from avcC
7750         cnt = *(p+5) & 0x1f; // Number of sps
7751         p += 6;
7752         for (i = 0; i < cnt; i++) {
7753             nalsize = AV_RB16(p) + 2;
7754             if(decode_nal_units(h, p, nalsize) < 0) {
7755                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7756                 return -1;
7757             }
7758             p += nalsize;
7759         }
7760         // Decode pps from avcC
7761         cnt = *(p++); // Number of pps
7762         for (i = 0; i < cnt; i++) {
7763             nalsize = AV_RB16(p) + 2;
7764             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7765                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7766                 return -1;
7767             }
7768             p += nalsize;
7769         }
7770         // Now store right nal length size, that will be use to parse all other nals
7771         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7772         // Do not reparse avcC
7773         h->got_avcC = 1;
7774     }
7775
7776     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7777         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7778             return -1;
7779         h->got_avcC = 1;
7780     }
7781
7782     buf_index=decode_nal_units(h, buf, buf_size);
7783     if(buf_index < 0)
7784         return -1;
7785
7786     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7787         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7788         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7789         return -1;
7790     }
7791
7792     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7793         Picture *out = s->current_picture_ptr;
7794         Picture *cur = s->current_picture_ptr;
7795         int i, pics, out_of_order, out_idx;
7796
7797         field_end(h);
7798
7799         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7800             /* Wait for second field. */
7801             *data_size = 0;
7802
7803         } else {
7804             cur->interlaced_frame = 0;
7805             cur->repeat_pict = 0;
7806
7807             /* Signal interlacing information externally. */
7808             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7809
7810             if(h->sps.pic_struct_present_flag){
7811                 switch (h->sei_pic_struct)
7812                 {
7813                 case SEI_PIC_STRUCT_FRAME:
7814                     break;
7815                 case SEI_PIC_STRUCT_TOP_FIELD:
7816                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7817                     cur->interlaced_frame = 1;
7818                     break;
7819                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7820                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7821                     if (FIELD_OR_MBAFF_PICTURE)
7822                         cur->interlaced_frame = 1;
7823                     else
7824                         // try to flag soft telecine progressive
7825                         cur->interlaced_frame = h->prev_interlaced_frame;
7826                     break;
7827                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7828                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7829                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7830                     // From these hints, let the applications decide if they apply deinterlacing.
7831                     cur->repeat_pict = 1;
7832                     break;
7833                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7834                     // Force progressive here, as doubling interlaced frame is a bad idea.
7835                     cur->repeat_pict = 2;
7836                     break;
7837                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7838                     cur->repeat_pict = 4;
7839                     break;
7840                 }
7841
7842                 if ((h->sei_ct_type & 3) && h->sei_pic_struct <= SEI_PIC_STRUCT_BOTTOM_TOP)
7843                     cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7844             }else{
7845                 /* Derive interlacing flag from used decoding process. */
7846                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7847             }
7848             h->prev_interlaced_frame = cur->interlaced_frame;
7849
7850             if (cur->field_poc[0] != cur->field_poc[1]){
7851                 /* Derive top_field_first from field pocs. */
7852                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7853             }else{
7854                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7855                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7856                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7857                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7858                         cur->top_field_first = 1;
7859                     else
7860                         cur->top_field_first = 0;
7861                 }else{
7862                     /* Most likely progressive */
7863                     cur->top_field_first = 0;
7864                 }
7865             }
7866
7867         //FIXME do something with unavailable reference frames
7868
7869             /* Sort B-frames into display order */
7870
7871             if(h->sps.bitstream_restriction_flag
7872                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7873                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7874                 s->low_delay = 0;
7875             }
7876
7877             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7878                && !h->sps.bitstream_restriction_flag){
7879                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7880                 s->low_delay= 0;
7881             }
7882
7883             pics = 0;
7884             while(h->delayed_pic[pics]) pics++;
7885
7886             assert(pics <= MAX_DELAYED_PIC_COUNT);
7887
7888             h->delayed_pic[pics++] = cur;
7889             if(cur->reference == 0)
7890                 cur->reference = DELAYED_PIC_REF;
7891
7892             out = h->delayed_pic[0];
7893             out_idx = 0;
7894             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7895                 if(h->delayed_pic[i]->poc < out->poc){
7896                     out = h->delayed_pic[i];
7897                     out_idx = i;
7898                 }
7899             if(s->avctx->has_b_frames == 0 && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset))
7900                 h->outputed_poc= INT_MIN;
7901             out_of_order = out->poc < h->outputed_poc;
7902
7903             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7904                 { }
7905             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7906                || (s->low_delay &&
7907                 ((h->outputed_poc != INT_MIN && out->poc > h->outputed_poc + 2)
7908                  || cur->pict_type == FF_B_TYPE)))
7909             {
7910                 s->low_delay = 0;
7911                 s->avctx->has_b_frames++;
7912             }
7913
7914             if(out_of_order || pics > s->avctx->has_b_frames){
7915                 out->reference &= ~DELAYED_PIC_REF;
7916                 for(i=out_idx; h->delayed_pic[i]; i++)
7917                     h->delayed_pic[i] = h->delayed_pic[i+1];
7918             }
7919             if(!out_of_order && pics > s->avctx->has_b_frames){
7920                 *data_size = sizeof(AVFrame);
7921
7922                 if(out_idx==0 && h->delayed_pic[0] && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset)) {
7923                     h->outputed_poc = INT_MIN;
7924                 } else
7925                 h->outputed_poc = out->poc;
7926                 *pict= *(AVFrame*)out;
7927             }else{
7928                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7929             }
7930         }
7931     }
7932
7933     assert(pict->data[0] || !*data_size);
7934     ff_print_debug_info(s, pict);
7935 //printf("out %d\n", (int)pict->data[0]);
7936
7937     return get_consumed_bytes(s, buf_index, buf_size);
7938 }
7939 #if 0
7940 static inline void fill_mb_avail(H264Context *h){
7941     MpegEncContext * const s = &h->s;
7942     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7943
7944     if(s->mb_y){
7945         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7946         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7947         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7948     }else{
7949         h->mb_avail[0]=
7950         h->mb_avail[1]=
7951         h->mb_avail[2]= 0;
7952     }
7953     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7954     h->mb_avail[4]= 1; //FIXME move out
7955     h->mb_avail[5]= 0; //FIXME move out
7956 }
7957 #endif
7958
7959 #ifdef TEST
7960 #undef printf
7961 #undef random
7962 #define COUNT 8000
7963 #define SIZE (COUNT*40)
7964 int main(void){
7965     int i;
7966     uint8_t temp[SIZE];
7967     PutBitContext pb;
7968     GetBitContext gb;
7969 //    int int_temp[10000];
7970     DSPContext dsp;
7971     AVCodecContext avctx;
7972
7973     dsputil_init(&dsp, &avctx);
7974
7975     init_put_bits(&pb, temp, SIZE);
7976     printf("testing unsigned exp golomb\n");
7977     for(i=0; i<COUNT; i++){
7978         START_TIMER
7979         set_ue_golomb(&pb, i);
7980         STOP_TIMER("set_ue_golomb");
7981     }
7982     flush_put_bits(&pb);
7983
7984     init_get_bits(&gb, temp, 8*SIZE);
7985     for(i=0; i<COUNT; i++){
7986         int j, s;
7987
7988         s= show_bits(&gb, 24);
7989
7990         START_TIMER
7991         j= get_ue_golomb(&gb);
7992         if(j != i){
7993             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7994 //            return -1;
7995         }
7996         STOP_TIMER("get_ue_golomb");
7997     }
7998
7999
8000     init_put_bits(&pb, temp, SIZE);
8001     printf("testing signed exp golomb\n");
8002     for(i=0; i<COUNT; i++){
8003         START_TIMER
8004         set_se_golomb(&pb, i - COUNT/2);
8005         STOP_TIMER("set_se_golomb");
8006     }
8007     flush_put_bits(&pb);
8008
8009     init_get_bits(&gb, temp, 8*SIZE);
8010     for(i=0; i<COUNT; i++){
8011         int j, s;
8012
8013         s= show_bits(&gb, 24);
8014
8015         START_TIMER
8016         j= get_se_golomb(&gb);
8017         if(j != i - COUNT/2){
8018             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8019 //            return -1;
8020         }
8021         STOP_TIMER("get_se_golomb");
8022     }
8023
8024 #if 0
8025     printf("testing 4x4 (I)DCT\n");
8026
8027     DCTELEM block[16];
8028     uint8_t src[16], ref[16];
8029     uint64_t error= 0, max_error=0;
8030
8031     for(i=0; i<COUNT; i++){
8032         int j;
8033 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8034         for(j=0; j<16; j++){
8035             ref[j]= random()%255;
8036             src[j]= random()%255;
8037         }
8038
8039         h264_diff_dct_c(block, src, ref, 4);
8040
8041         //normalize
8042         for(j=0; j<16; j++){
8043 //            printf("%d ", block[j]);
8044             block[j]= block[j]*4;
8045             if(j&1) block[j]= (block[j]*4 + 2)/5;
8046             if(j&4) block[j]= (block[j]*4 + 2)/5;
8047         }
8048 //        printf("\n");
8049
8050         s->dsp.h264_idct_add(ref, block, 4);
8051 /*        for(j=0; j<16; j++){
8052             printf("%d ", ref[j]);
8053         }
8054         printf("\n");*/
8055
8056         for(j=0; j<16; j++){
8057             int diff= FFABS(src[j] - ref[j]);
8058
8059             error+= diff*diff;
8060             max_error= FFMAX(max_error, diff);
8061         }
8062     }
8063     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8064     printf("testing quantizer\n");
8065     for(qp=0; qp<52; qp++){
8066         for(i=0; i<16; i++)
8067             src1_block[i]= src2_block[i]= random()%255;
8068
8069     }
8070     printf("Testing NAL layer\n");
8071
8072     uint8_t bitstream[COUNT];
8073     uint8_t nal[COUNT*2];
8074     H264Context h;
8075     memset(&h, 0, sizeof(H264Context));
8076
8077     for(i=0; i<COUNT; i++){
8078         int zeros= i;
8079         int nal_length;
8080         int consumed;
8081         int out_length;
8082         uint8_t *out;
8083         int j;
8084
8085         for(j=0; j<COUNT; j++){
8086             bitstream[j]= (random() % 255) + 1;
8087         }
8088
8089         for(j=0; j<zeros; j++){
8090             int pos= random() % COUNT;
8091             while(bitstream[pos] == 0){
8092                 pos++;
8093                 pos %= COUNT;
8094             }
8095             bitstream[pos]=0;
8096         }
8097
8098         START_TIMER
8099
8100         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8101         if(nal_length<0){
8102             printf("encoding failed\n");
8103             return -1;
8104         }
8105
8106         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8107
8108         STOP_TIMER("NAL")
8109
8110         if(out_length != COUNT){
8111             printf("incorrect length %d %d\n", out_length, COUNT);
8112             return -1;
8113         }
8114
8115         if(consumed != nal_length){
8116             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8117             return -1;
8118         }
8119
8120         if(memcmp(bitstream, out, COUNT)){
8121             printf("mismatch\n");
8122             return -1;
8123         }
8124     }
8125 #endif
8126
8127     printf("Testing RBSP\n");
8128
8129
8130     return 0;
8131 }
8132 #endif /* TEST */
8133
8134
8135 av_cold void ff_h264_free_context(H264Context *h)
8136 {
8137     int i;
8138
8139     free_tables(h); //FIXME cleanup init stuff perhaps
8140
8141     for(i = 0; i < MAX_SPS_COUNT; i++)
8142         av_freep(h->sps_buffers + i);
8143
8144     for(i = 0; i < MAX_PPS_COUNT; i++)
8145         av_freep(h->pps_buffers + i);
8146 }
8147
8148 static av_cold int decode_end(AVCodecContext *avctx)
8149 {
8150     H264Context *h = avctx->priv_data;
8151     MpegEncContext *s = &h->s;
8152
8153     ff_h264_free_context(h);
8154
8155     MPV_common_end(s);
8156
8157 //    memset(h, 0, sizeof(H264Context));
8158
8159     return 0;
8160 }
8161
8162
8163 AVCodec h264_decoder = {
8164     "h264",
8165     CODEC_TYPE_VIDEO,
8166     CODEC_ID_H264,
8167     sizeof(H264Context),
8168     decode_init,
8169     NULL,
8170     decode_end,
8171     decode_frame,
8172     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8173     .flush= flush_dpb,
8174     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8175     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8176 };
8177
8178 #if CONFIG_H264_VDPAU_DECODER
8179 AVCodec h264_vdpau_decoder = {
8180     "h264_vdpau",
8181     CODEC_TYPE_VIDEO,
8182     CODEC_ID_H264,
8183     sizeof(H264Context),
8184     decode_init,
8185     NULL,
8186     decode_end,
8187     decode_frame,
8188     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8189     .flush= flush_dpb,
8190     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8191     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_VDPAU_H264, PIX_FMT_NONE},
8192 };
8193 #endif
8194
8195 #if CONFIG_SVQ3_DECODER
8196 #include "svq3.c"
8197 #endif