2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * @file libavcodec/h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
31 #include "mpegvideo.h"
34 #include "h264_parser.h"
37 #include "rectangle.h"
38 #include "vdpau_internal.h"
42 #include "x86/h264_i386.h"
49 * Value of Picture.reference when Picture is not a reference picture, but
50 * is held for delayed output.
52 #define DELAYED_PIC_REF 4
/* CAVLC VLC tables: one coeff_token table per nC range, plus chroma-DC,
 * total_zeros, and run tables. The *_tables arrays are static storage that
 * the VLC structs are pointed into at init time (sizes listed alongside). */
54 static VLC coeff_token_vlc[4];
55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
58 static VLC chroma_dc_coeff_token_vlc;
59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
62 static VLC total_zeros_vlc[15];
63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
64 static const int total_zeros_vlc_tables_size = 512;
66 static VLC chroma_dc_total_zeros_vlc[3];
67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
70 static VLC run_vlc[6];
71 static VLC_TYPE run_vlc_tables[6][8][2];
72 static const int run_vlc_tables_size = 8;
75 static VLC_TYPE run7_vlc_table[96][2];
76 static const int run7_vlc_table_size = 96;
/* Forward declarations for helpers defined later in this file. */
78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
/* Packs two 16-bit values into one 32-bit word.
 * NOTE(review): two unconditional return statements follow each other; the
 * original source selects between them with an endianness #if/#else/#endif
 * that is missing from this excerpt (line numbers 85/87/89 are absent) —
 * confirm against upstream before editing. */
84 static av_always_inline uint32_t pack16to32(int a, int b){
86 return (b&0xFFFF) + (a<<16);
88 return (a&0xFFFF) + (b<<16);
/* rem6[q] == q % 6 and div6[q] == q / 6 for the 52 luma QP values,
 * precomputed as lookup tables to avoid division. */
92 static const uint8_t rem6[52]={
93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
96 static const uint8_t div6[52]={
97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
/* Index orderings for the left-neighbour blocks; variant selected in
 * fill_caches() depending on MBAFF field/frame pairing (initializers for
 * the four rows are not visible in this excerpt). */
100 static const uint8_t left_block_options[4][8]={
/* CAVLC level-code lookup table, indexed by suffix length and bitstream
 * prefix (LEVEL_TAB_BITS bits); [..][..][0]=level, [..][..][1]=bits used. */
107 #define LEVEL_TAB_BITS 8
108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
/**
 * Fills the per-macroblock neighbour caches from the top/left/topleft/
 * topright neighbours: intra4x4 prediction modes, non_zero_count, cbp,
 * motion vectors + reference indices (and mvd / direct flags for CABAC
 * B-frames). The cached values are what the prediction and deblocking
 * code reads instead of touching the frame-wide arrays directly.
 * @param h         decoder context
 * @param mb_type   type of the current macroblock
 * @param for_deblock  nonzero when caches are filled for the loop filter
 *                     (uses a relaxed slice-boundary test, < 0xFFFF, instead
 *                     of same-slice equality)
 * NOTE(review): this excerpt is missing interior lines (the embedded
 * original line numbers skip), so several if/else bodies and closing
 * braces shown here are incomplete — verify against upstream h264.c.
 */
110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
111 MpegEncContext * const s = &h->s;
112 const int mb_xy= h->mb_xy;
113 int topleft_xy, top_xy, topright_xy, left_xy[2];
114 int topleft_type, top_type, topright_type, left_type[2];
115 const uint8_t * left_block;
116 int topleft_partition= -1;
119 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
121 //FIXME deblocking could skip the intra and nnz parts.
122 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
125 /* Wow, what a mess, why didn't they simplify the interlacing & intra
126 * stuff, I can't imagine that these complex rules are worth it. */
/* Default (non-MBAFF) neighbour addresses: straight up/left in mb units. */
128 topleft_xy = top_xy - 1;
129 topright_xy= top_xy + 1;
130 left_xy[1] = left_xy[0] = mb_xy-1;
131 left_block = left_block_options[0];
/* MBAFF: neighbours are addressed via the frame-ordered MB *pair*, then
 * adjusted by the field/frame coding of each neighbour pair. */
133 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
134 const int top_pair_xy = pair_xy - s->mb_stride;
135 const int topleft_pair_xy = top_pair_xy - 1;
136 const int topright_pair_xy = top_pair_xy + 1;
137 const int topleft_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
138 const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
139 const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
140 const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
141 const int curr_mb_field_flag = IS_INTERLACED(mb_type);
142 const int bottom = (s->mb_y & 1);
143 tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
145 if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
146 top_xy -= s->mb_stride;
148 if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
149 topleft_xy -= s->mb_stride;
150 } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
151 topleft_xy += s->mb_stride;
152 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
153 topleft_partition = 0;
155 if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
156 topright_xy -= s->mb_stride;
158 if (left_mb_field_flag != curr_mb_field_flag) {
159 left_xy[1] = left_xy[0] = pair_xy - 1;
160 if (curr_mb_field_flag) {
161 left_xy[1] += s->mb_stride;
162 left_block = left_block_options[3];
164 left_block= left_block_options[2 - bottom];
169 h->top_mb_xy = top_xy;
170 h->left_mb_xy[0] = left_xy[0];
171 h->left_mb_xy[1] = left_xy[1];
/* Deblocking path: a neighbour counts as available if its slice_table
 * entry is any valid slice (< 0xFFFF), not only the current slice. */
175 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
176 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
177 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
179 if(MB_MBAFF && !IS_INTRA(mb_type)){
181 for(list=0; list<h->list_count; list++){
182 //These values where changed for ease of performing MC, we need to change them back
183 //FIXME maybe we can make MC and loop filter use the same values or prevent
184 //the MC code from changing ref_cache and rather use a temporary array.
185 if(USES_LIST(mb_type,list)){
186 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
187 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
188 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
190 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
191 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
/* Decode path: a neighbour is available only if it is in the same slice. */
196 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
197 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
198 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
199 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
200 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
/* Intra: compute per-4x4-block sample-availability bitmasks, masking out
 * neighbours that are missing or (with constrained_intra_pred) inter. */
202 if(IS_INTRA(mb_type)){
203 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
204 h->topleft_samples_available=
205 h->top_samples_available=
206 h->left_samples_available= 0xFFFF;
207 h->topright_samples_available= 0xEEEA;
209 if(!(top_type & type_mask)){
210 h->topleft_samples_available= 0xB3FF;
211 h->top_samples_available= 0x33FF;
212 h->topright_samples_available= 0x26EA;
214 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
215 if(IS_INTERLACED(mb_type)){
216 if(!(left_type[0] & type_mask)){
217 h->topleft_samples_available&= 0xDFFF;
218 h->left_samples_available&= 0x5FFF;
220 if(!(left_type[1] & type_mask)){
221 h->topleft_samples_available&= 0xFF5F;
222 h->left_samples_available&= 0xFF5F;
225 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
226 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
227 assert(left_xy[0] == left_xy[1]);
228 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
229 h->topleft_samples_available&= 0xDF5F;
230 h->left_samples_available&= 0x5F5F;
234 if(!(left_type[0] & type_mask)){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
240 if(!(topleft_type & type_mask))
241 h->topleft_samples_available&= 0x7FFF;
243 if(!(topright_type & type_mask))
244 h->topright_samples_available&= 0xFBFF;
/* Intra4x4: pull the neighbouring 4x4 prediction modes into the cache;
 * unavailable neighbours get a fallback value (pred; its computation is
 * not visible in this excerpt). */
246 if(IS_INTRA4x4(mb_type)){
247 if(IS_INTRA4x4(top_type)){
248 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
249 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
250 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
251 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
254 if(!(top_type & type_mask))
259 h->intra4x4_pred_mode_cache[4+8*0]=
260 h->intra4x4_pred_mode_cache[5+8*0]=
261 h->intra4x4_pred_mode_cache[6+8*0]=
262 h->intra4x4_pred_mode_cache[7+8*0]= pred;
265 if(IS_INTRA4x4(left_type[i])){
266 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
267 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
270 if(!(left_type[i] & type_mask))
275 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
276 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
/* non_zero_count cache: copy neighbour nnz values; missing neighbours get
 * 0 for CABAC inter, else 64. */
292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
294 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
295 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
296 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
297 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
299 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
300 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
302 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
303 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
306 h->non_zero_count_cache[4+8*0]=
307 h->non_zero_count_cache[5+8*0]=
308 h->non_zero_count_cache[6+8*0]=
309 h->non_zero_count_cache[7+8*0]=
311 h->non_zero_count_cache[1+8*0]=
312 h->non_zero_count_cache[2+8*0]=
314 h->non_zero_count_cache[1+8*3]=
315 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
319 for (i=0; i<2; i++) {
321 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
322 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
323 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
324 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
326 h->non_zero_count_cache[3+8*1 + 2*8*i]=
327 h->non_zero_count_cache[3+8*2 + 2*8*i]=
328 h->non_zero_count_cache[0+8*1 + 8*i]=
329 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
/* CBP of top/left neighbours (CABAC context derivation). */
336 h->top_cbp = h->cbp_table[top_xy];
337 } else if(IS_INTRA(mb_type)) {
344 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
345 } else if(IS_INTRA(mb_type)) {
351 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
354 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
/* Inter / direct: fill mv_cache and ref_cache from the four neighbours. */
359 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
361 for(list=0; list<h->list_count; list++){
362 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
363 /*if(!h->mv_cache_clean[list]){
364 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
365 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
366 h->mv_cache_clean[list]= 1;
370 h->mv_cache_clean[list]= 0;
372 if(USES_LIST(top_type, list)){
373 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
374 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
375 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
376 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
377 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
378 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
379 h->ref_cache[list][scan8[0] + 0 - 1*8]=
380 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
381 h->ref_cache[list][scan8[0] + 2 - 1*8]=
382 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
384 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
385 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
386 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
387 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
388 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
392 int cache_idx = scan8[0] - 1 + i*2*8;
393 if(USES_LIST(left_type[i], list)){
394 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
395 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
396 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
397 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
398 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
399 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
401 *(uint32_t*)h->mv_cache [list][cache_idx ]=
402 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
403 h->ref_cache[list][cache_idx ]=
404 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
408 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
/* topleft/topright corners (single 4x4 block each). */
411 if(USES_LIST(topleft_type, list)){
412 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
413 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
414 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
415 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
417 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
418 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
421 if(USES_LIST(topright_type, list)){
422 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
423 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
424 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
425 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
427 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
428 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
431 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
434 h->ref_cache[list][scan8[5 ]+1] =
435 h->ref_cache[list][scan8[7 ]+1] =
436 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
437 h->ref_cache[list][scan8[4 ]] =
438 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
439 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
440 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
441 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
442 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
443 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
/* CABAC only: cache the motion vector differences of the neighbours. */
446 /* XXX beurk, Load mvd */
447 if(USES_LIST(top_type, list)){
448 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
449 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
450 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
451 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
452 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
454 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
455 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
456 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
457 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
459 if(USES_LIST(left_type[0], list)){
460 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
461 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
462 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
464 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
465 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
467 if(USES_LIST(left_type[1], list)){
468 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
469 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
470 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
472 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
473 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
475 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
476 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
477 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
478 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
479 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
/* B slices: direct-mode flags of neighbours (per 8x8 block). */
481 if(h->slice_type_nos == FF_B_TYPE){
482 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
484 if(IS_DIRECT(top_type)){
485 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
486 }else if(IS_8X8(top_type)){
487 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
488 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
489 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
494 if(IS_DIRECT(left_type[0]))
495 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
496 else if(IS_8X8(left_type[0]))
497 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
499 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
501 if(IS_DIRECT(left_type[1]))
502 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
503 else if(IS_8X8(left_type[1]))
504 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
506 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
/* MBAFF frame/field rescale: MAP_MVS applies MAP_F2F to the ten cached
 * neighbour positions; the two MAP_F2F variants below convert a
 * field-coded neighbour's ref/mv to frame units and vice versa
 * (the surrounding #define MAP_MVS / #undef lines are not visible here). */
512 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
513 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
514 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
515 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
516 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
517 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
518 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
519 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
520 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
521 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
523 #define MAP_F2F(idx, mb_type)\
524 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
525 h->ref_cache[list][idx] <<= 1;\
526 h->mv_cache[list][idx][1] /= 2;\
527 h->mvd_cache[list][idx][1] /= 2;\
532 #define MAP_F2F(idx, mb_type)\
533 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
534 h->ref_cache[list][idx] >>= 1;\
535 h->mv_cache[list][idx][1] <<= 1;\
536 h->mvd_cache[list][idx][1] <<= 1;\
/* Count of 8x8-DCT neighbours, used for transform-size flag context. */
546 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
/**
 * Writes the bottom-row and right-column intra4x4 prediction modes from the
 * cache back into the per-MB intra4x4_pred_mode array, where subsequent
 * macroblocks will read them as top/left neighbour modes.
 * (Closing brace, original line 559, is not visible in this excerpt.)
 */
549 static inline void write_back_intra_pred_mode(H264Context *h){
550 const int mb_xy= h->mb_xy;
/* [0..3]: right column (cache x==7, y 1..4); [4..6]: bottom row. */
552 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
553 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
554 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
555 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
556 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
557 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
558 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
562 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
564 static inline int check_intra4x4_pred_mode(H264Context *h){
565 MpegEncContext * const s = &h->s;
/* Remap tables indexed by the current mode: -1 means the mode needs the
 * missing neighbour and is invalid; other entries are the substitute mode. */
566 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
567 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
/* Top row unavailable: remap each of the four top 4x4 modes.
 * NOTE(review): the surrounding for-loop and the error-return on
 * status<0 (original lines 571, 573, 575-576, 578-580) are missing
 * from this excerpt. */
570 if(!(h->top_samples_available&0x8000)){
572 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
574 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
577 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
/* Left column: one availability bit per 4x4 row (mask table below). */
582 if((h->left_samples_available&0x8888)!=0x8888){
583 static const int mask[4]={0x8000,0x2000,0x80,0x20};
585 if(!(h->left_samples_available&mask[i])){
586 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
588 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
591 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
598 } //FIXME cleanup like next
601 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
603 static inline int check_intra_pred_mode(H264Context *h, int mode){
604 MpegEncContext * const s = &h->s;
/* Remap tables for 16x16/chroma DC modes; -1 marks an invalid combination.
 * NOTE(review): the mode range check, the remapping assignments and the
 * error returns (e.g. original lines 607-608, 610-612, 614-620, 625-631)
 * are missing from this excerpt. */
605 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
606 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
609 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
613 if(!(h->top_samples_available&0x8000)){
616 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
/* 0x8080: availability bits for the two left halves (MBAFF can make only
 * one half available — the "mad cow" case below). */
621 if((h->left_samples_available&0x8080) != 0x8080){
623 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
624 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
627 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
636 * gets the predicted intra4x4 prediction mode.
638 static inline int pred_intra_mode(H264Context *h, int n){
639 const int index8= scan8[n];
640 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
641 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
/* Prediction is the minimum of the left and top neighbour modes. */
642 const int min= FFMIN(left, top);
644 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
/* Negative means a neighbour was unavailable -> fall back to DC.
 * NOTE(review): the "return min;" for the normal case (original line 647)
 * is not visible in this excerpt. */
646 if(min<0) return DC_PRED;
/**
 * Writes the edge non-zero-count values from the cache back into the
 * per-MB non_zero_count array for use as neighbour data by later MBs.
 * Luma: [0..3] right column, [4..6] bottom row; chroma: [7..12].
 */
650 static inline void write_back_non_zero_count(H264Context *h){
651 const int mb_xy= h->mb_xy;
653 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
654 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
655 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
656 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
657 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
658 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
659 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
661 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
662 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
663 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
665 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
666 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
667 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
671 * gets the predicted number of non-zero coefficients.
672 * @param n block index
674 static inline int pred_non_zero_count(H264Context *h, int n){
675 const int index8= scan8[n];
676 const int left= h->non_zero_count_cache[index8 - 1];
677 const int top = h->non_zero_count_cache[index8 - 8];
/* NOTE(review): the computation of i from left/top (original lines
 * 678-679) and the final return (683-684) are missing from this excerpt;
 * i<64 distinguishes "both neighbours available" (sum is averaged,
 * rounding up) — confirm against upstream. */
680 if(i<64) i= (i+1)>>1;
682 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
/**
 * Fetches the "C" (top-right, falling back to top-left) motion vector used
 * as the diagonal candidate for MV prediction, returning its reference
 * index. Writes *C to point at the chosen cached MV. MBAFF needs special
 * handling because field/frame neighbours cannot be mapped consistently
 * into the cache, so the MV is fetched from the picture-wide arrays and
 * rescaled via SET_DIAG_MV.
 * NOTE(review): several lines are missing from this excerpt (e.g. the
 * FRAME_MBAFF guard around the MBAFF section, #undef SET_DIAG_MV, and some
 * returns/braces) — verify against upstream before modifying.
 */
687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
688 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
689 MpegEncContext *s = &h->s;
691 /* there is no consistent mapping of mvs to neighboring locations that will
692 * make mbaff happy, so we can't move all this logic to fill_caches */
694 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
/* Scratch cache slot scan8[0]-2 holds the fetched/rescaled MV. */
696 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
697 *C = h->mv_cache[list][scan8[0]-2];
700 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
701 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
702 if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV(MV_OP, REF_OP, X4, Y4): look up the MB at 4x4 position
 * (X4,Y4); if it uses 'list', copy its MV into the scratch slot with the
 * y component rescaled by MV_OP and return its ref index adjusted by
 * REF_OP; otherwise return LIST_NOT_USED. */
703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
704 const int x4 = X4, y4 = Y4;\
705 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
706 if(!USES_LIST(mb_type,list))\
707 return LIST_NOT_USED;\
708 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
709 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
710 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
711 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
/* Frame MB, field neighbour above: field MV -> frame units (*2, ref>>1). */
713 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
716 if(topright_ref == PART_NOT_AVAILABLE
717 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
718 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
720 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
721 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
724 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
726 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
727 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
/* Non-MBAFF fallback: use the cached top-right MV, else top-left. */
733 if(topright_ref != PART_NOT_AVAILABLE){
734 *C= h->mv_cache[list][ i - 8 + part_width ];
737 tprintf(s->avctx, "topright MV not available\n");
739 *C= h->mv_cache[list][ i - 8 - 1 ];
740 return h->ref_cache[list][ i - 8 - 1 ];
745 * gets the predicted MV.
746 * @param n the block index
747 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
748 * @param mx the x component of the predicted motion vector
749 * @param my the y component of the predicted motion vector
751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
752 const int index8= scan8[n];
/* A = left neighbour, B = top neighbour, C = diagonal (via
 * fetch_diagonal_mv). Median prediction per H.264 8.4.1.3.
 * NOTE(review): interior lines are missing from this excerpt (the
 * declarations of C/mv, and the single-match/else branches around
 * original lines 761-769 and 777-795) — verify before editing. */
753 const int top_ref= h->ref_cache[list][ index8 - 8 ];
754 const int left_ref= h->ref_cache[list][ index8 - 1 ];
755 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
756 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
758 int diagonal_ref, match_count;
760 assert(part_width==1 || part_width==2 || part_width==4);
770 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
771 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
772 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
/* Two or three neighbours share the target ref: plain median. */
773 if(match_count > 1){ //most common
774 *mx= mid_pred(A[0], B[0], C[0]);
775 *my= mid_pred(A[1], B[1], C[1]);
776 }else if(match_count==1){
780 }else if(top_ref==ref){
/* Only the left neighbour exists at all: use A directly; otherwise median. */
788 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
792 *mx= mid_pred(A[0], B[0], C[0]);
793 *my= mid_pred(A[1], B[1], C[1]);
797 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
801 * gets the directionally predicted 16x8 MV.
802 * @param n the block index
803 * @param mx the x component of the predicted motion vector
804 * @param my the y component of the predicted motion vector
806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
/* Top 16x8 partition: prefer the top neighbour (B) if it has the same ref;
 * bottom partition: prefer the left neighbour (A). The shortcut-return
 * bodies and the n==0 branch structure (original lines 807, 812-818,
 * 823-829) are missing from this excerpt. */
808 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
809 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
811 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
819 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
820 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
822 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
/* Fallback: general median prediction over the full 16-wide partition. */
832 pred_motion(h, n, 4, list, ref, mx, my);
836 * gets the directionally predicted 8x16 MV.
837 * @param n the block index
838 * @param mx the x component of the predicted motion vector
839 * @param my the y component of the predicted motion vector
841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
/* Left 8x16 partition: prefer the left neighbour (A) if refs match;
 * right partition: prefer the diagonal candidate C. The shortcut-return
 * bodies (original lines 842, 847-856, 862-868) are missing from this
 * excerpt. */
843 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
844 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
846 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
857 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
859 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
861 if(diagonal_ref == ref){
/* Fallback: general median prediction (part_width 2 = 8 pixels). */
869 pred_motion(h, n, 2, list, ref, mx, my);
/**
 * Predicts the MV for a P-skip macroblock: zero MV when either the top or
 * left neighbour is unavailable, or when either has ref 0 with a zero MV
 * (H.264 P_Skip condition); otherwise falls back to the normal 16x16
 * median prediction with ref 0. (The zero-MV assignment and return between
 * the condition and the fallback, original lines 881-885, are not visible
 * in this excerpt.)
 */
872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
873 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
874 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
876 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
878 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
879 || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
880 || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
886 pred_motion(h, 0, 4, 0, 0, mx, my);
/**
 * Computes the temporal-direct distance scale factor for ref-list-0 entry i
 * (H.264 8.4.1.2.3): tb/td ratio in fixed point, clipped to [-1024,1023].
 * NOTE(review): the branch body for td==0 / long-term refs (original lines
 * 895-896, presumably "return 256;" and the else) is missing from this
 * excerpt — confirm against upstream.
 */
891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
892 int poc0 = h->ref_list[0][i].poc;
893 int td = av_clip(poc1 - poc0, -128, 127);
894 if(td == 0 || h->ref_list[0][i].long_ref){
897 int tb = av_clip(poc - poc0, -128, 127);
/* tx ~= 16384/td with rounding; result = clip((tb*tx+32)>>6). */
898 int tx = (16384 + (FFABS(td) >> 1)) / td;
899 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
/**
 * Fills h->dist_scale_factor (and, for MBAFF, the per-field
 * dist_scale_factor_field tables) used by temporal direct prediction.
 * NOTE(review): the conditional that separates the field loop from the
 * frame path (original lines 907, 913-914) is missing from this excerpt,
 * which is why two declarations of poc/poc1 appear — the field loop is
 * guarded by FRAME_MBAFF upstream; confirm before editing.
 */
903 static inline void direct_dist_scale_factor(H264Context * const h){
904 MpegEncContext * const s = &h->s;
905 const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
906 const int poc1 = h->ref_list[1][0].poc;
908 for(field=0; field<2; field++){
909 const int poc = h->s.current_picture_ptr->field_poc[field];
910 const int poc1 = h->ref_list[1][0].field_poc[field];
/* Field entries live at offset 16 in ref_list (see i+16 below). */
911 for(i=0; i < 2*h->ref_count[0]; i++)
912 h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
915 for(i=0; i<h->ref_count[0]; i++){
916 h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
/**
 * Builds the mapping from the co-located picture's (ref_list[1][0])
 * reference indices to the current slice's list-0/1 indices, matching
 * references by POC (frame_num*4 + reference field bits). Used by
 * temporal direct mode.
 * @param map      output table; entries [16..] are the MBAFF/field ones
 * @param field    current field parity (MBAFF path)
 * @param colfield field parity of the co-located picture
 * @param mbafi    nonzero for the MBAFF field-pair variant
 * NOTE(review): interior lines (e.g. the frame/field poc adjustment around
 * original lines 934-936 and loop-closing braces) are missing from this
 * excerpt.
 */
920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
921 MpegEncContext * const s = &h->s;
922 Picture * const ref1 = &h->ref_list[1][0];
923 int j, old_ref, rfield;
924 int start= mbafi ? 16 : 0;
925 int end = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
926 int interl= mbafi || s->picture_structure != PICT_FRAME;
928 /* bogus; fills in for missing frames */
929 memset(map[list], 0, sizeof(map[list]));
931 for(rfield=0; rfield<2; rfield++){
932 for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
933 int poc = ref1->ref_poc[colfield][list][old_ref];
937 else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
938 poc= (poc&~3) + rfield + 1;
940 for(j=start; j<end; j++){
941 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
942 int cur_ref= mbafi ? (j-16)^field : j;
943 map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
945 map[list][old_ref] = cur_ref;
/* Records the current picture's reference lists (count and the
 * 4*frame_num+parity keys) into the Picture itself, then — for B
 * slices using temporal direct mode — precomputes the colocated
 * reference maps via fill_colmap().
 * NOTE(review): numbered listing with dropped lines (loop variable
 * declarations, early return, closing braces); code kept as-is. */
953 static inline void direct_ref_list_init(H264Context * const h){
954 MpegEncContext * const s = &h->s;
955 Picture * const ref1 = &h->ref_list[1][0];
956 Picture * const cur = s->current_picture_ptr;
/* sidx selects the field slot (top/bottom) being written */
958 int sidx= (s->picture_structure&1)^1;
959 int ref1sidx= (ref1->reference&1)^1;
961 for(list=0; list<2; list++){
962 cur->ref_count[sidx][list] = h->ref_count[list];
963 for(j=0; j<h->ref_count[list]; j++)
964 cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
/* progressive frames: both field slots share the same lists */
967 if(s->picture_structure == PICT_FRAME){
968 memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
969 memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
972 cur->mbaff= FRAME_MBAFF;
/* maps only needed for temporal direct in B slices */
974 if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
977 for(list=0; list<2; list++){
978 fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
979 for(field=0; field<2; field++)
980 fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
/* B-direct motion prediction for one macroblock: fills mv_cache,
 * ref_cache and sub_mb_type for direct-coded (sub-)blocks, and adjusts
 * *mb_type. Two modes: spatial direct (neighbor-median MVs with the
 * colocated-zero-MV shortcut) and temporal direct (colocated MVs scaled
 * by dist_scale_factor). Heavy interlace/MBAFF special-casing to locate
 * the correct colocated macroblock.
 * NOTE(review): this is a numbered listing with many interior lines
 * dropped (declarations, else-branches, closing braces); code text is
 * kept byte-identical, comments only added. */
984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
985 MpegEncContext * const s = &h->s;
986 int b8_stride = h->b8_stride;
987 int b4_stride = h->b_stride;
988 int mb_xy = h->mb_xy;
990 const int16_t (*l1mv0)[2], (*l1mv1)[2];
991 const int8_t *l1ref0, *l1ref1;
992 const int is_b8x8 = IS_8X8(*mb_type);
993 unsigned int sub_mb_type;
996 assert(h->ref_list[1][0].reference&3);
998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
/* Locate the colocated MB in the list-1 reference, converting between
 * frame and field addressing as needed. */
1000 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
1002 int cur_poc = s->current_picture_ptr->poc;
1003 int *col_poc = h->ref_list[1]->field_poc;
/* pick the colocated field temporally closest to the current picture */
1004 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1007 }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008 int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009 mb_xy += s->mb_stride*fieldoff;
1012 }else{ // AFL/AFR/FR/FL -> AFR/FR
1013 if(IS_INTERLACED(*mb_type)){ // AFL /FL -> AFR/FR
1014 mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1019 //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1023 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024 *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1026 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1029 }else{ // AFR/FR -> AFR/FR
1032 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033 if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035 * so we know exactly what block size to use */
1036 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038 }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040 *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1042 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
/* pointers into the colocated picture's MVs and reference indices */
1048 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049 l1mv1 = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050 l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051 l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1054 l1ref0 += h->b8_stride;
1055 l1ref1 += h->b8_stride;
1056 l1mv0 += 2*b4_stride;
1057 l1mv1 += 2*b4_stride;
/* ---- spatial direct mode ---- */
1061 if(h->direct_spatial_mv_pred){
1066 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1068 /* ref = min(neighbors) */
1069 for(list=0; list<2; list++){
1070 int refa = h->ref_cache[list][scan8[0] - 1];
1071 int refb = h->ref_cache[list][scan8[0] - 8];
1072 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073 if(refc == PART_NOT_AVAILABLE)
1074 refc = h->ref_cache[list][scan8[0] - 8 - 1];
/* unsigned min treats negative (unavailable) refs as huge */
1075 ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080 if(ref[0] < 0 && ref[1] < 0){
1081 ref[0] = ref[1] = 0;
1082 mv[0][0] = mv[0][1] =
1083 mv[1][0] = mv[1][1] = 0;
1085 for(list=0; list<2; list++){
1087 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1089 mv[list][0] = mv[list][1] = 0;
/* drop the unused list from the (sub-)MB type */
1095 *mb_type &= ~MB_TYPE_L1;
1096 sub_mb_type &= ~MB_TYPE_L1;
1097 }else if(ref[0] < 0){
1099 *mb_type &= ~MB_TYPE_L0;
1100 sub_mb_type &= ~MB_TYPE_L0;
/* interlace mismatch with colocated: per-8x8 handling */
1103 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104 for(i8=0; i8<4; i8++){
1107 int xy8 = x8+y8*b8_stride;
1108 int xy4 = 3*x8+y8*b4_stride;
1111 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1113 h->sub_mb_type[i8] = sub_mb_type;
1115 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
/* colocated-zero shortcut: tiny colocated MV to ref 0 => MV forced 0 */
1117 if(!IS_INTRA(mb_type_col[y8])
1118 && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119 || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1121 a= pack16to32(mv[0][0],mv[0][1]);
1123 b= pack16to32(mv[1][0],mv[1][1]);
1125 a= pack16to32(mv[0][0],mv[0][1]);
1126 b= pack16to32(mv[1][0],mv[1][1]);
1128 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1131 }else if(IS_16X16(*mb_type)){
1134 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136 if(!IS_INTRA(mb_type_col[0])
1137 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139 && (h->x264_build>33 || !h->x264_build)))){
1141 a= pack16to32(mv[0][0],mv[0][1]);
1143 b= pack16to32(mv[1][0],mv[1][1]);
1145 a= pack16to32(mv[0][0],mv[0][1]);
1146 b= pack16to32(mv[1][0],mv[1][1]);
1148 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
/* generic 8x8 spatial-direct path */
1151 for(i8=0; i8<4; i8++){
1152 const int x8 = i8&1;
1153 const int y8 = i8>>1;
1155 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1157 h->sub_mb_type[i8] = sub_mb_type;
1159 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1165 if(!IS_INTRA(mb_type_col[0]) && ( l1ref0[x8 + y8*b8_stride] == 0
1166 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167 && (h->x264_build>33 || !h->x264_build)))){
1168 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169 if(IS_SUB_8X8(sub_mb_type)){
1170 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1173 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1175 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1178 for(i4=0; i4<4; i4++){
1179 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1182 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1184 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
/* ---- temporal direct mode: scale colocated MVs ---- */
1190 }else{ /* direct temporal mv pred */
1191 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192 const int *dist_scale_factor = h->dist_scale_factor;
1195 if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196 map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197 map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198 dist_scale_factor =h->dist_scale_factor_field[s->mb_y&1];
1200 if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1203 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204 /* FIXME assumes direct_8x8_inference == 1 */
1205 int y_shift = 2*!IS_INTERLACED(*mb_type);
1207 for(i8=0; i8<4; i8++){
1208 const int x8 = i8&1;
1209 const int y8 = i8>>1;
1211 const int16_t (*l1mv)[2]= l1mv0;
1213 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1215 h->sub_mb_type[i8] = sub_mb_type;
1217 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218 if(IS_INTRA(mb_type_col[y8])){
1219 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1225 ref0 = l1ref0[x8 + y8*b8_stride];
1227 ref0 = map_col_to_list0[0][ref0 + ref_offset];
1229 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1232 scale = dist_scale_factor[ref0];
1233 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1236 const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
/* vertical MV rescaled between field and frame units */
1237 int my_col = (mv_col[1]<<y_shift)/2;
1238 int mx = (scale * mv_col[0] + 128) >> 8;
1239 int my = (scale * my_col + 128) >> 8;
1240 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1247 /* one-to-one mv scaling */
1249 if(IS_16X16(*mb_type)){
1252 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253 if(IS_INTRA(mb_type_col[0])){
1256 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258 const int scale = dist_scale_factor[ref0];
1259 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1261 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
/* L1 MV is the L0 MV minus the colocated MV (spec eq. for mvL1) */
1264 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1267 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1271 for(i8=0; i8<4; i8++){
1272 const int x8 = i8&1;
1273 const int y8 = i8>>1;
1275 const int16_t (*l1mv)[2]= l1mv0;
1277 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1279 h->sub_mb_type[i8] = sub_mb_type;
1280 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281 if(IS_INTRA(mb_type_col[0])){
1282 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1288 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1290 ref0 = map_col_to_list0[0][ref0];
1292 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1295 scale = dist_scale_factor[ref0];
1297 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298 if(IS_SUB_8X8(sub_mb_type)){
1299 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300 int mx = (scale * mv_col[0] + 128) >> 8;
1301 int my = (scale * mv_col[1] + 128) >> 8;
1302 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1305 for(i4=0; i4<4; i4++){
1306 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
/* Copies the per-MB caches (mv_cache / ref_cache / mvd_cache /
 * sub_mb_type direct flags) back into the frame-wide arrays of
 * current_picture so later MBs and the loop filter can read them.
 * NOTE(review): numbered listing with dropped lines (inner y-loops,
 * continue statements, closing braces); code kept byte-identical. */
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319 MpegEncContext * const s = &h->s;
/* b_xy / b8_xy: this MB's origin in 4x4 and 8x8 block units */
1320 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1324 if(!USES_LIST(mb_type, 0))
1325 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1327 for(list=0; list<h->list_count; list++){
1329 if(!USES_LIST(mb_type, list))
/* two 64-bit stores copy a whole row of four 4x4-block MVs */
1333 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1336 if( h->pps.cabac ) {
1337 if(IS_SKIP(mb_type))
1338 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1341 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
/* direct flags needed by CABAC context of later B MBs */
1355 if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356 if(IS_8X8(mb_type)){
1357 uint8_t *direct_table = &h->direct_table[b8_xy];
1358 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
/* Parses one NAL unit header and un-escapes its payload: removes the
 * Annex-B emulation-prevention bytes (00 00 03 -> 00 00) into an
 * internal rbsp_buffer. Returns a pointer to the clean RBSP (or src
 * itself when no escapes were found), sets *dst_length and *consumed.
 * NOTE(review): numbered listing with dropped lines (loop bodies,
 * returns, closing braces); code kept byte-identical. */
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1370 // src[0]&0x80; //forbidden bit
1371 h->nal_ref_idc= src[0]>>5;
1372 h->nal_unit_type= src[0]&0x1F;
1376 for(i=0; i<length; i++)
1377 printf("%2X ", src[i]);
/* fast scan for a 00 00 (potential escape/startcode) using word loads */
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1383 for(i=0; i+1<length; i+=9){
1384 if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1387 for(i=0; i+1<length; i+=5){
1388 if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1391 if(i>0 && !src[i]) i--;
/* byte-wise fallback scan */
1395 for(i=0; i+1<length; i+=2){
1396 if(src[i]) continue;
1397 if(i>0 && src[i-1]==0) i--;
1399 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1401 /* startcode, so we must be past the end */
1409 if(i>=length-1){ //no escaped 0
1410 *dst_length= length;
1411 *consumed= length+1; //+1 for the header
1415 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416 av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417 dst= h->rbsp_buffer[bufidx];
1423 //printf("decoding esc\n");
/* copy the escape-free prefix, then strip 00 00 03 sequences */
1424 memcpy(dst, src, i);
1427 //remove escapes (very rare 1:2^22)
1429 dst[di++]= src[si++];
1430 dst[di++]= src[si++];
1431 }else if(src[si]==0 && src[si+1]==0){
1432 if(src[si+2]==3){ //escape
1437 }else //next start code
1441 dst[di++]= src[si++];
1444 dst[di++]= src[si++];
1447 memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1450 *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
/* Identifies the rbsp_stop_one_bit at the end of a NAL unit.
 * NOTE(review): body almost entirely dropped by the numbered listing;
 * only the signature and a trace line survive — kept byte-identical. */
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1459 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1469 * IDCT transforms the 16 dc values and dequantizes them.
1470 * @param qp quantization parameter
/* 4x4 Hadamard inverse transform + dequant of the 16 luma DC values
 * (Intra16x16 mode). Two passes of butterflies: columns into temp[],
 * then rows written back with (x*qmul+128)>>8 dequant rounding.
 * The DC values sit at stride-16 spacing inside the residual block
 * array, hence the x_offset/y_offset tables.
 * NOTE(review): numbered listing with dropped lines (i/stride
 * declarations, temp[] stores, loop headers); code kept byte-identical. */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1475 int temp[16]; //FIXME check if this is a good idea
1476 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1477 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1479 //memset(block, 64, 2*256);
/* vertical butterflies */
1482 const int offset= y_offset[i];
1483 const int z0= block[offset+stride*0] + block[offset+stride*4];
1484 const int z1= block[offset+stride*0] - block[offset+stride*4];
1485 const int z2= block[offset+stride*1] - block[offset+stride*5];
1486 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* horizontal butterflies + dequant */
1495 const int offset= x_offset[i];
1496 const int z0= temp[4*0+i] + temp[4*2+i];
1497 const int z1= temp[4*0+i] - temp[4*2+i];
1498 const int z2= temp[4*1+i] - temp[4*3+i];
1499 const int z3= temp[4*1+i] + temp[4*3+i];
1501 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1510 * DCT transforms the 16 dc values.
1511 * @param qp quantization parameter ??? FIXME
/* Forward 4x4 Hadamard transform of the 16 luma DC values (encoder
 * counterpart of h264_luma_dc_dequant_idct_c); outputs halved (>>1).
 * NOTE(review): numbered listing with dropped lines (declarations,
 * temp[] stores, loop headers, closing braces); code kept as-is. */
1513 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1514 // const int qmul= dequant_coeff[qp][0];
1516 int temp[16]; //FIXME check if this is a good idea
1517 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1518 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
/* vertical butterflies */
1521 const int offset= y_offset[i];
1522 const int z0= block[offset+stride*0] + block[offset+stride*4];
1523 const int z1= block[offset+stride*0] - block[offset+stride*4];
1524 const int z2= block[offset+stride*1] - block[offset+stride*5];
1525 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* horizontal butterflies, result scaled by 1/2 */
1534 const int offset= x_offset[i];
1535 const int z0= temp[4*0+i] + temp[4*2+i];
1536 const int z1= temp[4*0+i] - temp[4*2+i];
1537 const int z2= temp[4*1+i] - temp[4*3+i];
1538 const int z3= temp[4*1+i] + temp[4*3+i];
1540 block[stride*0 +offset]= (z0 + z3)>>1;
1541 block[stride*2 +offset]= (z1 + z2)>>1;
1542 block[stride*8 +offset]= (z1 - z2)>>1;
1543 block[stride*10+offset]= (z0 - z3)>>1;
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552 const int stride= 16*2;
1553 const int xStride= 16;
1556 a= block[stride*0 + xStride*0];
1557 b= block[stride*0 + xStride*1];
1558 c= block[stride*1 + xStride*0];
1559 d= block[stride*1 + xStride*1];
1566 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1573 static void chroma_dc_dct_c(DCTELEM *block){
1574 const int stride= 16*2;
1575 const int xStride= 16;
1578 a= block[stride*0 + xStride*0];
1579 b= block[stride*0 + xStride*1];
1580 c= block[stride*1 + xStride*0];
1581 d= block[stride*1 + xStride*1];
1588 block[stride*0 + xStride*0]= (a+c);
1589 block[stride*0 + xStride*1]= (e+b);
1590 block[stride*1 + xStride*0]= (a-c);
1591 block[stride*1 + xStride*1]= (e-b);
1596 * gets the chroma qp.
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599 return h->pps.chroma_qp_table[t][qscale];
/* Motion compensation for one partition in one direction (list):
 * quarter-pel luma via qpix_op, eighth-pel chroma via chroma_op, with
 * edge emulation when the MV points (partly) outside the picture.
 * @param square nonzero when one qpix_op call covers the partition;
 *               otherwise a second call at +delta is made
 * NOTE(review): numbered listing with dropped lines (emu flag setup,
 * closing braces); code kept byte-identical. */
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604 int src_x_offset, int src_y_offset,
1605 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606 MpegEncContext * const s = &h->s;
/* mx/my in quarter-pel units, including the partition offset */
1607 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609 const int luma_xy= (mx&3) + ((my&3)<<2);
1610 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611 uint8_t * src_cb, * src_cr;
1612 int extra_width= h->emu_edge_width;
1613 int extra_height= h->emu_edge_height;
1615 const int full_mx= mx>>2;
1616 const int full_my= my>>2;
1617 const int pic_width = 16*s->mb_width;
1618 const int pic_height = 16*s->mb_height >> MB_FIELD;
/* sub-pel interpolation reads 3 extra pixels on each side */
1620 if(mx&7) extra_width -= 3;
1621 if(my&7) extra_height -= 3;
1623 if( full_mx < 0-extra_width
1624 || full_my < 0-extra_height
1625 || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626 || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1632 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1634 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1637 if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1640 // chroma offset when predicting from a field of opposite parity
1641 my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1644 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1648 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649 src_cb= s->edge_emu_buffer;
1651 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1654 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655 src_cr= s->edge_emu_buffer;
1657 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
/* Unweighted (standard) MC for one partition: list-0 prediction with
 * the "put" ops; when list 1 is also used, the second prediction is
 * blended in by switching to the "avg" ops.
 * NOTE(review): numbered listing with dropped lines (if(list0/list1),
 * qpix_op= qpix_avg, closing braces); code kept byte-identical. */
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662 int x_offset, int y_offset,
1663 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665 int list0, int list1){
1666 MpegEncContext * const s = &h->s;
1667 qpel_mc_func *qpix_op= qpix_put;
1668 h264_chroma_mc_func chroma_op= chroma_put;
/* advance dest pointers to the partition, offsets in chroma units */
1670 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1671 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1672 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1673 x_offset += 8*s->mb_x;
1674 y_offset += 8*(s->mb_y >> MB_FIELD);
1677 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680 qpix_op, chroma_op);
/* second list averages on top of the first prediction */
1683 chroma_op= chroma_avg;
1687 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690 qpix_op, chroma_op);
/* Weighted-prediction MC for one partition. Bi-directional case:
 * predict each list into separate buffers (dest / obmc_scratchpad)
 * and combine with biweight ops — implicit weights (use_weight==2)
 * or explicit per-ref weights/offsets. Uni-directional case: predict
 * then apply the single-list weight in place.
 * NOTE(review): numbered listing with dropped lines (if(list0&&list1),
 * else branches, closing braces); code kept byte-identical. */
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696 int x_offset, int y_offset,
1697 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700 int list0, int list1){
1701 MpegEncContext * const s = &h->s;
1703 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1704 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1705 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1706 x_offset += 8*s->mb_x;
1707 y_offset += 8*(s->mb_y >> MB_FIELD);
1710 /* don't optimize for luma-only case, since B-frames usually
1711 * use implicit weights => chroma too. */
1712 uint8_t *tmp_cb = s->obmc_scratchpad;
1713 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715 int refn0 = h->ref_cache[0][ scan8[n] ];
1716 int refn1 = h->ref_cache[1][ scan8[n] ];
1718 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719 dest_y, dest_cb, dest_cr,
1720 x_offset, y_offset, qpix_put, chroma_put);
1721 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722 tmp_y, tmp_cb, tmp_cr,
1723 x_offset, y_offset, qpix_put, chroma_put);
/* implicit weighting: weights from POC distances, denom fixed at 5 */
1725 if(h->use_weight == 2){
1726 int weight0 = h->implicit_weight[refn0][refn1];
1727 int weight1 = 64 - weight0;
1728 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1729 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
/* explicit weighting: per-ref weight and summed offsets */
1732 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
/* uni-directional weighted prediction */
1743 int list = list1 ? 1 : 0;
1744 int refn = h->ref_cache[list][ scan8[n] ];
1745 Picture *ref= &h->ref_list[list][refn];
1746 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748 qpix_put, chroma_put);
1750 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752 if(h->use_weight_chroma){
1753 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
/* Dispatches one partition to weighted or standard MC. Weighted path
 * taken for explicit weighting, or implicit weighting whenever the
 * implicit weight differs from the 32/32 even split (which plain
 * averaging already produces).
 * NOTE(review): numbered listing; code kept byte-identical. */
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763 int x_offset, int y_offset,
1764 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767 int list0, int list1){
1768 if((h->use_weight==2 && list0 && list1
1769 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770 || h->use_weight==1)
1771 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772 x_offset, y_offset, qpix_put, chroma_put,
1773 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1775 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
/* Issues cache prefetches into the reference picture at the position
 * this MB's MV points to, staggered by mb_x so consecutive MBs touch
 * different cache lines.
 * NOTE(review): numbered listing with dropped lines (refn>=0 guard,
 * closing braces); code kept byte-identical. */
1779 static inline void prefetch_motion(H264Context *h, int list){
1780 /* fetch pixels for estimated mv 4 macroblocks ahead
1781 * optimized for 64byte cache lines */
1782 MpegEncContext * const s = &h->s;
1783 const int refn = h->ref_cache[list][scan8[0]];
1785 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787 uint8_t **src= h->ref_list[list][refn].data;
1788 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789 s->dsp.prefetch(src[0]+off, s->linesize, 4);
/* chroma planes are contiguous: one prefetch covers cb and cr */
1790 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799 MpegEncContext * const s = &h->s;
1800 const int mb_xy= h->mb_xy;
1801 const int mb_type= s->current_picture.mb_type[mb_xy];
1803 assert(IS_INTER(mb_type));
1805 prefetch_motion(h, 0);
1807 if(IS_16X16(mb_type)){
1808 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810 &weight_op[0], &weight_avg[0],
1811 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812 }else if(IS_16X8(mb_type)){
1813 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815 &weight_op[1], &weight_avg[1],
1816 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819 &weight_op[1], &weight_avg[1],
1820 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821 }else if(IS_8X16(mb_type)){
1822 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824 &weight_op[2], &weight_avg[2],
1825 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828 &weight_op[2], &weight_avg[2],
1829 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1833 assert(IS_8X8(mb_type));
1836 const int sub_mb_type= h->sub_mb_type[i];
1838 int x_offset= (i&1)<<2;
1839 int y_offset= (i&2)<<1;
1841 if(IS_SUB_8X8(sub_mb_type)){
1842 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844 &weight_op[3], &weight_avg[3],
1845 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846 }else if(IS_SUB_8X4(sub_mb_type)){
1847 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849 &weight_op[4], &weight_avg[4],
1850 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853 &weight_op[4], &weight_avg[4],
1854 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855 }else if(IS_SUB_4X8(sub_mb_type)){
1856 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858 &weight_op[5], &weight_avg[5],
1859 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862 &weight_op[5], &weight_avg[5],
1863 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1866 assert(IS_SUB_4X4(sub_mb_type));
1868 int sub_x_offset= x_offset + 2*(j&1);
1869 int sub_y_offset= y_offset + (j&2);
1870 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872 &weight_op[6], &weight_avg[6],
1873 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1879 prefetch_motion(h, 1);
/* Builds cavlc_level_tab: for each suffix length and each possible
 * LEVEL_TAB_BITS-wide bit pattern, the decoded level and its bit
 * length — or a prefix+100 escape marker when the code does not fit
 * in the table and must be decoded the slow way.
 * NOTE(review): numbered listing with dropped lines (declarations,
 * closing braces); code kept byte-identical. */
1882 static av_cold void init_cavlc_level_tab(void){
1883 int suffix_length, mask;
1886 for(suffix_length=0; suffix_length<7; suffix_length++){
1887 for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
/* prefix = number of leading zeros in the pattern */
1888 int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889 int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
/* map unsigned code to signed level: even -> +, odd -> - */
1891 mask= -(level_code&1);
1892 level_code= (((2+level_code)>>1) ^ mask) - mask;
1893 if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894 cavlc_level_tab[suffix_length][i][0]= level_code;
1895 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896 }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1900 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
/**
 * One-time initialization of all static CAVLC VLC tables
 * (coeff_token, total_zeros, run_before) into the preallocated
 * static arrays, using INIT_VLC_USE_NEW_STATIC.
 * NOTE(review): guard logic around `done` and some loop headers are
 * missing from this excerpt — extraction artifact.
 */
1907 static av_cold void decode_init_vlc(void){
/* guards against repeated initialization across decoder instances */
1908 static int done = 0;
/* chroma DC coeff_token: 4*5 possible (total_coeff, trailing_ones) codes */
1915 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918 &chroma_dc_coeff_token_len [0], 1, 1,
1919 &chroma_dc_coeff_token_bits[0], 1, 1,
1920 INIT_VLC_USE_NEW_STATIC);
/* luma coeff_token: 4 tables selected by nC, packed back-to-back at `offset` */
1924 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927 &coeff_token_len [i][0], 1, 1,
1928 &coeff_token_bits[i][0], 1, 1,
1929 INIT_VLC_USE_NEW_STATIC);
1930 offset += coeff_token_vlc_tables_size[i];
1933 * This is a one time safety check to make sure that
1934 * the packed static coeff_token_vlc table sizes
1935 * were initialized correctly.
1937 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
/* chroma DC total_zeros: 3 tables (one per total_coeff-1) */
1940 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942 init_vlc(&chroma_dc_total_zeros_vlc[i],
1943 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944 &chroma_dc_total_zeros_len [i][0], 1, 1,
1945 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946 INIT_VLC_USE_NEW_STATIC);
/* luma total_zeros: 15 tables (one per total_coeff-1) */
1948 for(i=0; i<15; i++){
1949 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951 init_vlc(&total_zeros_vlc[i],
1952 TOTAL_ZEROS_VLC_BITS, 16,
1953 &total_zeros_len [i][0], 1, 1,
1954 &total_zeros_bits[i][0], 1, 1,
1955 INIT_VLC_USE_NEW_STATIC);
/* run_before: 6 small tables for zeros_left 1..6 */
1959 run_vlc[i].table = run_vlc_tables[i];
1960 run_vlc[i].table_allocated = run_vlc_tables_size;
1961 init_vlc(&run_vlc[i],
1963 &run_len [i][0], 1, 1,
1964 &run_bits[i][0], 1, 1,
1965 INIT_VLC_USE_NEW_STATIC);
/* separate, larger table for zeros_left > 6 */
1967 run7_vlc.table = run7_vlc_table,
1968 run7_vlc.table_allocated = run7_vlc_table_size;
1969 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970 &run_len [6][0], 1, 1,
1971 &run_bits[6][0], 1, 1,
1972 INIT_VLC_USE_NEW_STATIC);
1974 init_cavlc_level_tab();
/**
 * Free all per-picture decoding tables and per-thread buffers.
 * Counterpart to alloc_tables()/context_init(); av_freep() NULLs
 * each pointer so a repeated call is safe.
 */
1978 static void free_tables(H264Context *h){
1981 av_freep(&h->intra4x4_pred_mode);
1982 av_freep(&h->chroma_pred_mode_table);
1983 av_freep(&h->cbp_table);
1984 av_freep(&h->mvd_table[0]);
1985 av_freep(&h->mvd_table[1]);
1986 av_freep(&h->direct_table);
1987 av_freep(&h->non_zero_count);
1988 av_freep(&h->slice_table_base);
/* slice_table points inside slice_table_base, so just clear it */
1989 h->slice_table= NULL;
1991 av_freep(&h->mb2b_xy);
1992 av_freep(&h->mb2b8_xy);
/* per-thread contexts own their own border/scratch/rbsp buffers */
1994 for(i = 0; i < MAX_THREADS; i++) {
1995 hx = h->thread_context[i];
1997 av_freep(&hx->top_borders[1]);
1998 av_freep(&hx->top_borders[0]);
1999 av_freep(&hx->s.obmc_scratchpad);
2000 av_freep(&hx->rbsp_buffer[1]);
2001 av_freep(&hx->rbsp_buffer[0]);
2002 hx->rbsp_buffer_size[0] = 0;
2003 hx->rbsp_buffer_size[1] = 0;
/* context 0 is the main context itself; only free the clones */
2004 if (i) av_freep(&h->thread_context[i]);
/**
 * Build the 8x8 dequantization tables for all 52 QP values from the
 * PPS scaling matrices. If both 8x8 matrices are identical, table 1
 * aliases table 0 to save work. Coefficients are stored transposed
 * when a non-C IDCT (with permuted input) is in use.
 */
2008 static void init_dequant8_coeff_table(H264Context *h){
2010 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2011 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2012 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2014 for(i=0; i<2; i++ ){
/* identical intra/inter matrices -> share one table */
2015 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2016 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2020 for(q=0; q<52; q++){
/* shift = qp/6; base coefficients repeat with period 6 */
2021 int shift = div6[q];
2024 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2025 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2026 h->pps.scaling_matrix8[i][x]) << shift;
/**
 * Build the 4x4 dequantization tables (6 matrices: intra/inter Y, Cb, Cr)
 * for all 52 QP values from the PPS scaling matrices. Matrices with
 * identical scaling lists alias an earlier buffer instead of being
 * recomputed. Stored transposed when a permuting IDCT is in use.
 */
2031 static void init_dequant4_coeff_table(H264Context *h){
2033 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2034 for(i=0; i<6; i++ ){
2035 h->dequant4_coeff[i] = h->dequant4_buffer[i];
/* reuse an earlier table when the scaling lists match */
2037 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2038 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2045 for(q=0; q<52; q++){
/* +2 scales the 4x4 base coefficients into the same range as 8x8 */
2046 int shift = div6[q] + 2;
2049 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2050 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2051 h->pps.scaling_matrix4[i][x]) << shift;
/**
 * Initialize all dequant tables for the current PPS/SPS.
 * In lossless (transform bypass) mode, QP 0 entries are forced to the
 * neutral value 1<<6 so dequantization becomes an identity.
 */
2056 static void init_dequant_tables(H264Context *h){
2058 init_dequant4_coeff_table(h);
2059 if(h->pps.transform_8x8_mode)
2060 init_dequant8_coeff_table(h);
2061 if(h->sps.transform_bypass){
2064 h->dequant4_coeff[i][0][x] = 1<<6;
2065 if(h->pps.transform_8x8_mode)
2068 h->dequant8_coeff[i][0][x] = 1<<6;
2075 * needs width/height
/**
 * Allocate per-picture decoding tables; requires mb_width/mb_height
 * (and the derived strides) to be known. Returns 0 on success; on
 * allocation failure the FF_ALLOCZ_OR_GOTO macros jump to a `fail`
 * label (outside this excerpt) that cleans up via free_tables().
 */
2077 static int alloc_tables(H264Context *h){
2078 MpegEncContext * const s = &h->s;
/* +1 row so edge macroblocks have valid neighbours in the tables */
2079 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2082 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail)
2084 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t), fail)
2085 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
2086 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
2088 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
2089 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
2090 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
2091 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
/* -1 marks "no slice"; slice_table is offset so [-stride..] accesses stay in bounds */
2093 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
2094 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
/* lookup tables: macroblock index -> 4x4 (b) and 8x8 (b8) block indices */
2096 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy , big_mb_num * sizeof(uint32_t), fail);
2097 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
2098 for(y=0; y<s->mb_height; y++){
2099 for(x=0; x<s->mb_width; x++){
2100 const int mb_xy= x + y*s->mb_stride;
2101 const int b_xy = 4*x + 4*y*h->b_stride;
2102 const int b8_xy= 2*x + 2*y*h->b8_stride;
2104 h->mb2b_xy [mb_xy]= b_xy;
2105 h->mb2b8_xy[mb_xy]= b8_xy;
/* allocated lazily in frame_start() once linesize is known */
2109 s->obmc_scratchpad = NULL;
2111 if(!h->dequant4_coeff[0])
2112 init_dequant_tables(h);
2121 * Mimic alloc_tables(), but for every context thread.
/**
 * Mimic alloc_tables() for a per-thread context: share (not copy) the
 * table pointers owned by the source context. dst must not free these;
 * free_tables() only releases them via the main context.
 */
2123 static void clone_tables(H264Context *dst, H264Context *src){
2124 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2125 dst->non_zero_count = src->non_zero_count;
2126 dst->slice_table = src->slice_table;
2127 dst->cbp_table = src->cbp_table;
2128 dst->mb2b_xy = src->mb2b_xy;
2129 dst->mb2b8_xy = src->mb2b8_xy;
2130 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2131 dst->mvd_table[0] = src->mvd_table[0];
2132 dst->mvd_table[1] = src->mvd_table[1];
2133 dst->direct_table = src->direct_table;
/* scratchpad is per-thread; allocated lazily in frame_start() */
2135 dst->s.obmc_scratchpad = NULL;
2136 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2141 * Allocate buffers which are not shared amongst multiple threads.
/**
 * Allocate the per-thread (non-shared) buffers: two rows of top-border
 * samples (16 luma + 8 Cb + 8 Cr bytes per macroblock column).
 * Returns 0 on success, -1 on allocation failure.
 */
2143 static int context_init(H264Context *h){
2144 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2145 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2149 return -1; // free_tables will clean up for us
/**
 * Common one-time initialization shared by the H.264 and SVQ3 decoders:
 * copies dimensions from the AVCodecContext, initializes prediction and
 * DSP function pointers, and seeds flat (all-16) scaling matrices.
 */
2152 static av_cold void common_init(H264Context *h){
2153 MpegEncContext * const s = &h->s;
2155 s->width = s->avctx->width;
2156 s->height = s->avctx->height;
2157 s->codec_id= s->avctx->codec->id;
2159 ff_h264_pred_init(&h->hpc, s->codec_id);
/* -1 = no PPS-derived dequant tables computed yet */
2161 h->dequant_coeff_pps= -1;
2162 s->unrestricted_mv=1;
2163 s->decode=1; //FIXME
/* needed so that the IDCT permutation is known early */
2165 dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
/* default "flat" scaling lists (value 16 = unity scaling) */
2167 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2168 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2172 * Reset SEI values at the beginning of the frame.
2174 * @param h H.264 context.
/**
 * Reset per-frame SEI state to "not present" defaults at the
 * beginning of each frame (-1 = absent/unknown).
 */
2176 static void reset_sei(H264Context *h) {
2177 h->sei_recovery_frame_cnt = -1;
2178 h->sei_dpb_output_delay = 0;
2179 h->sei_cpb_removal_delay = -1;
2180 h->sei_buffering_period_present = 0;
/**
 * AVCodec init callback: set up MpegEncContext defaults, common H.264
 * state, detect AVC (mp4-style) extradata, and normalize the time base
 * to field rate (ticks_per_frame = 2).
 */
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184 H264Context *h= avctx->priv_data;
2185 MpegEncContext * const s = &h->s;
2187 MPV_decode_defaults(s);
2192 s->out_format = FMT_H264;
2193 s->workaround_bugs= avctx->workaround_bugs;
2196 // s->decode_mb= ff_h263_decode_mb;
2197 s->quarter_sample = 1;
2198 if(!avctx->has_b_frames)
2201 avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
/* extradata starting with byte 1 => AVCDecoderConfigurationRecord (mp4/mkv) */
2205 if(avctx->extradata_size > 0 && avctx->extradata &&
2206 *(char *)avctx->extradata == 1){
2213 h->thread_context[0] = h;
2214 h->outputed_poc = INT_MIN;
/* large initial value so the first POC computation never wraps backwards */
2215 h->prev_poc_msb= 1<<16;
/* H.264 time base counts fields: double den once and mark 2 ticks/frame */
2217 if(avctx->codec_id == CODEC_ID_H264){
2218 if(avctx->ticks_per_frame == 1){
2219 s->avctx->time_base.den *=2;
2221 avctx->ticks_per_frame = 2;
/**
 * Begin decoding a new frame: run MPV/error-resilience frame setup,
 * precompute block offsets for both frame and field geometry, allocate
 * per-thread scratchpads (linesize only known here), and reset
 * per-picture state (key_frame, POCs, reference flag).
 * Returns a negative value if MPV_frame_start() fails.
 */
2226 static int frame_start(H264Context *h){
2227 MpegEncContext * const s = &h->s;
2230 if(MPV_frame_start(s, s->avctx) < 0)
2232 ff_er_frame_start(s);
2234 * MPV_frame_start uses pict_type to derive key_frame.
2235 * This is incorrect for H.264; IDR markings must be used.
2236 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2237 * See decode_nal_units().
2239 s->current_picture_ptr->key_frame= 0;
2240 s->current_picture_ptr->mmco_reset= 0;
2242 assert(s->linesize && s->uvlinesize);
/* block_offset[0..15]: frame geometry; [24..39]: field geometry (doubled row stride) */
2244 for(i=0; i<16; i++){
2245 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2246 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
/* chroma block offsets (Cb at +16, Cr at +20; field variants at +24) */
2249 h->block_offset[16+i]=
2250 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251 h->block_offset[24+16+i]=
2252 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2255 /* can't be in alloc_tables because linesize isn't known there.
2256 * FIXME: redo bipred weight to not require extra buffer? */
2257 for(i = 0; i < s->avctx->thread_count; i++)
2258 if(!h->thread_context[i]->s.obmc_scratchpad)
2259 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2261 /* some macroblocks will be accessed before they're available */
2262 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2263 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2265 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2267 // We mark the current picture as non-reference after allocating it, so
2268 // that if we break out due to an error it can be released automatically
2269 // in the next MPV_frame_start().
2270 // SVQ3 as well as most other codecs have only last/next/current and thus
2271 // get released even with set reference, besides SVQ3 and others do not
2272 // mark frames as reference later "naturally".
2273 if(s->codec_id != CODEC_ID_SVQ3)
2274 s->current_picture_ptr->reference= 0;
/* field POCs filled in per slice; INT_MAX = not yet decoded */
2276 s->current_picture_ptr->field_poc[0]=
2277 s->current_picture_ptr->field_poc[1]= INT_MAX;
2278 assert(s->current_picture_ptr->long_ref==0);
/**
 * Save the bottom row and right-edge column of the just-decoded
 * macroblock into top_borders[]/left_border[] so the deblocking filter
 * of neighbouring macroblocks can read pre-filter samples. MBAFF mode
 * keeps two border rows (one per field parity) and strides by 2.
 * NOTE(review): several declarations/branches are missing from this
 * excerpt (skiplast, top_idx/step defaults) — extraction artifact.
 */
2283 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2284 MpegEncContext * const s = &h->s;
2293 src_cb -= uvlinesize;
2294 src_cr -= uvlinesize;
2296 if(!simple && FRAME_MBAFF){
2298 offset = MB_MBAFF ? 1 : 17;
2299 uvoffset= MB_MBAFF ? 1 : 9;
/* save the last luma row (two 8-byte halves) of the top field MB */
2301 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2302 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
/* chroma only when not decoding luma-only (CODEC_FLAG_GRAY) */
2303 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2304 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2305 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2310 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2311 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2312 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2313 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2319 top_idx = MB_MBAFF ? 0 : 1;
2321 step= MB_MBAFF ? 2 : 1;
2324 // There are two lines saved, the line above the the top macroblock of a pair,
2325 // and the line above the bottom macroblock
2326 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2327 for(i=1; i<17 - skiplast; i++){
2328 h->left_border[offset+i*step]= src_y[15+i* linesize];
2331 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2332 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2334 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
/* chroma left borders live at offsets 34 (Cb) and 34+18 (Cr) */
2335 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2336 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2337 for(i=1; i<9 - skiplast; i++){
2338 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2339 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2341 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2342 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
/**
 * Swap (xchg=1) or restore (xchg=0) the saved border samples with the
 * picture edges around an intra macroblock, so intra prediction sees
 * unfiltered neighbour samples while the deblocking filter is active.
 * Deblocking across slice edges is only allowed when
 * deblocking_filter != 2 (2 = filter within slice only).
 * NOTE(review): variable declarations and some branch bodies are
 * missing from this excerpt — extraction artifact.
 */
2346 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2347 MpegEncContext * const s = &h->s;
2358 if(!simple && FRAME_MBAFF){
2360 offset = MB_MBAFF ? 1 : 17;
2361 uvoffset= MB_MBAFF ? 1 : 9;
2365 top_idx = MB_MBAFF ? 0 : 1;
2367 step= MB_MBAFF ? 2 : 1;
/* mode 2: only deblock against neighbours in the same slice */
2370 if(h->deblocking_filter == 2) {
2372 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2373 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2375 deblock_left = (s->mb_x > 0);
2376 deblock_top = (s->mb_y > !!MB_FIELD);
/* back up to the pixel above-left of the macroblock */
2379 src_y -= linesize + 1;
2380 src_cb -= uvlinesize + 1;
2381 src_cr -= uvlinesize + 1;
2383 #define XCHG(a,b,t,xchg)\
/* left luma column: conditionally skip row 0 when the top edge is not deblocked */
2390 for(i = !deblock_top; i<16; i++){
2391 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2393 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
/* top luma row (two 8-byte halves), plus first 8 bytes of the next MB's row */
2397 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2398 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2399 if(s->mb_x+1 < s->mb_width){
2400 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2404 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2406 for(i = !deblock_top; i<8; i++){
2407 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2408 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2410 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2411 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2414 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2415 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
/**
 * Decode (reconstruct) one macroblock: intra prediction or motion
 * compensation, inverse transform + residual add, and deblocking
 * bookkeeping. `simple`=1 compiles a fast path with MBAFF/PCM/gray/
 * lossless/SVQ3 handling removed (callers guarantee those cases do
 * not occur). Shared between H.264 and SVQ3 via the is_h264 flag.
 * NOTE(review): many lines (else-branches, loop headers, closing
 * braces) are missing from this excerpt — extraction artifact.
 */
2420 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2421 MpegEncContext * const s = &h->s;
2422 const int mb_x= s->mb_x;
2423 const int mb_y= s->mb_y;
2424 const int mb_xy= h->mb_xy;
2425 const int mb_type= s->current_picture.mb_type[mb_xy];
2426 uint8_t *dest_y, *dest_cb, *dest_cr;
2427 int linesize, uvlinesize /*dct_offset*/;
2429 int *block_offset = &h->block_offset[0];
/* lossless mode: qscale==0 with SPS transform_bypass set */
2430 const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2431 /* is_h264 should always be true if SVQ3 is disabled. */
2432 const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2433 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2434 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2436 dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
2437 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2438 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2440 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2441 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
/* field macroblock: double strides, use field block offsets, and for the
 * bottom field rewind dest pointers to interleave with the top field */
2443 if (!simple && MB_FIELD) {
2444 linesize = h->mb_linesize = s->linesize * 2;
2445 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2446 block_offset = &h->block_offset[24];
2447 if(mb_y&1){ //FIXME move out of this function?
2448 dest_y -= s->linesize*15;
2449 dest_cb-= s->uvlinesize*7;
2450 dest_cr-= s->uvlinesize*7;
/* remap ref_cache indices to per-field reference numbering */
2454 for(list=0; list<h->list_count; list++){
2455 if(!USES_LIST(mb_type, list))
2457 if(IS_16X16(mb_type)){
2458 int8_t *ref = &h->ref_cache[list][scan8[0]];
2459 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2461 for(i=0; i<16; i+=4){
2462 int ref = h->ref_cache[list][scan8[i]];
2464 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2470 linesize = h->mb_linesize = s->linesize;
2471 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2472 // dct_offset = s->linesize * 16;
/* IPCM macroblock: raw samples were stored in h->mb, just copy them out */
2475 if (!simple && IS_INTRA_PCM(mb_type)) {
2476 for (i=0; i<16; i++) {
2477 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2479 for (i=0; i<8; i++) {
2480 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2481 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2484 if(IS_INTRA(mb_type)){
/* swap in unfiltered borders so intra prediction ignores deblocking */
2485 if(h->deblocking_filter)
2486 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2488 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2489 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2490 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2493 if(IS_INTRA4x4(mb_type)){
2494 if(simple || !s->encoding){
2495 if(IS_8x8DCT(mb_type)){
2496 if(transform_bypass){
2498 idct_add = s->dsp.add_pixels8;
2500 idct_dc_add = s->dsp.h264_idct8_dc_add;
2501 idct_add = s->dsp.h264_idct8_add;
/* 8x8 intra: predict each of the four 8x8 blocks, then add residual */
2503 for(i=0; i<16; i+=4){
2504 uint8_t * const ptr= dest_y + block_offset[i];
2505 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
/* High 4:4:4 (profile 244) lossless: fused predict+add variants */
2506 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2507 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2509 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2510 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2511 (h->topright_samples_available<<i)&0x4000, linesize);
/* DC-only block: use the cheaper dc_add */
2513 if(nnz == 1 && h->mb[i*16])
2514 idct_dc_add(ptr, h->mb + i*16, linesize);
2516 idct_add (ptr, h->mb + i*16, linesize);
2521 if(transform_bypass){
2523 idct_add = s->dsp.add_pixels4;
2525 idct_dc_add = s->dsp.h264_idct_dc_add;
2526 idct_add = s->dsp.h264_idct_add;
/* 4x4 intra: predict and add residual per 4x4 block in scan order */
2528 for(i=0; i<16; i++){
2529 uint8_t * const ptr= dest_y + block_offset[i];
2530 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2532 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2533 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
/* modes needing top-right samples: replicate edge pixel when unavailable */
2537 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2538 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2539 assert(mb_y || linesize <= block_offset[i]);
2540 if(!topright_avail){
2541 tr= ptr[3 - linesize]*0x01010101;
2542 topright= (uint8_t*) &tr;
2544 topright= ptr + 4 - linesize;
2548 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2549 nnz = h->non_zero_count_cache[ scan8[i] ];
2552 if(nnz == 1 && h->mb[i*16])
2553 idct_dc_add(ptr, h->mb + i*16, linesize);
2555 idct_add (ptr, h->mb + i*16, linesize);
/* SVQ3 path uses its own IDCT with per-block qscale */
2557 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
/* 16x16 intra: full-MB luma prediction plus DC transform */
2564 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2566 if(!transform_bypass)
2567 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2569 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2571 if(h->deblocking_filter)
2572 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
/* inter macroblock: motion compensation via the qpel/chroma MC tables */
2574 hl_motion(h, dest_y, dest_cb, dest_cr,
2575 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2576 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2577 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
/* add luma residuals (intra4x4 already added them above) */
2581 if(!IS_INTRA4x4(mb_type)){
2583 if(IS_INTRA16x16(mb_type)){
2584 if(transform_bypass){
2585 if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2586 h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2588 for(i=0; i<16; i++){
2589 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2590 s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2594 s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
/* cbp&15: at least one luma 8x8 group has coefficients */
2596 }else if(h->cbp&15){
2597 if(transform_bypass){
2598 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2599 idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2600 for(i=0; i<16; i+=di){
2601 if(h->non_zero_count_cache[ scan8[i] ]){
2602 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2606 if(IS_8x8DCT(mb_type)){
2607 s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2609 s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2614 for(i=0; i<16; i++){
2615 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2616 uint8_t * const ptr= dest_y + block_offset[i];
2617 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
/* chroma residuals (cbp&0x30: chroma coefficients present) */
2623 if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2624 uint8_t *dest[2] = {dest_cb, dest_cr};
2625 if(transform_bypass){
2626 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2627 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2628 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2630 idct_add = s->dsp.add_pixels4;
2631 for(i=16; i<16+8; i++){
2632 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2633 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
/* chroma DC uses its own 2x2 transform with separate intra/inter dequant */
2637 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2638 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2640 idct_add = s->dsp.h264_idct_add;
2641 idct_dc_add = s->dsp.h264_idct_dc_add;
2642 for(i=16; i<16+8; i++){
2643 if(h->non_zero_count_cache[ scan8[i] ])
2644 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2645 else if(h->mb[i*16])
2646 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2649 for(i=16; i<16+8; i++){
2650 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2651 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2652 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2659 if(h->cbp || IS_INTRA(mb_type))
2660 s->dsp.clear_blocks(h->mb);
/* deblocking: back up pre-filter borders, refresh caches/QPs, then filter */
2662 if(h->deblocking_filter) {
2663 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2664 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2665 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2666 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2667 if (!simple && FRAME_MBAFF) {
2668 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2670 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2676 * Process a macroblock; this case avoids checks for expensive uncommon cases.
/**
 * Fast-path macroblock reconstruction: specialization of
 * hl_decode_mb_internal() with simple=1 (no MBAFF/PCM/gray/SVQ3 checks).
 */
2678 static void hl_decode_mb_simple(H264Context *h){
2679 hl_decode_mb_internal(h, 1);
2683 * Process a macroblock; this handles edge cases, such as interlacing.
/**
 * Slow-path macroblock reconstruction handling all edge cases
 * (interlacing, PCM, gray, lossless, SVQ3); av_noinline keeps the
 * large specialization out of the fast path's code size.
 */
2685 static void av_noinline hl_decode_mb_complex(H264Context *h){
2686 hl_decode_mb_internal(h, 0);
/**
 * Dispatch macroblock reconstruction to the simple or complex path
 * depending on stream features (MBAFF/gray/etc.), PCM macroblocks,
 * and lossless mode (qscale==0). CONFIG_SMALL builds always take the
 * complex path to avoid duplicating the inlined internal function.
 */
2689 static void hl_decode_mb(H264Context *h){
2690 MpegEncContext * const s = &h->s;
2691 const int mb_xy= h->mb_xy;
2692 const int mb_type= s->current_picture.mb_type[mb_xy];
2693 int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2696 hl_decode_mb_complex(h);
2697 else hl_decode_mb_simple(h);
/**
 * Convert a frame Picture in place into a single-field view:
 * doubles linesizes, offsets data pointers for the bottom field,
 * and sets reference/poc to the requested field parity.
 */
2700 static void pic_as_field(Picture *pic, const int parity){
2702 for (i = 0; i < 4; ++i) {
/* bottom field starts one line down in the interleaved frame */
2703 if (parity == PICT_BOTTOM_FIELD)
2704 pic->data[i] += pic->linesize[i];
2705 pic->reference = parity;
2706 pic->linesize[i] *= 2;
2708 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
/**
 * Copy src into dest if src is a reference of the requested parity;
 * field parities additionally convert dest to a field picture and
 * bump pic_id by id_add. Returns nonzero when a copy was made.
 * NOTE(review): the copy statement itself is missing from this
 * excerpt — extraction artifact.
 */
2711 static int split_field_copy(Picture *dest, Picture *src,
2712 int parity, int id_add){
2713 int match = !!(src->reference & parity);
2717 if(parity != PICT_FRAME){
2718 pic_as_field(dest, parity);
2720 dest->pic_id += id_add;
/**
 * Build a default reference list segment from `in`: interleave
 * references of the selected field parity (sel) with the opposite
 * parity (sel^3), assigning pic_id from long-term index or frame_num.
 * Returns the number of entries written.
 * NOTE(review): i[0]/i[1]/index initialization and the return are
 * missing from this excerpt — extraction artifact.
 */
2727 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2731 while(i[0]<len || i[1]<len){
/* advance each cursor to the next picture referencing the wanted parity */
2732 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2734 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2737 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2738 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2741 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2742 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
/**
 * Selection-sort short-term references by POC into `sorted`:
 * dir=0 picks POCs below `limit` in descending order, dir=1 picks
 * POCs above `limit` in ascending order (B-frame list ordering).
 * Returns the number of pictures appended.
 */
2749 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2754 best_poc= dir ? INT_MIN : INT_MAX;
2756 for(i=0; i<len; i++){
2757 const int poc= src[i]->poc;
/* XOR with dir flips both comparisons for the ascending case */
2758 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2760 sorted[out_i]= src[i];
/* sentinel unchanged -> no candidate found this pass, done */
2763 if(best_poc == (dir ? INT_MIN : INT_MAX))
2765 limit= sorted[out_i++]->poc - dir;
2771 * fills the default_ref_list.
/**
 * Fill h->default_ref_list per the H.264 spec initialization process:
 * P slices order short-term refs by frame_num then long-term by index;
 * B slices order short-term refs by POC distance (past/future swapped
 * between L0 and L1), and swap L1's first two entries if the lists
 * would otherwise be identical.
 */
2773 static int fill_default_ref_list(H264Context *h){
2774 MpegEncContext * const s = &h->s;
2777 if(h->slice_type_nos==FF_B_TYPE){
2778 Picture *sorted[32];
/* current POC: field POC when decoding a field, frame POC otherwise */
2783 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2785 cur_poc= s->current_picture_ptr->poc;
2787 for(list= 0; list<2; list++){
/* L0: past refs first (descending), then future; L1: the reverse */
2788 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2789 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2791 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2792 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2795 if(len < h->ref_count[list])
2796 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
/* spec: if L0 and L1 are identical and have >1 entry, swap L1[0] and L1[1] */
2800 if(lens[0] == lens[1] && lens[1] > 1){
2801 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2803 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
/* P/SP slices: short-term by frame_num order, then long-term */
2806 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2807 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2809 if(len < h->ref_count[0])
2810 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2813 for (i=0; i<h->ref_count[0]; i++) {
2814 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2816 if(h->slice_type_nos==FF_B_TYPE){
2817 for (i=0; i<h->ref_count[1]; i++) {
2818 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2825 static void print_short_term(H264Context *h);
2826 static void print_long_term(H264Context *h);
2829 * Extract structure information about the picture described by pic_num in
2830 * the current decoding context (frame or field). Note that pic_num is
2831 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2832 * @param pic_num picture number for which to extract structure information
2833 * @param structure one of PICT_XXX describing structure of picture
2835 * @return frame number (short term) or long term index of picture
2836 * described by pic_num
/**
 * Extract structure information about the picture described by pic_num.
 * In field decoding the LSB of pic_num selects same/opposite field
 * parity; returns the frame number (short term) or long-term index.
 * NOTE(review): the pic_num>>=1 / condition lines are missing from
 * this excerpt — extraction artifact.
 */
2838 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2839 MpegEncContext * const s = &h->s;
2841 *structure = s->picture_structure;
2844 /* opposite field */
2845 *structure ^= PICT_FRAME;
/**
 * Parse ref_pic_list_reordering() from the slice header and apply it:
 * starting from the default lists, move the signalled short-term
 * (idc 0/1) or long-term (idc 2) pictures to the front of each list.
 * Ends on idc==3. Returns 0 on success, -1 on bitstream errors.
 * Missing references are replaced by default_ref_list[.][0] as a
 * best-effort recovery.
 */
2852 static int decode_ref_pic_list_reordering(H264Context *h){
2853 MpegEncContext * const s = &h->s;
2854 int list, index, pic_structure;
2856 print_short_term(h);
2859 for(list=0; list<h->list_count; list++){
/* start from the default list, then reorder in place */
2860 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
/* ref_pic_list_reordering_flag_lX */
2862 if(get_bits1(&s->gb)){
2863 int pred= h->curr_pic_num;
2865 for(index=0; ; index++){
2866 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2867 unsigned int pic_id;
2869 Picture *ref = NULL;
/* idc 3 terminates the reordering commands */
2871 if(reordering_of_pic_nums_idc==3)
2874 if(index >= h->ref_count[list]){
2875 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2879 if(reordering_of_pic_nums_idc<3){
2880 if(reordering_of_pic_nums_idc<2){
/* short-term: idc 0 subtracts, idc 1 adds abs_diff_pic_num (mod max_pic_num) */
2881 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2884 if(abs_diff_pic_num > h->max_pic_num){
2885 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2889 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2890 else pred+= abs_diff_pic_num;
2891 pred &= h->max_pic_num - 1;
2893 frame_num = pic_num_extract(h, pred, &pic_structure);
/* search short-term list newest-first for a matching frame_num/parity */
2895 for(i= h->short_ref_count-1; i>=0; i--){
2896 ref = h->short_ref[i];
2897 assert(ref->reference);
2898 assert(!ref->long_ref);
2900 ref->frame_num == frame_num &&
2901 (ref->reference & pic_structure)
/* long-term: idc 2, indexed directly by long_term_pic_idx */
2909 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2911 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2914 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2917 ref = h->long_ref[long_idx];
2918 assert(!(ref && !ref->reference));
2919 if(ref && (ref->reference & pic_structure)){
2920 ref->pic_id= pic_id;
2921 assert(ref->long_ref);
2929 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2930 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
/* shift list entries down and insert the found reference at `index` */
2932 for(i=index; i+1<h->ref_count[list]; i++){
2933 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2936 for(; i > index; i--){
2937 h->ref_list[list][i]= h->ref_list[list][i-1];
2939 h->ref_list[list][index]= *ref;
2941 pic_as_field(&h->ref_list[list][index], pic_structure);
2945 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
/* fill any holes with the first default reference so decoding can continue */
2951 for(list=0; list<h->list_count; list++){
2952 for(index= 0; index < h->ref_count[list]; index++){
2953 if(!h->ref_list[list][index].data[0]){
2954 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2955 if(h->default_ref_list[list][0].data[0])
2956 h->ref_list[list][index]= h->default_ref_list[list][0];
/**
 * For MBAFF frames, derive per-field reference entries: each frame
 * reference at ref_list[i] gets a top-field copy at [16+2*i] and a
 * bottom-field copy at [16+2*i+1], with matching weight/offset tables
 * duplicated for both fields.
 */
2966 static void fill_mbaff_ref_list(H264Context *h){
2968 for(list=0; list<2; list++){ //FIXME try list_count
2969 for(i=0; i<h->ref_count[list]; i++){
2970 Picture *frame = &h->ref_list[list][i];
2971 Picture *field = &h->ref_list[list][16+2*i];
/* top field: same data pointers, doubled linesize */
2974 field[0].linesize[j] <<= 1;
2975 field[0].reference = PICT_TOP_FIELD;
2976 field[0].poc= field[0].field_poc[0];
/* bottom field: clone top then offset data by one frame line */
2977 field[1] = field[0];
2979 field[1].data[j] += frame->linesize[j];
2980 field[1].reference = PICT_BOTTOM_FIELD;
2981 field[1].poc= field[1].field_poc[1];
/* both field entries reuse the frame's explicit weights/offsets */
2983 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2984 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2986 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2987 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* implicit bipred weights duplicated likewise for the field rows */
2991 for(j=0; j<h->ref_count[1]; j++){
2992 for(i=0; i<h->ref_count[0]; i++)
2993 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2994 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2995 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2999 static int pred_weight_table(H264Context *h){
3000 MpegEncContext * const s = &h->s;
3002 int luma_def, chroma_def;
3005 h->use_weight_chroma= 0;
3006 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3007 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3008 luma_def = 1<<h->luma_log2_weight_denom;
3009 chroma_def = 1<<h->chroma_log2_weight_denom;
3011 for(list=0; list<2; list++){
3012 h->luma_weight_flag[list] = 0;
3013 h->chroma_weight_flag[list] = 0;
3014 for(i=0; i<h->ref_count[list]; i++){
3015 int luma_weight_flag, chroma_weight_flag;
3017 luma_weight_flag= get_bits1(&s->gb);
3018 if(luma_weight_flag){
3019 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3020 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3021 if( h->luma_weight[list][i] != luma_def
3022 || h->luma_offset[list][i] != 0) {
3024 h->luma_weight_flag[list]= 1;
3027 h->luma_weight[list][i]= luma_def;
3028 h->luma_offset[list][i]= 0;
3032 chroma_weight_flag= get_bits1(&s->gb);
3033 if(chroma_weight_flag){
3036 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3037 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3038 if( h->chroma_weight[list][i][j] != chroma_def
3039 || h->chroma_offset[list][i][j] != 0) {
3040 h->use_weight_chroma= 1;
3041 h->chroma_weight_flag[list]= 1;
3047 h->chroma_weight[list][i][j]= chroma_def;
3048 h->chroma_offset[list][i][j]= 0;
3053 if(h->slice_type_nos != FF_B_TYPE) break;
3055 h->use_weight= h->use_weight || h->use_weight_chroma;
3059 static void implicit_weight_table(H264Context *h){
3060 MpegEncContext * const s = &h->s;
3062 int cur_poc = s->current_picture_ptr->poc;
3064 for (i = 0; i < 2; i++) {
3065 h->luma_weight_flag[i] = 0;
3066 h->chroma_weight_flag[i] = 0;
3069 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3070 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3072 h->use_weight_chroma= 0;
3077 h->use_weight_chroma= 2;
3078 h->luma_log2_weight_denom= 5;
3079 h->chroma_log2_weight_denom= 5;
3081 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3082 int poc0 = h->ref_list[0][ref0].poc;
3083 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3084 int poc1 = h->ref_list[1][ref1].poc;
3085 int td = av_clip(poc1 - poc0, -128, 127);
3087 int tb = av_clip(cur_poc - poc0, -128, 127);
3088 int tx = (16384 + (FFABS(td) >> 1)) / td;
3089 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3090 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3091 h->implicit_weight[ref0][ref1] = 32;
3093 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3095 h->implicit_weight[ref0][ref1] = 32;
3101 * Mark a picture as no longer needed for reference. The refmask
3102 * argument allows unreferencing of individual fields or the whole frame.
3103 * If the picture becomes entirely unreferenced, but is being held for
3104 * display purposes, it is marked as such.
3105 * @param refmask mask of fields to unreference; the mask is bitwise
3106 * anded with the reference marking of pic
3107 * @return non-zero if pic becomes entirely unreferenced (except possibly
3108 * for display purposes) zero if one of the fields remains in
3111 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3113 if (pic->reference &= refmask) {
3116 for(i = 0; h->delayed_pic[i]; i++)
3117 if(pic == h->delayed_pic[i]){
3118 pic->reference=DELAYED_PIC_REF;
3126 * instantaneous decoder refresh.
3128 static void idr(H264Context *h){
3131 for(i=0; i<16; i++){
3132 remove_long(h, i, 0);
3134 assert(h->long_ref_count==0);
3136 for(i=0; i<h->short_ref_count; i++){
3137 unreference_pic(h, h->short_ref[i], 0);
3138 h->short_ref[i]= NULL;
3140 h->short_ref_count=0;
3141 h->prev_frame_num= 0;
3142 h->prev_frame_num_offset= 0;
3147 /* forget old pics after a seek */
3148 static void flush_dpb(AVCodecContext *avctx){
3149 H264Context *h= avctx->priv_data;
3151 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3152 if(h->delayed_pic[i])
3153 h->delayed_pic[i]->reference= 0;
3154 h->delayed_pic[i]= NULL;
3156 h->outputed_poc= INT_MIN;
3157 h->prev_interlaced_frame = 1;
3159 if(h->s.current_picture_ptr)
3160 h->s.current_picture_ptr->reference= 0;
3161 h->s.first_field= 0;
3163 ff_mpeg_flush(avctx);
3167 * Find a Picture in the short term reference list by frame number.
3168 * @param frame_num frame number to search for
3169 * @param idx the index into h->short_ref where returned picture is found
3170 * undefined if no picture found.
3171 * @return pointer to the found picture, or NULL if no pic with the provided
3172 * frame number is found
3174 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3175 MpegEncContext * const s = &h->s;
3178 for(i=0; i<h->short_ref_count; i++){
3179 Picture *pic= h->short_ref[i];
3180 if(s->avctx->debug&FF_DEBUG_MMCO)
3181 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3182 if(pic->frame_num == frame_num) {
3191 * Remove a picture from the short term reference list by its index in
3192 * that list. This does no checking on the provided index; it is assumed
3193 * to be valid. Other list entries are shifted down.
3194 * @param i index into h->short_ref of picture to remove.
3196 static void remove_short_at_index(H264Context *h, int i){
3197 assert(i >= 0 && i < h->short_ref_count);
3198 h->short_ref[i]= NULL;
3199 if (--h->short_ref_count)
3200 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3205 * @return the removed picture or NULL if an error occurs
3207 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3208 MpegEncContext * const s = &h->s;
3212 if(s->avctx->debug&FF_DEBUG_MMCO)
3213 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3215 pic = find_short(h, frame_num, &i);
3217 if(unreference_pic(h, pic, ref_mask))
3218 remove_short_at_index(h, i);
3225 * Remove a picture from the long term reference list by its index in
3227 * @return the removed picture or NULL if an error occurs
3229 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3232 pic= h->long_ref[i];
3234 if(unreference_pic(h, pic, ref_mask)){
3235 assert(h->long_ref[i]->long_ref == 1);
3236 h->long_ref[i]->long_ref= 0;
3237 h->long_ref[i]= NULL;
3238 h->long_ref_count--;
3246 * print short term list
3248 static void print_short_term(H264Context *h) {
3250 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3251 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3252 for(i=0; i<h->short_ref_count; i++){
3253 Picture *pic= h->short_ref[i];
3254 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3260 * print long term list
3262 static void print_long_term(H264Context *h) {
3264 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3265 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3266 for(i = 0; i < 16; i++){
3267 Picture *pic= h->long_ref[i];
3269 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3276 * Executes the reference picture marking (memory management control operations).
3278 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3279 MpegEncContext * const s = &h->s;
3280 int i, av_uninit(j);
3281 int current_ref_assigned=0;
3282 Picture *av_uninit(pic);
3284 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3285 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3287 for(i=0; i<mmco_count; i++){
3288 int av_uninit(structure), av_uninit(frame_num);
3289 if(s->avctx->debug&FF_DEBUG_MMCO)
3290 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3292 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3293 || mmco[i].opcode == MMCO_SHORT2LONG){
3294 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3295 pic = find_short(h, frame_num, &j);
3297 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3298 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3299 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3304 switch(mmco[i].opcode){
3305 case MMCO_SHORT2UNUSED:
3306 if(s->avctx->debug&FF_DEBUG_MMCO)
3307 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3308 remove_short(h, frame_num, structure ^ PICT_FRAME);
3310 case MMCO_SHORT2LONG:
3311 if (h->long_ref[mmco[i].long_arg] != pic)
3312 remove_long(h, mmco[i].long_arg, 0);
3314 remove_short_at_index(h, j);
3315 h->long_ref[ mmco[i].long_arg ]= pic;
3316 if (h->long_ref[ mmco[i].long_arg ]){
3317 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3318 h->long_ref_count++;
3321 case MMCO_LONG2UNUSED:
3322 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3323 pic = h->long_ref[j];
3325 remove_long(h, j, structure ^ PICT_FRAME);
3326 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3327 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3330 // Comment below left from previous code as it is an interresting note.
3331 /* First field in pair is in short term list or
3332 * at a different long term index.
3333 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3334 * Report the problem and keep the pair where it is,
3335 * and mark this field valid.
3338 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3339 remove_long(h, mmco[i].long_arg, 0);
3341 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3342 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3343 h->long_ref_count++;
3346 s->current_picture_ptr->reference |= s->picture_structure;
3347 current_ref_assigned=1;
3349 case MMCO_SET_MAX_LONG:
3350 assert(mmco[i].long_arg <= 16);
3351 // just remove the long term which index is greater than new max
3352 for(j = mmco[i].long_arg; j<16; j++){
3353 remove_long(h, j, 0);
3357 while(h->short_ref_count){
3358 remove_short(h, h->short_ref[0]->frame_num, 0);
3360 for(j = 0; j < 16; j++) {
3361 remove_long(h, j, 0);
3363 s->current_picture_ptr->poc=
3364 s->current_picture_ptr->field_poc[0]=
3365 s->current_picture_ptr->field_poc[1]=
3369 s->current_picture_ptr->frame_num= 0;
3370 s->current_picture_ptr->mmco_reset=1;
3376 if (!current_ref_assigned) {
3377 /* Second field of complementary field pair; the first field of
3378 * which is already referenced. If short referenced, it
3379 * should be first entry in short_ref. If not, it must exist
3380 * in long_ref; trying to put it on the short list here is an
3381 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3383 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3384 /* Just mark the second field valid */
3385 s->current_picture_ptr->reference = PICT_FRAME;
3386 } else if (s->current_picture_ptr->long_ref) {
3387 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3388 "assignment for second field "
3389 "in complementary field pair "
3390 "(first field is long term)\n");
3392 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3394 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3397 if(h->short_ref_count)
3398 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3400 h->short_ref[0]= s->current_picture_ptr;
3401 h->short_ref_count++;
3402 s->current_picture_ptr->reference |= s->picture_structure;
3406 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3408 /* We have too many reference frames, probably due to corrupted
3409 * stream. Need to discard one frame. Prevents overrun of the
3410 * short_ref and long_ref buffers.
3412 av_log(h->s.avctx, AV_LOG_ERROR,
3413 "number of reference frames exceeds max (probably "
3414 "corrupt input), discarding one\n");
3416 if (h->long_ref_count && !h->short_ref_count) {
3417 for (i = 0; i < 16; ++i)
3422 remove_long(h, i, 0);
3424 pic = h->short_ref[h->short_ref_count - 1];
3425 remove_short(h, pic->frame_num, 0);
3429 print_short_term(h);
3434 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3435 MpegEncContext * const s = &h->s;
3439 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3440 s->broken_link= get_bits1(gb) -1;
3442 h->mmco[0].opcode= MMCO_LONG;
3443 h->mmco[0].long_arg= 0;
3447 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3448 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3449 MMCOOpcode opcode= get_ue_golomb_31(gb);
3451 h->mmco[i].opcode= opcode;
3452 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3453 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3454 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3455 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3459 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3460 unsigned int long_arg= get_ue_golomb_31(gb);
3461 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3462 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3465 h->mmco[i].long_arg= long_arg;
3468 if(opcode > (unsigned)MMCO_LONG){
3469 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3472 if(opcode == MMCO_END)
3477 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3479 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3480 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3481 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3482 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3484 if (FIELD_PICTURE) {
3485 h->mmco[0].short_pic_num *= 2;
3486 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3487 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3497 static int init_poc(H264Context *h){
3498 MpegEncContext * const s = &h->s;
3499 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3501 Picture *cur = s->current_picture_ptr;
3503 h->frame_num_offset= h->prev_frame_num_offset;
3504 if(h->frame_num < h->prev_frame_num)
3505 h->frame_num_offset += max_frame_num;
3507 if(h->sps.poc_type==0){
3508 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3510 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3511 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3512 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3513 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3515 h->poc_msb = h->prev_poc_msb;
3516 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3518 field_poc[1] = h->poc_msb + h->poc_lsb;
3519 if(s->picture_structure == PICT_FRAME)
3520 field_poc[1] += h->delta_poc_bottom;
3521 }else if(h->sps.poc_type==1){
3522 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3525 if(h->sps.poc_cycle_length != 0)
3526 abs_frame_num = h->frame_num_offset + h->frame_num;
3530 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3533 expected_delta_per_poc_cycle = 0;
3534 for(i=0; i < h->sps.poc_cycle_length; i++)
3535 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3537 if(abs_frame_num > 0){
3538 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3539 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3541 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3542 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3543 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3547 if(h->nal_ref_idc == 0)
3548 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3550 field_poc[0] = expectedpoc + h->delta_poc[0];
3551 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3553 if(s->picture_structure == PICT_FRAME)
3554 field_poc[1] += h->delta_poc[1];
3556 int poc= 2*(h->frame_num_offset + h->frame_num);
3565 if(s->picture_structure != PICT_BOTTOM_FIELD)
3566 s->current_picture_ptr->field_poc[0]= field_poc[0];
3567 if(s->picture_structure != PICT_TOP_FIELD)
3568 s->current_picture_ptr->field_poc[1]= field_poc[1];
3569 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3576 * initialize scan tables
3578 static void init_scan_tables(H264Context *h){
3579 MpegEncContext * const s = &h->s;
3581 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3582 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3583 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3585 for(i=0; i<16; i++){
3586 #define T(x) (x>>2) | ((x<<2) & 0xF)
3587 h->zigzag_scan[i] = T(zigzag_scan[i]);
3588 h-> field_scan[i] = T( field_scan[i]);
3592 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3593 memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
3594 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3595 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3596 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3598 for(i=0; i<64; i++){
3599 #define T(x) (x>>3) | ((x&7)<<3)
3600 h->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]);
3601 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3602 h->field_scan8x8[i] = T(field_scan8x8[i]);
3603 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3607 if(h->sps.transform_bypass){ //FIXME same ugly
3608 h->zigzag_scan_q0 = zigzag_scan;
3609 h->zigzag_scan8x8_q0 = ff_zigzag_direct;
3610 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3611 h->field_scan_q0 = field_scan;
3612 h->field_scan8x8_q0 = field_scan8x8;
3613 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3615 h->zigzag_scan_q0 = h->zigzag_scan;
3616 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3617 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3618 h->field_scan_q0 = h->field_scan;
3619 h->field_scan8x8_q0 = h->field_scan8x8;
3620 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3624 static void field_end(H264Context *h){
3625 MpegEncContext * const s = &h->s;
3626 AVCodecContext * const avctx= s->avctx;
3629 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3630 s->current_picture_ptr->pict_type= s->pict_type;
3632 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3633 ff_vdpau_h264_set_reference_frames(s);
3636 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3637 h->prev_poc_msb= h->poc_msb;
3638 h->prev_poc_lsb= h->poc_lsb;
3640 h->prev_frame_num_offset= h->frame_num_offset;
3641 h->prev_frame_num= h->frame_num;
3643 if (avctx->hwaccel) {
3644 if (avctx->hwaccel->end_frame(avctx) < 0)
3645 av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3648 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3649 ff_vdpau_h264_picture_complete(s);
3652 * FIXME: Error handling code does not seem to support interlaced
3653 * when slices span multiple rows
3654 * The ff_er_add_slice calls don't work right for bottom
3655 * fields; they cause massive erroneous error concealing
3656 * Error marking covers both fields (top and bottom).
3657 * This causes a mismatched s->error_count
3658 * and a bad error table. Further, the error count goes to
3659 * INT_MAX when called for bottom field, because mb_y is
3660 * past end by one (callers fault) and resync_mb_y != 0
3661 * causes problems for the first MB line, too.
3672 * Replicates H264 "master" context to thread contexts.
3674 static void clone_slice(H264Context *dst, H264Context *src)
3676 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3677 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3678 dst->s.current_picture = src->s.current_picture;
3679 dst->s.linesize = src->s.linesize;
3680 dst->s.uvlinesize = src->s.uvlinesize;
3681 dst->s.first_field = src->s.first_field;
3683 dst->prev_poc_msb = src->prev_poc_msb;
3684 dst->prev_poc_lsb = src->prev_poc_lsb;
3685 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3686 dst->prev_frame_num = src->prev_frame_num;
3687 dst->short_ref_count = src->short_ref_count;
3689 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3690 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3691 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3692 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3694 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3695 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3699 * decodes a slice header.
3700 * This will also call MPV_common_init() and frame_start() as needed.
3702 * @param h h264context
3703 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3705 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3707 static int decode_slice_header(H264Context *h, H264Context *h0){
3708 MpegEncContext * const s = &h->s;
3709 MpegEncContext * const s0 = &h0->s;
3710 unsigned int first_mb_in_slice;
3711 unsigned int pps_id;
3712 int num_ref_idx_active_override_flag;
3713 unsigned int slice_type, tmp, i, j;
3714 int default_ref_list_done = 0;
3715 int last_pic_structure;
3717 s->dropable= h->nal_ref_idc == 0;
3719 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3720 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3721 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3723 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3724 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3727 first_mb_in_slice= get_ue_golomb(&s->gb);
3729 if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3730 if(h0->current_slice && FIELD_PICTURE){
3734 h0->current_slice = 0;
3735 if (!s0->first_field)
3736 s->current_picture_ptr= NULL;
3739 slice_type= get_ue_golomb_31(&s->gb);
3741 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3746 h->slice_type_fixed=1;
3748 h->slice_type_fixed=0;
3750 slice_type= golomb_to_pict_type[ slice_type ];
3751 if (slice_type == FF_I_TYPE
3752 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3753 default_ref_list_done = 1;
3755 h->slice_type= slice_type;
3756 h->slice_type_nos= slice_type & 3;
3758 s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3759 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3760 av_log(h->s.avctx, AV_LOG_ERROR,
3761 "B picture before any references, skipping\n");
3765 pps_id= get_ue_golomb(&s->gb);
3766 if(pps_id>=MAX_PPS_COUNT){
3767 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3770 if(!h0->pps_buffers[pps_id]) {
3771 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3774 h->pps= *h0->pps_buffers[pps_id];
3776 if(!h0->sps_buffers[h->pps.sps_id]) {
3777 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3780 h->sps = *h0->sps_buffers[h->pps.sps_id];
3782 if(h == h0 && h->dequant_coeff_pps != pps_id){
3783 h->dequant_coeff_pps = pps_id;
3784 init_dequant_tables(h);
3787 s->mb_width= h->sps.mb_width;
3788 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3790 h->b_stride= s->mb_width*4;
3791 h->b8_stride= s->mb_width*2;
3793 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3794 if(h->sps.frame_mbs_only_flag)
3795 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3797 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3799 if (s->context_initialized
3800 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3802 return -1; // width / height changed during parallelized decoding
3804 flush_dpb(s->avctx);
3807 if (!s->context_initialized) {
3809 return -1; // we cant (re-)initialize context during parallel decoding
3811 avcodec_set_dimensions(s->avctx, s->width, s->height);
3812 s->avctx->sample_aspect_ratio= h->sps.sar;
3813 if(!s->avctx->sample_aspect_ratio.den)
3814 s->avctx->sample_aspect_ratio.den = 1;
3816 if(h->sps.timing_info_present_flag){
3817 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3818 if(h->x264_build > 0 && h->x264_build < 44)
3819 s->avctx->time_base.den *= 2;
3820 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3821 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3823 s->avctx->pix_fmt = s->avctx->get_format(s->avctx, s->avctx->codec->pix_fmts);
3824 s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
3826 if (MPV_common_init(s) < 0)
3829 h->prev_interlaced_frame = 1;
3831 init_scan_tables(h);
3834 for(i = 1; i < s->avctx->thread_count; i++) {
3836 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3837 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3838 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3841 init_scan_tables(c);
3845 for(i = 0; i < s->avctx->thread_count; i++)
3846 if(context_init(h->thread_context[i]) < 0)
3850 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3853 h->mb_aff_frame = 0;
3854 last_pic_structure = s0->picture_structure;
3855 if(h->sps.frame_mbs_only_flag){
3856 s->picture_structure= PICT_FRAME;
3858 if(get_bits1(&s->gb)) { //field_pic_flag
3859 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3861 s->picture_structure= PICT_FRAME;
3862 h->mb_aff_frame = h->sps.mb_aff;
3865 h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3867 if(h0->current_slice == 0){
3868 while(h->frame_num != h->prev_frame_num &&
3869 h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3870 av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3871 if (frame_start(h) < 0)
3873 h->prev_frame_num++;
3874 h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3875 s->current_picture_ptr->frame_num= h->prev_frame_num;
3876 execute_ref_pic_marking(h, NULL, 0);
3879 /* See if we have a decoded first field looking for a pair... */
3880 if (s0->first_field) {
3881 assert(s0->current_picture_ptr);
3882 assert(s0->current_picture_ptr->data[0]);
3883 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3885 /* figure out if we have a complementary field pair */
3886 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3888 * Previous field is unmatched. Don't display it, but let it
3889 * remain for reference if marked as such.
3891 s0->current_picture_ptr = NULL;
3892 s0->first_field = FIELD_PICTURE;
3895 if (h->nal_ref_idc &&
3896 s0->current_picture_ptr->reference &&
3897 s0->current_picture_ptr->frame_num != h->frame_num) {
3899 * This and previous field were reference, but had
3900 * different frame_nums. Consider this field first in
3901 * pair. Throw away previous field except for reference
3904 s0->first_field = 1;
3905 s0->current_picture_ptr = NULL;
3908 /* Second field in complementary pair */
3909 s0->first_field = 0;
3914 /* Frame or first field in a potentially complementary pair */
3915 assert(!s0->current_picture_ptr);
3916 s0->first_field = FIELD_PICTURE;
3919 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3920 s0->first_field = 0;
3927 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3929 assert(s->mb_num == s->mb_width * s->mb_height);
3930 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3931 first_mb_in_slice >= s->mb_num){
3932 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3935 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3936 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3937 if (s->picture_structure == PICT_BOTTOM_FIELD)
3938 s->resync_mb_y = s->mb_y = s->mb_y + 1;
3939 assert(s->mb_y < s->mb_height);
3941 if(s->picture_structure==PICT_FRAME){
3942 h->curr_pic_num= h->frame_num;
3943 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3945 h->curr_pic_num= 2*h->frame_num + 1;
3946 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3949 if(h->nal_unit_type == NAL_IDR_SLICE){
3950 get_ue_golomb(&s->gb); /* idr_pic_id */
3953 if(h->sps.poc_type==0){
3954 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3956 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3957 h->delta_poc_bottom= get_se_golomb(&s->gb);
3961 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3962 h->delta_poc[0]= get_se_golomb(&s->gb);
3964 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3965 h->delta_poc[1]= get_se_golomb(&s->gb);
3970 if(h->pps.redundant_pic_cnt_present){
3971 h->redundant_pic_count= get_ue_golomb(&s->gb);
3974 //set defaults, might be overridden a few lines later
3975 h->ref_count[0]= h->pps.ref_count[0];
3976 h->ref_count[1]= h->pps.ref_count[1];
3978 if(h->slice_type_nos != FF_I_TYPE){
3979 if(h->slice_type_nos == FF_B_TYPE){
3980 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3982 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3984 if(num_ref_idx_active_override_flag){
3985 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3986 if(h->slice_type_nos==FF_B_TYPE)
3987 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3989 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3990 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3991 h->ref_count[0]= h->ref_count[1]= 1;
3995 if(h->slice_type_nos == FF_B_TYPE)
4002 if(!default_ref_list_done){
4003 fill_default_ref_list(h);
4006 if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4009 if(h->slice_type_nos!=FF_I_TYPE){
4010 s->last_picture_ptr= &h->ref_list[0][0];
4011 ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4013 if(h->slice_type_nos==FF_B_TYPE){
4014 s->next_picture_ptr= &h->ref_list[1][0];
4015 ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4018 if( (h->pps.weighted_pred && h->slice_type_nos == FF_P_TYPE )
4019 || (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4020 pred_weight_table(h);
4021 else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4022 implicit_weight_table(h);
4025 for (i = 0; i < 2; i++) {
4026 h->luma_weight_flag[i] = 0;
4027 h->chroma_weight_flag[i] = 0;
4032 decode_ref_pic_marking(h0, &s->gb);
4035 fill_mbaff_ref_list(h);
4037 if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4038 direct_dist_scale_factor(h);
4039 direct_ref_list_init(h);
4041 if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4042 tmp = get_ue_golomb_31(&s->gb);
4044 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4047 h->cabac_init_idc= tmp;
4050 h->last_qscale_diff = 0;
4051 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4053 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4057 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4058 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4059 //FIXME qscale / qp ... stuff
4060 if(h->slice_type == FF_SP_TYPE){
4061 get_bits1(&s->gb); /* sp_for_switch_flag */
4063 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4064 get_se_golomb(&s->gb); /* slice_qs_delta */
4067 h->deblocking_filter = 1;
4068 h->slice_alpha_c0_offset = 0;
4069 h->slice_beta_offset = 0;
4070 if( h->pps.deblocking_filter_parameters_present ) {
4071 tmp= get_ue_golomb_31(&s->gb);
4073 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4076 h->deblocking_filter= tmp;
4077 if(h->deblocking_filter < 2)
4078 h->deblocking_filter^= 1; // 1<->0
4080 if( h->deblocking_filter ) {
4081 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4082 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4086 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
4087 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4088 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type_nos == FF_B_TYPE)
4089 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4090 h->deblocking_filter= 0;
4092 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4093 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4094 /* Cheat slightly for speed:
4095 Do not bother to deblock across slices. */
4096 h->deblocking_filter = 2;
4098 h0->max_contexts = 1;
4099 if(!h0->single_decode_warning) {
4100 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4101 h0->single_decode_warning = 1;
4104 return 1; // deblocking switched inside frame
4109 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4110 slice_group_change_cycle= get_bits(&s->gb, ?);
4113 h0->last_slice_type = slice_type;
4114 h->slice_num = ++h0->current_slice;
4115 if(h->slice_num >= MAX_SLICES){
4116 av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4120 int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4124 ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4125 +(h->ref_list[j][i].reference&3);
4128 for(i=16; i<48; i++)
4129 ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4130 +(h->ref_list[j][i].reference&3);
4133 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4134 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4136 s->avctx->refs= h->sps.ref_frame_count;
4138 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4139 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4141 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4143 av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4144 pps_id, h->frame_num,
4145 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4146 h->ref_count[0], h->ref_count[1],
4148 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4150 h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4151 h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4161 static inline int get_level_prefix(GetBitContext *gb){
/* Reads the CAVLC level_prefix syntax element: the count of leading zero
 * bits before the next 1 bit in the bitstream.
 * NOTE(review): this excerpt is missing the local declarations (buf, log),
 * the TRACE conditional around the debug output, and the trailing
 * "return log-1;" — confirm against the complete file. */
4165 OPEN_READER(re, gb);
4166 UPDATE_CACHE(re, gb);
/* peek up to 32 bits without consuming them */
4167 buf=GET_CACHE(re, gb);
/* 32 - av_log2(buf) gives the 1-based position of the first set bit,
 * i.e. number of leading zeros + 1 */
4169 log= 32 - av_log2(buf);
/* debug trace of the bits about to be consumed */
4171 print_bin(buf>>(32-log), log);
4172 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
/* consume the zero run plus the terminating 1 bit */
4175 LAST_SKIP_BITS(re, gb, log);
4176 CLOSE_READER(re, gb);
4181 static inline int get_dct8x8_allowed(H264Context *h){
/* Returns nonzero if the 8x8 transform may be used for the current MB:
 * no sub-macroblock partition smaller than 8x8 is allowed.  Multiplying
 * the flag mask by 0x0001000100010001ULL replicates it into all four
 * 16-bit lanes, testing the four sub_mb_type entries in one 64-bit op. */
4182 if(h->sps.direct_8x8_inference_flag)
/* with direct_8x8_inference, DIRECT sub-MBs never split below 8x8,
 * so MB_TYPE_DIRECT2 need not be rejected */
4183 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL));
/* NOTE(review): the "else" keyword and closing brace appear to have been
 * dropped from this excerpt */
4185 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4189 * decodes a residual block (CAVLC entropy coding path).
4190 * @param n block index (may be CHROMA_DC_BLOCK_INDEX or LUMA_DC_BLOCK_INDEX for DC-only blocks)
4191 * @param scantable zigzag/field scan order used to place decoded coefficients
4192 * @param max_coeff number of coefficients in the block
4193 * @return <0 if an error occurred
4195 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4196 MpegEncContext * const s = &h->s;
/* maps a predicted nnz count (0..16) to one of the 4 coeff_token VLC tables */
4197 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4199 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4201 //FIXME put trailing_ones into the context
/* --- coeff_token: jointly codes total_coeff and trailing_ones --- */
4203 if(n == CHROMA_DC_BLOCK_INDEX){
4204 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4205 total_coeff= coeff_token>>2;
4207 if(n == LUMA_DC_BLOCK_INDEX){
/* VLC table choice is context-adaptive on the neighbours' nnz counts */
4208 total_coeff= pred_non_zero_count(h, 0);
4209 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4210 total_coeff= coeff_token>>2;
4212 total_coeff= pred_non_zero_count(h, n);
4213 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4214 total_coeff= coeff_token>>2;
4215 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4219 //FIXME set last_non_zero?
/* corrupted stream: more coefficients signalled than fit in the block */
4223 if(total_coeff > (unsigned)max_coeff) {
4224 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
/* low 2 bits of coeff_token are the number of trailing +/-1 levels (0..3) */
4228 trailing_ones= coeff_token&3;
4229 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4230 assert(total_coeff<=16);
/* --- trailing ones: one sign bit each; decoded speculatively from 3 bits,
 *     then only trailing_ones bits are actually consumed --- */
4231 i = show_bits(gb, 3);
4233 skip_bits(gb, trailing_ones);
4234 level[0] = 1-((i&4)>>1);
4235 level[1] = 1-((i&2) );
4236 level[2] = 1-((i&1)<<1);
/* --- non-trivial levels follow the trailing ones --- */
4238 if(trailing_ones<total_coeff) {
/* first level uses suffix_length 0, or 1 when many coeffs / few T1s */
4240 int suffix_length = total_coeff > 10 && trailing_ones < 3;
/* fast path: look the whole prefix+suffix up in cavlc_level_tab */
4241 int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4242 int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4244 skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
/* values >= 100 encode "escape": decode the long form explicitly */
4245 if(level_code >= 100){
4246 prefix= level_code - 100;
4247 if(prefix == LEVEL_TAB_BITS)
4248 prefix += get_level_prefix(gb);
4250 //first coefficient has suffix_length equal to 0 or 1
4251 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4253 level_code= (prefix<<1) + get_bits1(gb); //part
4255 level_code= prefix; //part
4256 }else if(prefix==14){
4258 level_code= (prefix<<1) + get_bits1(gb); //part
4260 level_code= prefix + get_bits(gb, 4); //part
/* prefix >= 15: explicit escape with (prefix-3)-bit suffix */
4262 level_code= 30 + get_bits(gb, prefix-3); //part
4264 level_code += (1<<(prefix-3))-4096;
/* if all three T1 slots were not used, |level| must be > 1 */
4267 if(trailing_ones < 3) level_code += 2;
/* zigzag-decode level_code into a signed level */
4270 mask= -(level_code&1);
4271 level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4273 if(trailing_ones < 3) level_code += (level_code>>31)|1;
/* bump suffix_length once the first magnitude exceeds the threshold */
4276 if(level_code + 3U > 6U)
4278 level[trailing_ones]= level_code;
4281 //remaining coefficients have suffix_length > 0
4282 for(i=trailing_ones+1;i<total_coeff;i++) {
/* thresholds at which suffix_length is incremented */
4283 static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4284 int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4285 level_code= cavlc_level_tab[suffix_length][bitsi][0];
4287 skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4288 if(level_code >= 100){
4289 prefix= level_code - 100;
4290 if(prefix == LEVEL_TAB_BITS){
4291 prefix += get_level_prefix(gb);
4294 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4296 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4298 level_code += (1<<(prefix-3))-4096;
4300 mask= -(level_code&1);
4301 level_code= (((2+level_code)>>1) ^ mask) - mask;
4303 level[i]= level_code;
/* adapt suffix_length for the next level */
4305 if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
/* --- total_zeros: zeros interleaved among the coefficients --- */
4310 if(total_coeff == max_coeff)
4313 if(n == CHROMA_DC_BLOCK_INDEX)
4314 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4316 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
/* --- write levels back in reverse scan order, reading run_before between
 *     them; two variants: raw store vs. dequantized store (qmul) --- */
4319 coeff_num = zeros_left + total_coeff - 1;
4320 j = scantable[coeff_num];
4322 block[j] = level[0];
4323 for(i=1;i<total_coeff;i++) {
4326 else if(zeros_left < 7){
4327 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4329 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4331 zeros_left -= run_before;
4332 coeff_num -= 1 + run_before;
4333 j= scantable[ coeff_num ];
/* dequantizing variant: level * qmul[j] with rounding, >>6 */
4338 block[j] = (level[0] * qmul[j] + 32)>>6;
4339 for(i=1;i<total_coeff;i++) {
4342 else if(zeros_left < 7){
4343 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4345 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4347 zeros_left -= run_before;
4348 coeff_num -= 1 + run_before;
4349 j= scantable[ coeff_num ];
4351 block[j]= (level[i] * qmul[j] + 32)>>6;
/* sanity check: run_before values must not exceed the signalled zeros */
4356 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4363 static void predict_field_decoding_flag(H264Context *h){
/* For a skipped MBAFF pair with no explicit mb_field_decoding_flag:
 * inherit the frame/field decoding mode from the left neighbour if it is
 * in the same slice, otherwise from the top neighbour.
 * NOTE(review): the final ": 0;" fallback of the ternary appears to be
 * missing from this excerpt. */
4364 MpegEncContext * const s = &h->s;
4365 const int mb_xy= h->mb_xy;
4366 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4367 ? s->current_picture.mb_type[mb_xy-1]
4368 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4369 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4371 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4375 * decodes a P_SKIP or B_SKIP macroblock: no residual, motion inferred.
4377 static void decode_mb_skip(H264Context *h){
4378 MpegEncContext * const s = &h->s;
4379 const int mb_xy= h->mb_xy;
/* a skipped MB has no coded coefficients anywhere */
4382 memset(h->non_zero_count[mb_xy], 0, 16);
4383 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4386 mb_type|= MB_TYPE_INTERLACED;
4388 if( h->slice_type_nos == FF_B_TYPE )
4390 // B_SKIP: direct prediction; flags below are just for fill_caches,
4391 // pred_direct_motion will set the real mb_type
4391 mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4393 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4394 pred_direct_motion(h, &mb_type);
4395 mb_type|= MB_TYPE_SKIP;
/* P_SKIP: 16x16 with list0 ref 0 and the predicted MV */
4400 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4402 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4403 pred_pskip_motion(h, &mx, &my);
4404 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4405 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
/* commit per-MB state to the picture-level arrays */
4408 write_back_motion(h, mb_type);
4409 s->current_picture.mb_type[mb_xy]= mb_type;
4410 s->current_picture.qscale_table[mb_xy]= s->qscale;
4411 h->slice_table[ mb_xy ]= h->slice_num;
4412 h->prev_mb_skipped= 1;
4416 * decodes a macroblock using CAVLC entropy coding.
4417 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4419 static int decode_mb_cavlc(H264Context *h){
4420 MpegEncContext * const s = &h->s;
4422 int partition_count;
4423 unsigned int mb_type, cbp;
4424 int dct8x8_allowed= h->pps.transform_8x8_mode;
4426 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4428 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4429 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
/* --- skip-run handling (P/B slices only) --- */
4431 if(h->slice_type_nos != FF_I_TYPE){
4432 if(s->mb_skip_run==-1)
4433 s->mb_skip_run= get_ue_golomb(&s->gb);
4435 if (s->mb_skip_run--) {
/* MBAFF: top MB of a pair must decide frame/field before skipping */
4436 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4437 if(s->mb_skip_run==0)
4438 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4440 predict_field_decoding_flag(h);
4447 if( (s->mb_y&1) == 0 )
4448 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4451 h->prev_mb_skipped= 0;
/* --- mb_type: table-mapped per slice type; intra types fall through --- */
4453 mb_type= get_ue_golomb(&s->gb);
4454 if(h->slice_type_nos == FF_B_TYPE){
4456 partition_count= b_mb_type_info[mb_type].partition_count;
4457 mb_type= b_mb_type_info[mb_type].type;
4460 goto decode_intra_mb;
4462 }else if(h->slice_type_nos == FF_P_TYPE){
4464 partition_count= p_mb_type_info[mb_type].partition_count;
4465 mb_type= p_mb_type_info[mb_type].type;
4468 goto decode_intra_mb;
4471 assert(h->slice_type_nos == FF_I_TYPE);
4472 if(h->slice_type == FF_SI_TYPE && mb_type)
4476 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
/* intra mb_type also fixes cbp and the 16x16 prediction mode */
4480 cbp= i_mb_type_info[mb_type].cbp;
4481 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4482 mb_type= i_mb_type_info[mb_type].type;
4486 mb_type |= MB_TYPE_INTERLACED;
4488 h->slice_table[ mb_xy ]= h->slice_num;
/* --- I_PCM: raw byte-aligned samples, no prediction or transform --- */
4490 if(IS_INTRA_PCM(mb_type)){
4493 // We assume these blocks are very rare so we do not optimize it.
4494 align_get_bits(&s->gb);
4496 // The pixels are stored in the same order as levels in h->mb array.
4497 for(x=0; x < (CHROMA ? 384 : 256); x++){
4498 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4501 // In deblocking, the quantizer is 0
4502 s->current_picture.qscale_table[mb_xy]= 0;
4503 // All coeffs are present
4504 memset(h->non_zero_count[mb_xy], 16, 16);
4506 s->current_picture.mb_type[mb_xy]= mb_type;
/* MBAFF field MB: each field uses twice the refs (top+bottom fields) */
4511 h->ref_count[0] <<= 1;
4512 h->ref_count[1] <<= 1;
4515 fill_caches(h, mb_type, 0);
/* --- intra prediction modes --- */
4518 if(IS_INTRA(mb_type)){
4520 // init_top_left_availability(h);
4521 if(IS_INTRA4x4(mb_type)){
4524 if(dct8x8_allowed && get_bits1(&s->gb)){
4525 mb_type |= MB_TYPE_8x8DCT;
4529 // fill_intra4x4_pred_table(h);
4530 for(i=0; i<16; i+=di){
4531 int mode= pred_intra_mode(h, i);
/* prev_intra4x4_pred_mode_flag == 0: 3-bit rem_mode overrides prediction */
4533 if(!get_bits1(&s->gb)){
4534 const int rem_mode= get_bits(&s->gb, 3);
4535 mode = rem_mode + (rem_mode >= mode);
4539 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4541 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4543 write_back_intra_pred_mode(h);
4544 if( check_intra4x4_pred_mode(h) < 0)
4547 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4548 if(h->intra16x16_pred_mode < 0)
4552 pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4555 h->chroma_pred_mode= pred_mode;
/* --- 8x8 partitions: sub_mb_types, refs and MVs per sub-block --- */
4557 }else if(partition_count==4){
4558 int i, j, sub_partition_count[4], list, ref[2][4];
4560 if(h->slice_type_nos == FF_B_TYPE){
4562 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4563 if(h->sub_mb_type[i] >=13){
4564 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4567 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4568 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4570 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4571 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4572 pred_direct_motion(h, &mb_type);
/* mark interior positions unavailable so later prediction is not
 * contaminated by the direct-filled values */
4573 h->ref_cache[0][scan8[4]] =
4574 h->ref_cache[1][scan8[4]] =
4575 h->ref_cache[0][scan8[12]] =
4576 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4579 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4581 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4582 if(h->sub_mb_type[i] >=4){
4583 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4586 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4587 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
/* reference indices per 8x8 sub-block; 1-ref and 2-ref fast paths */
4591 for(list=0; list<h->list_count; list++){
4592 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4594 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4595 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4599 }else if(ref_count == 2){
4600 tmp= get_bits1(&s->gb)^1;
4602 tmp= get_ue_golomb_31(&s->gb);
4604 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4617 dct8x8_allowed = get_dct8x8_allowed(h);
/* motion vector differences per sub-partition */
4619 for(list=0; list<h->list_count; list++){
4621 if(IS_DIRECT(h->sub_mb_type[i])) {
4622 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4625 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4626 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4628 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4629 const int sub_mb_type= h->sub_mb_type[i];
4630 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4631 for(j=0; j<sub_partition_count[i]; j++){
4633 const int index= 4*i + block_width*j;
4634 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4635 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4636 mx += get_se_golomb(&s->gb);
4637 my += get_se_golomb(&s->gb);
4638 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
/* replicate the MV into every 4x4 cell covered by the partition */
4640 if(IS_SUB_8X8(sub_mb_type)){
4642 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4644 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4645 }else if(IS_SUB_8X4(sub_mb_type)){
4646 mv_cache[ 1 ][0]= mx;
4647 mv_cache[ 1 ][1]= my;
4648 }else if(IS_SUB_4X8(sub_mb_type)){
4649 mv_cache[ 8 ][0]= mx;
4650 mv_cache[ 8 ][1]= my;
4652 mv_cache[ 0 ][0]= mx;
4653 mv_cache[ 0 ][1]= my;
4656 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
/* --- direct 16x16 --- */
4662 }else if(IS_DIRECT(mb_type)){
4663 pred_direct_motion(h, &mb_type);
4664 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
/* --- 16x16 / 16x8 / 8x16 inter partitions --- */
4666 int list, mx, my, i;
4667 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4668 if(IS_16X16(mb_type)){
4669 for(list=0; list<h->list_count; list++){
4671 if(IS_DIR(mb_type, 0, list)){
4672 if(h->ref_count[list]==1){
4674 }else if(h->ref_count[list]==2){
4675 val= get_bits1(&s->gb)^1;
4677 val= get_ue_golomb_31(&s->gb);
4678 if(val >= h->ref_count[list]){
4679 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4684 val= LIST_NOT_USED&0xFF;
4685 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4687 for(list=0; list<h->list_count; list++){
4689 if(IS_DIR(mb_type, 0, list)){
4690 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4691 mx += get_se_golomb(&s->gb);
4692 my += get_se_golomb(&s->gb);
4693 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4695 val= pack16to32(mx,my);
4698 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4701 else if(IS_16X8(mb_type)){
4702 for(list=0; list<h->list_count; list++){
4705 if(IS_DIR(mb_type, i, list)){
4706 if(h->ref_count[list] == 1){
4708 }else if(h->ref_count[list] == 2){
4709 val= get_bits1(&s->gb)^1;
4711 val= get_ue_golomb_31(&s->gb);
4712 if(val >= h->ref_count[list]){
4713 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4718 val= LIST_NOT_USED&0xFF;
4719 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4722 for(list=0; list<h->list_count; list++){
4725 if(IS_DIR(mb_type, i, list)){
4726 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4727 mx += get_se_golomb(&s->gb);
4728 my += get_se_golomb(&s->gb);
4729 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4731 val= pack16to32(mx,my);
4734 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4738 assert(IS_8X16(mb_type));
4739 for(list=0; list<h->list_count; list++){
4742 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4743 if(h->ref_count[list]==1){
4745 }else if(h->ref_count[list]==2){
4746 val= get_bits1(&s->gb)^1;
4748 val= get_ue_golomb_31(&s->gb);
4749 if(val >= h->ref_count[list]){
4750 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4755 val= LIST_NOT_USED&0xFF;
4756 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4759 for(list=0; list<h->list_count; list++){
4762 if(IS_DIR(mb_type, i, list)){
4763 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4764 mx += get_se_golomb(&s->gb);
4765 my += get_se_golomb(&s->gb);
4766 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4768 val= pack16to32(mx,my);
4771 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4777 if(IS_INTER(mb_type))
4778 write_back_motion(h, mb_type);
/* --- coded_block_pattern (skipped for I16x16, which carries it in mb_type) --- */
4780 if(!IS_INTRA16x16(mb_type)){
4781 cbp= get_ue_golomb(&s->gb);
4783 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4788 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4789 else cbp= golomb_to_inter_cbp [cbp];
/* gray (luma-only) mapping tables for the no-chroma case */
4791 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4792 else cbp= golomb_to_inter_cbp_gray[cbp];
/* transform_size_8x8_flag for inter MBs with coded luma */
4797 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4798 if(get_bits1(&s->gb)){
4799 mb_type |= MB_TYPE_8x8DCT;
4800 h->cbp_table[mb_xy]= cbp;
4803 s->current_picture.mb_type[mb_xy]= mb_type;
/* --- residual decoding --- */
4805 if(cbp || IS_INTRA16x16(mb_type)){
4806 int i8x8, i4x4, chroma_idx;
4808 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4809 const uint8_t *scan, *scan8x8, *dc_scan;
4811 // fill_non_zero_count_cache(h);
/* choose field vs frame scan order (qscale==0 uses the *_q0 variants) */
4813 if(IS_INTERLACED(mb_type)){
4814 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4815 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4816 dc_scan= luma_dc_field_scan;
4818 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4819 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4820 dc_scan= luma_dc_zigzag_scan;
/* mb_qp_delta, with wrap into [0,51] */
4823 dquant= get_se_golomb(&s->gb);
4825 if( dquant > 25 || dquant < -26 ){
4826 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4830 s->qscale += dquant;
4831 if(((unsigned)s->qscale) > 51){
4832 if(s->qscale<0) s->qscale+= 52;
4833 else s->qscale-= 52;
4836 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4837 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
/* I16x16: separate luma DC block, then 15-coefficient AC blocks */
4838 if(IS_INTRA16x16(mb_type)){
4839 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4840 return -1; //FIXME continue if partitioned and other return -1 too
4843 assert((cbp&15) == 0 || (cbp&15) == 15);
4846 for(i8x8=0; i8x8<4; i8x8++){
4847 for(i4x4=0; i4x4<4; i4x4++){
4848 const int index= i4x4 + 4*i8x8;
4849 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4855 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
/* other MB types: per-8x8 luma, as 8x8 transform or four 4x4 blocks */
4858 for(i8x8=0; i8x8<4; i8x8++){
4859 if(cbp & (1<<i8x8)){
4860 if(IS_8x8DCT(mb_type)){
4861 DCTELEM *buf = &h->mb[64*i8x8];
4863 for(i4x4=0; i4x4<4; i4x4++){
4864 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4865 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4868 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4869 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4871 for(i4x4=0; i4x4<4; i4x4++){
4872 const int index= i4x4 + 4*i8x8;
4874 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4880 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4881 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
/* chroma DC (qmul==NULL: no dequant here), then chroma AC */
4887 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4888 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4894 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4895 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4896 for(i4x4=0; i4x4<4; i4x4++){
4897 const int index= 16 + 4*chroma_idx + i4x4;
4898 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4904 uint8_t * const nnz= &h->non_zero_count_cache[0];
4905 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4906 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
/* no residual at all: zero every nnz entry */
4909 uint8_t * const nnz= &h->non_zero_count_cache[0];
4910 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4911 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4912 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4914 s->current_picture.qscale_table[mb_xy]= s->qscale;
4915 write_back_non_zero_count(h);
/* undo the MBAFF field-MB ref_count doubling from above */
4918 h->ref_count[0] >>= 1;
4919 h->ref_count[1] >>= 1;
4925 static int decode_cabac_field_decoding_flag(H264Context *h) {
/* CABAC mb_field_decoding_flag for an MBAFF pair: context 70..72 chosen
 * by how many available neighbours (left, above) are field-coded. */
4926 MpegEncContext * const s = &h->s;
4927 const int mb_x = s->mb_x;
/* address of the top MB of the current pair */
4928 const int mb_y = s->mb_y & ~1;
4929 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4930 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4932 unsigned int ctx = 0;
4934 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4937 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4941 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4944 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
/* Decodes an intra mb_type with CABAC: 0 = I4x4, 1..24 = I16x16 variants
 * (encoding cbp_luma/cbp_chroma and pred mode), 25 = I_PCM.
 * intra_slice selects the context increments used in I slices. */
4945 uint8_t *state= &h->cabac_state[ctx_base];
4949 MpegEncContext * const s = &h->s;
4950 const int mba_xy = h->left_mb_xy[0];
4951 const int mbb_xy = h->top_mb_xy;
/* context from neighbours that are intra but not I4x4 */
4953 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4955 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4957 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4958 return 0; /* I4x4 */
4961 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4962 return 0; /* I4x4 */
4965 if( get_cabac_terminate( &h->cabac ) )
4966 return 25; /* PCM */
/* assemble the I16x16 type index from cbp and pred-mode bits */
4968 mb_type = 1; /* I16x16 */
4969 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4970 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4971 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4972 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4973 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4977 static int decode_cabac_mb_type_b( H264Context *h ) {
/* Decodes mb_type for a B slice with CABAC (contexts 27..32):
 * 0 = B_Direct_16x16, 1/2 = B_L0/L1_16x16, 3.. = bi/mixed partitions,
 * 23.. = intra types (delegated to decode_cabac_intra_mb_type). */
4978 MpegEncContext * const s = &h->s;
4980 const int mba_xy = h->left_mb_xy[0];
4981 const int mbb_xy = h->top_mb_xy;
4984 assert(h->slice_type_nos == FF_B_TYPE);
/* context from neighbours that are not direct-coded */
4986 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4988 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4991 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4992 return 0; /* B_Direct_16x16 */
4994 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4995 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
/* 4-bit suffix selects among the remaining partition types */
4998 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4999 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5000 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5001 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5003 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5004 else if( bits == 13 ) {
/* escape to the intra mb_type decoder, offset past the inter types */
5005 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5006 } else if( bits == 14 )
5007 return 11; /* B_L1_L0_8x16 */
5008 else if( bits == 15 )
5009 return 22; /* B_8x8 */
/* one more bit resolves the remaining two-way choices */
5011 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5012 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5015 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
/* Decodes mb_skip_flag with CABAC. Context (11..13 for P, 24.. for B via
 * the +13 offset below) counts available non-skipped neighbours.
 * The MBAFF branch adjusts neighbour addresses for frame/field pairing. */
5016 MpegEncContext * const s = &h->s;
5020 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5021 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
/* left neighbour: use the bottom MB of the pair when pairing matches */
5024 && h->slice_table[mba_xy] == h->slice_num
5025 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5026 mba_xy += s->mb_stride;
5028 mbb_xy = mb_xy - s->mb_stride;
5030 && h->slice_table[mbb_xy] == h->slice_num
5031 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5032 mbb_xy -= s->mb_stride;
5034 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5036 int mb_xy = h->mb_xy;
5038 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5041 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5043 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5046 if( h->slice_type_nos == FF_B_TYPE )
5048 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5051 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
/* CABAC intra4x4 pred mode: first bin (ctx 68) = "use predicted mode";
 * otherwise 3 bins (ctx 69) give rem_intra4x4_pred_mode, skipping the
 * predicted value. */
5054 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5057 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5058 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5059 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
/* skip over the predicted mode so all 9 modes stay reachable */
5061 if( mode >= pred_mode )
5067 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
/* CABAC intra_chroma_pred_mode (0..3), truncated-unary with contexts
 * 64..67; the first bin's context depends on the neighbours' modes. */
5068 const int mba_xy = h->left_mb_xy[0];
5069 const int mbb_xy = h->top_mb_xy;
5073 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5074 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5077 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5080 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5083 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5085 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5091 static int decode_cabac_mb_cbp_luma( H264Context *h) {
/* CABAC luma coded_block_pattern: one bin (contexts 73..76) per 8x8
 * block; each context depends on the corresponding left/top neighbour
 * bits (either from the adjacent MB's cbp or from bits already decoded
 * within this MB). -1 for an unavailable neighbour makes the !(...)
 * tests treat it as coded. */
5092 int cbp_b, cbp_a, ctx, cbp = 0;
5094 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5095 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
5097 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5098 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5099 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
5100 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5101 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
5102 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5103 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
5104 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5107 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
/* CABAC chroma cbp (0 = none, 1 = DC only, 2 = DC+AC), contexts 77..84;
 * neighbour chroma-cbp values are kept in bits 4..5 of left/top cbp. */
5111 cbp_a = (h->left_cbp>>4)&0x03;
5112 cbp_b = (h-> top_cbp>>4)&0x03;
/* first bin: any chroma coefficients at all? */
5115 if( cbp_a > 0 ) ctx++;
5116 if( cbp_b > 0 ) ctx += 2;
5117 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
/* second bin (context offset +4 range): AC as well as DC? */
5121 if( cbp_a == 2 ) ctx++;
5122 if( cbp_b == 2 ) ctx += 2;
5123 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5125 static int decode_cabac_mb_dqp( H264Context *h) {
/* CABAC mb_qp_delta: unary-coded magnitude (contexts 60..63), then
 * mapped to a signed delta (odd counts positive, even negative). */
5126 int ctx= h->last_qscale_diff != 0;
5129 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5132 if(val > 102) //prevent infinite loop
5137 return (val + 1)>>1 ;
5139 return -((val + 1)>>1);
5141 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
/* CABAC sub_mb_type for P slices: small binary tree over contexts 21..23
 * selecting among P_L0_8x8 / 8x4 / 4x8 / 4x4. */
5142 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5144 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5146 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5150 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
/* CABAC sub_mb_type for B slices: binary tree over contexts 36..39,
 * 0 = B_Direct_8x8 up through the 4x4 bi-predictive types. */
5152 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5153 return 0; /* B_Direct_8x8 */
5154 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5155 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5157 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5158 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5159 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5162 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5163 type += get_cabac( &h->cabac, &h->cabac_state[39] );
/* CABAC transform_size_8x8_flag; context 399..401 chosen by how many
 * neighbours already use the 8x8 transform. */
5167 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5168 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5171 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
/* CABAC ref_idx: unary code over contexts 54..; the initial context
 * derives from the left/top cached ref indices (direct-coded B
 * neighbours are excluded). Capped at 32 to guard corrupted streams. */
5172 int refa = h->ref_cache[list][scan8[n] - 1];
5173 int refb = h->ref_cache[list][scan8[n] - 8];
5177 if( h->slice_type_nos == FF_B_TYPE) {
5178 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5180 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5189 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5192 if(ref >= 32 /*h->ref_list[list]*/){
5199 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
/* CABAC motion vector difference, component l (0=x ctx 40.., 1=y ctx
 * 47..): context from the neighbours' absolute mvd sum, then a unary
 * part (up to 9 bins), an exp-Golomb bypass suffix, and a bypass sign. */
5200 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5201 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5202 int ctxbase = (l == 0) ? 40 : 47;
5204 int ctx = (amvd>2) + (amvd>32);
5206 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5211 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
/* exp-Golomb suffix in bypass mode for large magnitudes */
5219 while( get_cabac_bypass( &h->cabac ) ) {
5223 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5228 if( get_cabac_bypass( &h->cabac ) )
5232 return get_cabac_bypass_sign( &h->cabac, -mvd );
5235 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
/* Computes the coded_block_flag context for block category `cat`:
 * ctx = (left nz != 0) + 2*(top nz != 0), offset by 4*cat. DC-category
 * neighbour flags live in dedicated cbp bits; AC categories read the
 * non_zero_count cache. */
5241 nza = h->left_cbp&0x100;
5242 nzb = h-> top_cbp&0x100;
/* chroma DC flags are stored per-component at cbp bits 6+idx */
5244 nza = (h->left_cbp>>(6+idx))&0x01;
5245 nzb = (h-> top_cbp>>(6+idx))&0x01;
5248 assert(cat == 1 || cat == 2 || cat == 4);
5249 nza = h->non_zero_count_cache[scan8[idx] - 1];
5250 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5259 return ctx + 4 * cat;
/* Context offsets for last_significant_coeff_flag in 8x8 blocks: maps
 * each of the 63 non-final scan positions to one of 9 shared contexts
 * (per the H.264 spec's 8x8 context grouping). */
5262 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5263 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5264 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5265 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5266 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
/* Decode one CABAC-coded residual block into 'block'.
 *   cat       : block category (see table below), selects context families
 *   n         : block index within the macroblock
 *   scantable : zigzag/field scan mapping coefficient order -> position
 *   qmul      : dequant table (NULL for DC categories, which stay unscaled)
 *   max_coeff : number of coefficients in the block
 *   is_dc     : compile-time flag; together with cat it lets the compiler
 *               specialize the DC and non-DC paths (see the _dc/_nondc
 *               wrappers below)
 * Three phases: coded_block_flag, significance map (which coefficients are
 * nonzero), then coefficient levels+signs decoded in reverse scan order.
 * NOTE(review): this chunk is missing many intermediate lines. */
5269 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
     /* Context base offsets per category; row [0] is frame-coded, row [1]
      * field-coded (MB_FIELD selects the row). */
5270     static const int significant_coeff_flag_offset[2][6] = {
5271       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5272       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5274     static const int last_coeff_flag_offset[2][6] = {
5275       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5276       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5278     static const int coeff_abs_level_m1_offset[6] = {
5279         227+0, 227+10, 227+20, 227+30, 227+39, 426
     /* 8x8 blocks share significance contexts between scan positions;
      * [0] = frame scan, [1] = field scan. */
5281     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5282       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5283         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5284         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5285        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5286       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5287         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5288         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5289         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5291     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5292      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5293      * map node ctx => cabac ctx for level=1 */
5294     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5295     /* map node ctx => cabac ctx for level>1 */
5296     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5297     static const uint8_t coeff_abs_level_transition[2][8] = {
5298     /* update node ctx after decoding a level=1 */
5299         { 1, 2, 3, 3, 4, 5, 6, 7 },
5300     /* update node ctx after decoding a level>1 */
5301         { 4, 4, 4, 4, 5, 6, 7, 7 }
5307     int coeff_count = 0;
5310     uint8_t *significant_coeff_ctx_base;
5311     uint8_t *last_coeff_ctx_base;
5312     uint8_t *abs_level_m1_ctx_base;
     /* Work on a stack copy of the CABAC state so the compiler can keep it
      * in registers through this hot loop; written back before returning. */
5315 #define CABAC_ON_STACK
5317 #ifdef CABAC_ON_STACK
5320     cc.range     = h->cabac.range;
5321     cc.low       = h->cabac.low;
5322     cc.bytestream= h->cabac.bytestream;
5324 #define CC &h->cabac
5328     /* cat: 0-> DC 16x16  n = 0
5329      *      1-> AC 16x16  n = luma4x4idx
5330      *      2-> Luma4x4   n = luma4x4idx
5331      *      3-> DC Chroma n = iCbCr
5332      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5333      *      5-> Luma8x8   n = 4 * luma8x8idx
5336     /* read coded block flag */
5337     if( is_dc || cat != 5 ) {
5338         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
             /* No coefficients: record a zero count and restore CABAC state. */
5340                 h->non_zero_count_cache[scan8[n]] = 0;
5342 #ifdef CABAC_ON_STACK
5343             h->cabac.range     = cc.range     ;
5344             h->cabac.low       = cc.low       ;
5345             h->cabac.bytestream= cc.bytestream;
5351     significant_coeff_ctx_base = h->cabac_state
5352         + significant_coeff_flag_offset[MB_FIELD][cat];
5353     last_coeff_ctx_base = h->cabac_state
5354         + last_coeff_flag_offset[MB_FIELD][cat];
5355     abs_level_m1_ctx_base = h->cabac_state
5356         + coeff_abs_level_m1_offset[cat];
5358     if( !is_dc && cat == 5 ) {
     /* Significance map: for each scan position, decode significant_flag;
      * if set, record the index and decode last_flag to maybe terminate. */
5359 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5360         for(last= 0; last < coefs; last++) { \
5361             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5362             if( get_cabac( CC, sig_ctx )) { \
5363                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5364                 index[coeff_count++] = last; \
5365                 if( get_cabac( CC, last_ctx ) ) { \
             /* The final position has no explicit flag; it is implicitly
              * significant if the loop runs to the end. */
5371         if( last == max_coeff -1 ) {\
5372             index[coeff_count++] = last;\
5374         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5375 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5376         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5378         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5380         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5382         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5385     assert(coeff_count > 0);
     /* Record nonzero status: DC blocks flag cbp_table, 8x8 blocks fill a
      * 2x2 patch of the nnz cache, 4x4 blocks a single entry. */
5389             h->cbp_table[h->mb_xy] |= 0x100;
5391             h->cbp_table[h->mb_xy] |= 0x40 << n;
5394             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5396             assert( cat == 1 || cat == 2 || cat == 4 );
5397             h->non_zero_count_cache[scan8[n]] = coeff_count;
     /* Levels: walk the significant coefficients in reverse scan order;
      * node_ctx tracks how many level==1 / level>1 coeffs were seen and
      * selects the context via the tables above. */
5402         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5404         int j= scantable[index[--coeff_count]];
5406         if( get_cabac( CC, ctx ) == 0 ) {
5407             node_ctx = coeff_abs_level_transition[0][node_ctx];
             /* |level| == 1: sign bypass bin, dequantized unless DC. */
5409                 block[j] = get_cabac_bypass_sign( CC, -1);
5411                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5415             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5416             node_ctx = coeff_abs_level_transition[1][node_ctx];
             /* |level| > 1: unary prefix up to 14, then exp-Golomb suffix. */
5418             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5422             if( coeff_abs >= 15 ) {
5424                 while( get_cabac_bypass( CC ) ) {
5430                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5436                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5438                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5441     } while( coeff_count );
     /* Write the register-cached CABAC state back to the context. */
5442 #ifdef CABAC_ON_STACK
5443             h->cabac.range     = cc.range     ;
5444             h->cabac.low       = cc.low       ;
5445             h->cabac.bytestream= cc.bytestream;
/* DC specialization wrapper: forces is_dc=1 so the compiler emits a
 * DC-only variant of decode_cabac_residual_internal (no dequant path). */
5451 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5452     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
/* Non-DC specialization wrapper: forces is_dc=0 so the compiler emits the
 * AC/luma variant of decode_cabac_residual_internal (dequant applied). */
5455 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5456     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
/* Dispatcher: categories 0 (luma DC) and 3 (chroma DC) take the DC path,
 * everything else the non-DC path.  NOTE(review): the two visible call
 * styles (direct _internal call vs. _dc/_nondc wrappers) are presumably
 * alternative branches of a preprocessor conditional whose #if/#else
 * lines are missing from this chunk — confirm against the full file. */
5460 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5462     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5464     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5465     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
/* Compute h->top_mb_xy and h->left_mb_xy[0] for the current macroblock.
 * Defaults assume frame coding (top = one stride up, left = mb_xy-1); the
 * MBAFF branch re-derives them from the macroblock *pair* position because
 * field/frame flags of the neighbours change which MB of a pair is
 * adjacent.  NOTE(review): surrounding lines (e.g. the FRAME_MBAFF
 * conditional itself) are missing from this chunk. */
5469 static inline void compute_mb_neighbors(H264Context *h)
5471     MpegEncContext * const s = &h->s;
5472     const int mb_xy = h->mb_xy;
5473     h->top_mb_xy     = mb_xy - s->mb_stride;
5474     h->left_mb_xy[0] = mb_xy - 1;
         /* MBAFF: address of the MB pair containing the current MB, and of
          * the pair above it. */
5476         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5477         const int top_pair_xy      = pair_xy     - s->mb_stride;
5478         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5479         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5480         const int curr_mb_field_flag = MB_FIELD;
5481         const int bottom = (s->mb_y & 1);
5483         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5484             h->top_mb_xy -= s->mb_stride;
         /* When neighbour pair coding mode differs from ours, the left
          * neighbour is the top MB of the left pair. */
5486         if (!left_mb_field_flag == curr_mb_field_flag) {
5487             h->left_mb_xy[0] = pair_xy - 1;
5489     } else if (FIELD_PICTURE) {
         /* Field pictures: rows of one parity, so top is two strides up
          * in picture terms — here one extra stride. */
5490         h->top_mb_xy -= s->mb_stride;
5496  * decodes a macroblock
5497  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
/* Full CABAC macroblock decode: skip flags, mb_type, intra prediction
 * modes or inter motion/reference data, CBP, dqp, and all residual blocks.
 * NOTE(review): this chunk is missing many intermediate lines; comments
 * below annotate only the visible structure. */
5499 static int decode_mb_cabac(H264Context *h) {
5500     MpegEncContext * const s = &h->s;
5502     int mb_type, partition_count, cbp = 0;
5503     int dct8x8_allowed= h->pps.transform_8x8_mode;
5505     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5507     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
     /* ---- skip-flag handling (P/B slices only) ---- */
5508     if( h->slice_type_nos != FF_I_TYPE ) {
5510         /* a skipped mb needs the aff flag from the following mb */
5511         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5512             predict_field_decoding_flag(h);
5513         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5514             skip = h->next_mb_skipped;
5516             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5517         /* read skip flags */
5519             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5520                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5521                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5522                 if(!h->next_mb_skipped)
5523                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
             /* Skipped MB: clear the per-MB side tables and return early
              * (return statement not visible in this chunk). */
5528             h->cbp_table[mb_xy] = 0;
5529             h->chroma_pred_mode_table[mb_xy] = 0;
5530             h->last_qscale_diff = 0;
5537         if( (s->mb_y&1) == 0 )
5539                 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5542     h->prev_mb_skipped = 0;
5544     compute_mb_neighbors(h);
     /* ---- mb_type decode, branching on slice type ---- */
5546     if( h->slice_type_nos == FF_B_TYPE ) {
5547         mb_type = decode_cabac_mb_type_b( h );
5549             partition_count= b_mb_type_info[mb_type].partition_count;
5550             mb_type=         b_mb_type_info[mb_type].type;
5553             goto decode_intra_mb;
5555     } else if( h->slice_type_nos == FF_P_TYPE ) {
5556         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5558             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5559                 /* P_L0_D16x16, P_8x8 */
5560                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5562                 /* P_L0_D8x16, P_L0_D16x8 */
5563                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5565             partition_count= p_mb_type_info[mb_type].partition_count;
5566             mb_type=         p_mb_type_info[mb_type].type;
5568             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5569             goto decode_intra_mb;
5572         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5573         if(h->slice_type == FF_SI_TYPE && mb_type)
5575         assert(h->slice_type_nos == FF_I_TYPE);
5577         partition_count = 0;
5578         cbp= i_mb_type_info[mb_type].cbp;
5579         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5580         mb_type= i_mb_type_info[mb_type].type;
5583         mb_type |= MB_TYPE_INTERLACED;
5585     h->slice_table[ mb_xy ]= h->slice_num;
     /* ---- IPCM: raw samples follow in the bytestream, bypassing CABAC ---- */
5587     if(IS_INTRA_PCM(mb_type)) {
5590         // We assume these blocks are very rare so we do not optimize it.
5591         // FIXME The two following lines get the bitstream position in the cabac
5592         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5593         ptr= h->cabac.bytestream;
5594         if(h->cabac.low&0x1) ptr--;
5596         if(h->cabac.low&0x1FF) ptr--;
5599         // The pixels are stored in the same order as levels in h->mb array.
5600         memcpy(h->mb, ptr, 256); ptr+=256;
5602             memcpy(h->mb+128, ptr, 128); ptr+=128;
         /* Re-initialize the CABAC decoder after the raw PCM bytes. */
5605         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5607         // All blocks are present
5608         h->cbp_table[mb_xy] = 0x1ef;
5609         h->chroma_pred_mode_table[mb_xy] = 0;
5610         // In deblocking, the quantizer is 0
5611         s->current_picture.qscale_table[mb_xy]= 0;
5612         // All coeffs are present
5613         memset(h->non_zero_count[mb_xy], 16, 16);
5614         s->current_picture.mb_type[mb_xy]= mb_type;
5615         h->last_qscale_diff = 0;
     /* MBAFF ref counts are doubled while decoding (fields double the
      * reference list); restored with >>1 at the end of the function. */
5620         h->ref_count[0] <<= 1;
5621         h->ref_count[1] <<= 1;
5624     fill_caches(h, mb_type, 0);
     /* ---- intra prediction modes ---- */
5626     if( IS_INTRA( mb_type ) ) {
5628         if( IS_INTRA4x4( mb_type ) ) {
5629             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5630                 mb_type |= MB_TYPE_8x8DCT;
5631                 for( i = 0; i < 16; i+=4 ) {
5632                     int pred = pred_intra_mode( h, i );
5633                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5634                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5637                 for( i = 0; i < 16; i++ ) {
5638                     int pred = pred_intra_mode( h, i );
5639                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5641                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5644             write_back_intra_pred_mode(h);
5645             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5647             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5648             if( h->intra16x16_pred_mode < 0 ) return -1;
5651         h->chroma_pred_mode_table[mb_xy] =
5652         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5654         pred_mode= check_intra_pred_mode( h, pred_mode );
5655         if( pred_mode < 0 ) return -1;
5656         h->chroma_pred_mode= pred_mode;
     /* ---- inter: 8x8 partitions with sub-partitions ---- */
5658     } else if( partition_count == 4 ) {
5659         int i, j, sub_partition_count[4], list, ref[2][4];
5661         if( h->slice_type_nos == FF_B_TYPE ) {
5662             for( i = 0; i < 4; i++ ) {
5663                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5664                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5665                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5667             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5668                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5669                 pred_direct_motion(h, &mb_type);
5670                 h->ref_cache[0][scan8[4]] =
5671                 h->ref_cache[1][scan8[4]] =
5672                 h->ref_cache[0][scan8[12]] =
5673                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5674                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5675                     for( i = 0; i < 4; i++ )
5676                         if( IS_DIRECT(h->sub_mb_type[i]) )
5677                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5681             for( i = 0; i < 4; i++ ) {
5682                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5683                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5684                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
         /* Reference indices for each non-direct 8x8 partition. */
5688         for( list = 0; list < h->list_count; list++ ) {
5689             for( i = 0; i < 4; i++ ) {
5690                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5691                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5692                     if( h->ref_count[list] > 1 ){
5693                         ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5694                         if(ref[list][i] >= (unsigned)h->ref_count[list]){
5695                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5703                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5704                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5709             dct8x8_allowed = get_dct8x8_allowed(h);
         /* Motion vectors + MVDs for each sub-partition. */
5711         for(list=0; list<h->list_count; list++){
5713                 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5714                 if(IS_DIRECT(h->sub_mb_type[i])){
5715                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5719                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5720                     const int sub_mb_type= h->sub_mb_type[i];
5721                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5722                     for(j=0; j<sub_partition_count[i]; j++){
5725                         const int index= 4*i + block_width*j;
5726                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5727                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5728                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5730                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5731                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5732                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
                         /* Replicate the decoded MV/MVD into every 4x4 cell
                          * covered by this sub-block shape. */
5734                         if(IS_SUB_8X8(sub_mb_type)){
5736                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5738                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5741                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5743                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5744                         }else if(IS_SUB_8X4(sub_mb_type)){
5745                             mv_cache[ 1 ][0]= mx;
5746                             mv_cache[ 1 ][1]= my;
5748                             mvd_cache[ 1 ][0]= mx - mpx;
5749                             mvd_cache[ 1 ][1]= my - mpy;
5750                         }else if(IS_SUB_4X8(sub_mb_type)){
5751                             mv_cache[ 8 ][0]= mx;
5752                             mv_cache[ 8 ][1]= my;
5754                             mvd_cache[ 8 ][0]= mx - mpx;
5755                             mvd_cache[ 8 ][1]= my - mpy;
5757                         mv_cache[ 0 ][0]= mx;
5758                         mv_cache[ 0 ][1]= my;
5760                         mvd_cache[ 0 ][0]= mx - mpx;
5761                         mvd_cache[ 0 ][1]= my - mpy;
                     /* Partition not predicted from this list: zero it. */
5764                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5765                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5766                     p[0] = p[1] = p[8] = p[9] = 0;
5767                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
     /* ---- inter: direct 16x16 ---- */
5771     } else if( IS_DIRECT(mb_type) ) {
5772         pred_direct_motion(h, &mb_type);
5773         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5774         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5775         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
     /* ---- inter: 16x16 / 16x8 / 8x16 partitions ---- */
5777         int list, mx, my, i, mpx, mpy;
5778         if(IS_16X16(mb_type)){
5779             for(list=0; list<h->list_count; list++){
5780                 if(IS_DIR(mb_type, 0, list)){
5782                     if(h->ref_count[list] > 1){
5783                         ref= decode_cabac_mb_ref(h, list, 0);
5784                         if(ref >= (unsigned)h->ref_count[list]){
5785                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5790                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5792                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5794             for(list=0; list<h->list_count; list++){
5795                 if(IS_DIR(mb_type, 0, list)){
5796                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5798                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5799                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5800                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5802                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5803                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5805                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5808         else if(IS_16X8(mb_type)){
5809             for(list=0; list<h->list_count; list++){
5811                     if(IS_DIR(mb_type, i, list)){
5813                         if(h->ref_count[list] > 1){
5814                             ref= decode_cabac_mb_ref( h, list, 8*i );
5815                             if(ref >= (unsigned)h->ref_count[list]){
5816                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5821                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5823                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5826             for(list=0; list<h->list_count; list++){
5828                     if(IS_DIR(mb_type, i, list)){
5829                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5830                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5831                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5832                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5834                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5835                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5837                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5838                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5843             assert(IS_8X16(mb_type));
5844             for(list=0; list<h->list_count; list++){
5846                     if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5848                         if(h->ref_count[list] > 1){
5849                             ref= decode_cabac_mb_ref( h, list, 4*i );
5850                             if(ref >= (unsigned)h->ref_count[list]){
5851                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5856                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5858                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5861             for(list=0; list<h->list_count; list++){
5863                     if(IS_DIR(mb_type, i, list)){
5864                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5865                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5866                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5868                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5869                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5870                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5872                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5873                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
     /* ---- write back motion, then CBP + residuals ---- */
5880     if( IS_INTER( mb_type ) ) {
5881         h->chroma_pred_mode_table[mb_xy] = 0;
5882         write_back_motion( h, mb_type );
5885     if( !IS_INTRA16x16( mb_type ) ) {
5886         cbp  = decode_cabac_mb_cbp_luma( h );
5888             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5891     h->cbp_table[mb_xy] = h->cbp = cbp;
5893     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5894         if( decode_cabac_mb_transform_size( h ) )
5895             mb_type |= MB_TYPE_8x8DCT;
5897     s->current_picture.mb_type[mb_xy]= mb_type;
5899     if( cbp || IS_INTRA16x16( mb_type ) ) {
5900         const uint8_t *scan, *scan8x8, *dc_scan;
5901         const uint32_t *qmul;
         /* Pick frame vs. field scan orders (q0 variants when qscale==0). */
5904         if(IS_INTERLACED(mb_type)){
5905             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5906             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5907             dc_scan= luma_dc_field_scan;
5909             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5910             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5911             dc_scan= luma_dc_zigzag_scan;
5914         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5915         if( dqp == INT_MIN ){
5916             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
         /* qscale wraps modulo 52 per the spec's mb_qp_delta semantics. */
5920         if(((unsigned)s->qscale) > 51){
5921             if(s->qscale<0) s->qscale+= 52;
5922             else            s->qscale-= 52;
5924         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5925         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5927         if( IS_INTRA16x16( mb_type ) ) {
5929             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5930             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5933                 qmul = h->dequant4_coeff[0][s->qscale];
5934                 for( i = 0; i < 16; i++ ) {
5935                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5936                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5939                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5943             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5944                 if( cbp & (1<<i8x8) ) {
5945                     if( IS_8x8DCT(mb_type) ) {
5946                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5947                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5949                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5950                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5951                             const int index = 4*i8x8 + i4x4;
5952                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5954                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5955 //STOP_TIMER("decode_residual")
5959                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5960                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5967             for( c = 0; c < 2; c++ ) {
5968                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5969                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5975             for( c = 0; c < 2; c++ ) {
5976                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5977                 for( i = 0; i < 4; i++ ) {
5978                     const int index = 16 + 4 * c + i;
5979                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5980                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5984             uint8_t * const nnz= &h->non_zero_count_cache[0];
5985             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5986             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         /* No residual at all: clear the whole nnz cache. */
5989         uint8_t * const nnz= &h->non_zero_count_cache[0];
5990         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5991         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5992         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5993         h->last_qscale_diff = 0;
5996     s->current_picture.qscale_table[mb_xy]= s->qscale;
5997     write_back_non_zero_count(h);
     /* Undo the MBAFF doubling from above. */
6000         h->ref_count[0] >>= 1;
6001         h->ref_count[1] >>= 1;
/* Deblock one vertical luma edge of 4 pixels-high segments with boundary
 * strengths bS[0..3].  alpha/beta thresholds are looked up from qp plus
 * the slice offsets (tables biased by +52 to allow negative indices).
 * bS<4 uses the normal tc0-clipped filter, bS==4 the strong intra filter.
 * NOTE(review): intermediate lines (the bS<4/==4 branch) are missing. */
6008 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6009     const int index_a = qp + h->slice_alpha_c0_offset;
6010     const int alpha = (alpha_table+52)[index_a];
6011     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
     /* Thresholds of zero disable filtering entirely for this edge. */
6012     if (alpha ==0 || beta == 0) return;
6016         tc[0] = (tc0_table+52)[index_a][bS[0]];
6017         tc[1] = (tc0_table+52)[index_a][bS[1]];
6018         tc[2] = (tc0_table+52)[index_a][bS[2]];
6019         tc[3] = (tc0_table+52)[index_a][bS[3]];
6020         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6022         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
/* Deblock one vertical chroma edge.  Same structure as filter_mb_edgev,
 * but chroma passes tc0+1 to the DSP filter (the chroma filter subtracts
 * one internally per the spec's chroma clipping rule — TODO confirm). */
6025 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6026     const int index_a = qp + h->slice_alpha_c0_offset;
6027     const int alpha = (alpha_table+52)[index_a];
6028     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6029     if (alpha ==0 || beta == 0) return;
6033         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6034         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6035         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6036         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6037         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6039         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Deblock the vertical left edge of an MBAFF macroblock, row by row
 * (16 luma rows).  Unlike the non-MBAFF path this cannot use the 4-row
 * DSP filters because qp and bS can change per row: qp[2] holds the two
 * field QPs and qp_index selects between them depending on field/frame
 * coding.  Scalar re-implementation of the spec's luma filter.
 * NOTE(review): chunk is missing lines (e.g. tc computation near 6090). */
6043 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6045     for( i = 0; i < 16; i++, pix += stride) {
6051         int bS_index = (i >> 1);
6054             bS_index |= (i & 1);
6057         if( bS[bS_index] == 0 ) {
         /* Field MBs: rows 0..7 use qp[0], 8..15 qp[1]; frame MBs alternate. */
6061         qp_index = MB_FIELD ? (i >> 3) : (i&1);
6062         index_a= qp[qp_index] + h->slice_alpha_c0_offset;
6063         alpha = (alpha_table+52)[index_a];
6064         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6066         if( bS[bS_index] < 4 ) {
             /* Normal filter: clip the delta to ±tc0 (extended by p2/q2 terms). */
6067             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6068             const int p0 = pix[-1];
6069             const int p1 = pix[-2];
6070             const int p2 = pix[-3];
6071             const int q0 = pix[0];
6072             const int q1 = pix[1];
6073             const int q2 = pix[2];
6075             if( FFABS( p0 - q0 ) < alpha &&
6076                 FFABS( p1 - p0 ) < beta &&
6077                 FFABS( q1 - q0 ) < beta ) {
6081                 if( FFABS( p2 - p0 ) < beta ) {
6082                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6085                 if( FFABS( q2 - q0 ) < beta ) {
6086                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6090                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6091                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6092                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6093             tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
         /* bS == 4: strong (intra) filter, may rewrite up to 3 pixels/side. */
6096             const int p0 = pix[-1];
6097             const int p1 = pix[-2];
6098             const int p2 = pix[-3];
6100             const int q0 = pix[0];
6101             const int q1 = pix[1];
6102             const int q2 = pix[2];
6104             if( FFABS( p0 - q0 ) < alpha &&
6105                 FFABS( p1 - p0 ) < beta &&
6106                 FFABS( q1 - q0 ) < beta ) {
6108                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6109                     if( FFABS( p2 - p0 ) < beta)
6111                         const int p3 = pix[-4];
6113                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6114                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6115                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6118                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6120                     if( FFABS( q2 - q0 ) < beta)
6122                         const int q3 = pix[3];
6124                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6125                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6126                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6129                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6133                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6134                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6136                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Chroma counterpart of filter_mb_mbaff_edgev: filters the vertical left
 * edge of an MBAFF macroblock row by row (8 chroma rows), choosing qp per
 * row.  Chroma only touches p0/q0; tc is tc0+1 per the chroma rule.
 * NOTE(review): chunk is missing lines (e.g. the bS_index setup). */
6141 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6143     for( i = 0; i < 8; i++, pix += stride) {
6151         if( bS[bS_index] == 0 ) {
6155         qp_index = MB_FIELD ? (i >> 2) : (i&1);
6156         index_a= qp[qp_index] + h->slice_alpha_c0_offset;
6157         alpha = (alpha_table+52)[index_a];
6158         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6160         if( bS[bS_index] < 4 ) {
6161             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6162             const int p0 = pix[-1];
6163             const int p1 = pix[-2];
6164             const int q0 = pix[0];
6165             const int q1 = pix[1];
6167             if( FFABS( p0 - q0 ) < alpha &&
6168                 FFABS( p1 - p0 ) < beta &&
6169                 FFABS( q1 - q0 ) < beta ) {
6170                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6172                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6173                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6174                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
         /* bS == 4: strong chroma filter (simple 2-tap smoothing). */
6177             const int p0 = pix[-1];
6178             const int p1 = pix[-2];
6179             const int q0 = pix[0];
6180             const int q1 = pix[1];
6182             if( FFABS( p0 - q0 ) < alpha &&
6183                 FFABS( p1 - p0 ) < beta &&
6184                 FFABS( q1 - q0 ) < beta ) {
6186                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6187                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6188                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Deblock one horizontal luma edge — identical in structure to
 * filter_mb_edgev but dispatching to the vertical-filtering (v_loop)
 * DSP functions, which operate across rows. */
6194 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6195     const int index_a = qp + h->slice_alpha_c0_offset;
6196     const int alpha = (alpha_table+52)[index_a];
6197     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6198     if (alpha ==0 || beta == 0) return;
6202         tc[0] = (tc0_table+52)[index_a][bS[0]];
6203         tc[1] = (tc0_table+52)[index_a][bS[1]];
6204         tc[2] = (tc0_table+52)[index_a][bS[2]];
6205         tc[3] = (tc0_table+52)[index_a][bS[3]];
6206         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6208         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
/* Deblock one horizontal chroma edge — chroma variant of filter_mb_edgeh,
 * passing tc0+1 as the chroma filters expect (cf. filter_mb_edgecv). */
6212 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6213     const int index_a = qp + h->slice_alpha_c0_offset;
6214     const int alpha = (alpha_table+52)[index_a];
6215     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6216     if (alpha ==0 || beta == 0) return;
6220         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6221         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6222         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6223         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6224         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6226         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Fast-path deblocking of one macroblock.  Falls back to the full
 * filter_mb() for cases it cannot handle (picture edges, per-plane chroma
 * QP offsets, slice-boundary deblocking mode 2); skips filtering entirely
 * when all averaged QPs are below the threshold where alpha/beta are zero.
 * Intra MBs use fixed boundary strengths; inter MBs compute bS via the
 * DSP helper.  NOTE(review): many intermediate lines are missing,
 * including the tail of the FILTER() application. */
6230 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6231     MpegEncContext * const s = &h->s;
6232     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6234     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
     /* Unsupported cases -> full (slow) filter. */
6238     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6239        !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6240        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6241                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6242         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6245     assert(!FRAME_MBAFF);
6247     mb_type = s->current_picture.mb_type[mb_xy];
6248     qp = s->current_picture.qscale_table[mb_xy];
6249     qp0 = s->current_picture.qscale_table[mb_xy-1];
6250     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6251     qpc = get_chroma_qp( h, 0, qp );
6252     qpc0 = get_chroma_qp( h, 0, qp0 );
6253     qpc1 = get_chroma_qp( h, 0, qp1 );
     /* Edge QPs are the average of the two adjacent MBs' QPs. */
6254     qp0 = (qp + qp0 + 1) >> 1;
6255     qp1 = (qp + qp1 + 1) >> 1;
6256     qpc0 = (qpc + qpc0 + 1) >> 1;
6257     qpc1 = (qpc + qpc1 + 1) >> 1;
     /* Below this threshold the alpha/beta tables are all zero, so the
      * filter would be a no-op — skip it. */
6258     qp_thresh = 15 - h->slice_alpha_c0_offset;
6259     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6260        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
     /* Intra MB: boundary strengths are constant (4 on MB edges, 3 inside;
      * 3 on the top edge of field pictures). */
6263     if( IS_INTRA(mb_type) ) {
6264         int16_t bS4[4] = {4,4,4,4};
6265         int16_t bS3[4] = {3,3,3,3};
6266         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6267         if( IS_8x8DCT(mb_type) ) {
6268             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6269             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6270             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6271             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6273             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6274             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6275             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6276             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6277             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6278             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6279             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6280             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6282         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6283         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6284         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6285         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6286         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6287         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6288         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6289         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
     /* Inter MB: compute bS per edge (aliased as uint64_t rows for fast
      * zero tests and constant fills). */
6292         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6293         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6295         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6297             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6299             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6300                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6301             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6302                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6304             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6305             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6306             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6307                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
         /* Intra neighbours force bS=4 (or 3 for field top edges). */
6309         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6310             bSv[0][0] = 0x0004000400040004ULL;
6311         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6312             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
     /* Apply filters per direction/edge; dir 0 = vertical edges,
      * dir 1 = horizontal; edge 0 uses the averaged cross-MB QP. */
6314 #define FILTER(hv,dir,edge)\
6315         if(bSv[dir][edge]) {\
6316             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6318                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6319                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6325         } else if( IS_8x8DCT(mb_type) ) {
/* Deblock one macroblock along a single direction.
 * dir == 0 filters vertical edges (the neighbour is the MB to the left,
 * mb_xy - 1); dir == 1 filters horizontal edges (neighbour is h->top_mb_xy).
 * For each 4-sample edge segment a boundary strength bS is derived from
 * intra-ness, non-zero coefficient flags, reference-frame identity and
 * motion-vector differences, then the per-edge luma/chroma filters are run.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so brace structure below is incomplete. */
6345 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6346 MpegEncContext * const s = &h->s;
/* mbm_xy: the neighbouring MB across the first edge in this direction. */
6348 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6349 const int mbm_type = s->current_picture.mb_type[mbm_xy];
/* ref2frm maps reference indices to frame numbers so references can be
 * compared across slices; the +20/+2 offset skips the "unused" entries
 * (larger offset when MBAFF doubles the list). */
6350 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6351 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
/* 0xFFFF slice_table entry marks "no neighbour" (picture border):
 * start at edge 1, skipping the MB-boundary edge. */
6352 int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
/* 16x16 skip MBs only need the MB-boundary edge filtered. */
6354 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6355 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6356 // how often to recheck mv-based bS when iterating between edges
6357 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6358 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6359 // how often to recheck mv-based bS when iterating along each edge
6360 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
/* The caller may already have filtered the first vertical edge (MBAFF
 * special case in filter_mb()); skip it here in that case. */
6362 if (first_vertical_edge_done) {
/* deblocking_filter==2: edges between different slices are not filtered. */
6366 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6369 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6370 && !IS_INTERLACED(mb_type)
6371 && IS_INTERLACED(mbm_type)
6373 // This is a special case in the norm where the filtering must
6374 // be done twice (one each of the field) even if we are in a
6375 // frame macroblock.
6377 static const int nnz_idx[4] = {4,5,6,3};
6378 unsigned int tmp_linesize = 2 * linesize;
6379 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6380 int mbn_xy = mb_xy - 2 * s->mb_stride;
/* Filter against both fields of the interlaced MB pair above. */
6385 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6386 if( IS_INTRA(mb_type) ||
6387 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6388 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6390 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6391 for( i = 0; i < 4; i++ ) {
6392 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6393 mbn_nnz[nnz_idx[i]] != 0 )
6399 // Do not use s->qscale as luma quantizer because it has not the same
6400 // value in IPCM macroblocks.
6401 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6402 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6403 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6404 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
/* Chroma QP is averaged between the two MBs sharing the edge. */
6405 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6406 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6407 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6408 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
/* Main loop over the (up to 4) internal/boundary edges in this direction. */
6415 for( edge = start; edge < edges; edge++ ) {
6416 /* mbn_xy: neighbor macroblock */
6417 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6418 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6419 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
/* 8x8 transform: odd (inner 4x4) edges are not filtered. */
6423 if( (edge&1) && IS_8x8DCT(mb_type) )
6426 if( IS_INTRA(mb_type) ||
6427 IS_INTRA(mbn_type) ) {
6430 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6431 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6440 bS[0] = bS[1] = bS[2] = bS[3] = value;
/* Inter MBs: bS derived from nnz/ref/mv; mask_edge lets us reuse the
 * previous edge's result when the partitioning makes it identical. */
6445 if( edge & mask_edge ) {
6446 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6449 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6450 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6453 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6454 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6455 int bn_idx= b_idx - (dir ? 8:1);
/* v = 1 if refs differ or any mv component differs by >= 1 luma sample
 * (>= 4 in quarter-pel units; mvy_limit accounts for field coding). */
6458 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6459 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6460 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6461 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
/* B slices: also compare against the opposite list (L0 vs L1). */
6464 if(h->slice_type_nos == FF_B_TYPE && v){
6466 for( l = 0; !v && l < 2; l++ ) {
6468 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6469 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6470 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6474 bS[0] = bS[1] = bS[2] = bS[3] = v;
/* General case: compute bS separately for each 4-sample segment. */
6480 for( i = 0; i < 4; i++ ) {
6481 int x = dir == 0 ? edge : i;
6482 int y = dir == 0 ? i : edge;
6483 int b_idx= 8 + 4 + x + 8*y;
6484 int bn_idx= b_idx - (dir ? 8:1);
6486 if( h->non_zero_count_cache[b_idx] |
6487 h->non_zero_count_cache[bn_idx] ) {
6493 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6494 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6495 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6496 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6502 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6504 for( l = 0; l < 2; l++ ) {
6506 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6507 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6508 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
/* All-zero bS: nothing to filter on this edge. */
6517 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6522 // Do not use s->qscale as luma quantizer because it has not the same
6523 // value in IPCM macroblocks.
6524 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6525 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6526 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6527 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
/* Vertical edges (dir==0): luma every edge, chroma only even edges
 * (chroma is subsampled 2:1). */
6529 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6530 if( (edge&1) == 0 ) {
6531 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6532 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6533 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6534 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
/* Horizontal edges (dir==1): same scheme, row offsets instead of columns. */
6537 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6538 if( (edge&1) == 0 ) {
6539 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6540 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6541 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6542 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
/* Full (non-fast) per-macroblock deblocking entry point.
 * Handles: an early-out for sufficiently low QP, a CAVLC+8x8dct fixup of
 * the non_zero_count cache, the MBAFF first-vertical-edge special case
 * (8 bS values, two QP pairs), and finally delegates the regular edges to
 * filter_mb_dir() for both directions.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so brace structure below is incomplete. */
6548 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6549 MpegEncContext * const s = &h->s;
6550 const int mb_xy= mb_x + mb_y*s->mb_stride;
6551 const int mb_type = s->current_picture.mb_type[mb_xy];
/* Interlaced MBs use a tighter vertical mv threshold (field units). */
6552 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6553 int first_vertical_edge_done = 0;
6556 //for sufficiently low qp, filtering wouldn't do anything
6557 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6559 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6560 int qp = s->current_picture.qscale_table[mb_xy];
/* Skip the whole MB when this MB and both edge-sharing neighbours are
 * below the threshold. */
6562 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6563 && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6568 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6569 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6570 int top_type, left_type[2];
6571 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6572 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6573 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
/* Rebuild the neighbour rows/columns of the nnz cache from the cbp of
 * the 8x8-transform neighbours. */
6575 if(IS_8x8DCT(top_type)){
6576 h->non_zero_count_cache[4+8*0]=
6577 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6578 h->non_zero_count_cache[6+8*0]=
6579 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6581 if(IS_8x8DCT(left_type[0])){
6582 h->non_zero_count_cache[3+8*1]=
6583 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6585 if(IS_8x8DCT(left_type[1])){
6586 h->non_zero_count_cache[3+8*3]=
6587 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
/* For the current MB each 8x8 block's nnz comes from one cbp bit. */
6590 if(IS_8x8DCT(mb_type)){
6591 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6592 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
6594 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6595 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6597 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6598 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6600 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6601 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6606 // left mb is in picture
6607 && h->slice_table[mb_xy-1] != 0xFFFF
6608 // and current and left pair do not have the same interlaced type
6609 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6610 // and left mb is in the same slice if deblocking_filter == 2
6611 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6612 /* First vertical edge is different in MBAFF frames
6613 * There are 8 different bS to compute and 2 different Qp
6615 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6616 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6621 int mb_qp, mbn0_qp, mbn1_qp;
6623 first_vertical_edge_done = 1;
6625 if( IS_INTRA(mb_type) )
6626 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
/* Otherwise derive each of the 8 bS values from the matching field/frame
 * neighbour's intra-ness and coefficient flags. */
6628 for( i = 0; i < 8; i++ ) {
6629 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6631 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6633 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6634 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6635 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6637 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
/* Two QP pairs: one per left-pair field/frame MB, for luma and both
 * chroma planes. */
6644 mb_qp = s->current_picture.qscale_table[mb_xy];
6645 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6646 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6647 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6648 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6649 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6650 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6651 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6652 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6653 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6654 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6655 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6656 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6659 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6660 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6661 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6662 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6663 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
/* Regular edges: both directions via filter_mb_dir(). */
6667 for( dir = 0; dir < 2; dir++ )
6668 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6670 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6671 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
/* Decode one slice's macroblocks (worker entry point; arg is H264Context**).
 * Three decode loops: CABAC (h->pps.cabac), CAVLC, and a plain decode_mb()
 * loop for partitioned frames. Each loop decodes MBs, advances s->mb_x /
 * s->mb_y, reports decoded ranges to the error resilience code via
 * ff_er_add_slice(), and draws completed MB rows.
 * Returns 0 on normal slice end, -1 on error.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so brace structure below is incomplete. */
6675 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6676 H264Context *h = *(void**)arg;
6677 MpegEncContext * const s = &h->s;
/* Partitioned frames only report AC status to the error concealment. */
6678 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6682 h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6683 (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6685 if( h->pps.cabac ) {
/* CABAC payload starts byte-aligned after the slice header. */
6689 align_get_bits( &s->gb );
6692 ff_init_cabac_states( &h->cabac);
6693 ff_init_cabac_decoder( &h->cabac,
6694 s->gb.buffer + get_bits_count(&s->gb)/8,
6695 (get_bits_left(&s->gb) + 7)/8);
6696 /* calculate pre-state */
/* Initialize all 460 CABAC context states from qscale and the spec's
 * init tables (I vs P/B, selected by cabac_init_idc). */
6697 for( i= 0; i < 460; i++ ) {
6699 if( h->slice_type_nos == FF_I_TYPE )
6700 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6702 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6705 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6707 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6712 int ret = decode_mb_cabac(h);
6714 //STOP_TIMER("decode_mb_cabac")
6716 if(ret>=0) hl_decode_mb(h);
/* MBAFF: decode the bottom MB of the pair immediately. */
6718 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6721 ret = decode_mb_cabac(h);
6723 if(ret>=0) hl_decode_mb(h);
6726 eos = get_cabac_terminate( &h->cabac );
/* Overread by more than 2 bytes means corrupt bitstream. */
6728 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6729 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6730 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6734 if( ++s->mb_x >= s->mb_width ) {
6736 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6738 if(FIELD_OR_MBAFF_PICTURE) {
6743 if( eos || s->mb_y >= s->mb_height ) {
6744 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6745 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
/* CAVLC loop: same structure, slice end detected via bit position. */
6752 int ret = decode_mb_cavlc(h);
6754 if(ret>=0) hl_decode_mb(h);
6756 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6758 ret = decode_mb_cavlc(h);
6760 if(ret>=0) hl_decode_mb(h);
6765 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6766 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6771 if(++s->mb_x >= s->mb_width){
6773 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6775 if(FIELD_OR_MBAFF_PICTURE) {
6778 if(s->mb_y >= s->mb_height){
6779 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6781 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6782 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6786 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
/* mb_skip_run > 0 means skipped MBs are still pending past this point. */
6793 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6794 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6795 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6796 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6800 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
/* Partitioned-frame loop: plain decode_mb() over remaining MBs. */
6809 for(;s->mb_y < s->mb_height; s->mb_y++){
6810 for(;s->mb_x < s->mb_width; s->mb_x++){
6811 int ret= decode_mb(h);
6816 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6817 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6822 if(++s->mb_x >= s->mb_width){
6824 if(++s->mb_y >= s->mb_height){
6825 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6826 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6830 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
/* Fixed garbled tokens here (`s->?gb` / `s->gb?.`) to match the
 * surrounding accesses at internal lines 6825/6838. */
6837 if(get_bits_count(s->gb) >= s->gb.size_in_bits){
6838 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6839 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6843 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6850 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6853 return -1; //not reached
/* Parse a picture timing SEI message (H.264 Annex D.1.2).
 * Reads cpb_removal_delay/dpb_output_delay when HRD parameters are present,
 * then pic_struct and the optional per-clock-timestamp fields.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), including the final return. */
6856 static int decode_picture_timing(H264Context *h){
6857 MpegEncContext * const s = &h->s;
6858 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6859 h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6860 h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6862 if(h->sps.pic_struct_present_flag){
6863 unsigned int i, num_clock_ts;
6864 h->sei_pic_struct = get_bits(&s->gb, 4);
/* Values above FRAME_TRIPLING are reserved -> reject. */
6867 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6870 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6872 for (i = 0 ; i < num_clock_ts ; i++){
6873 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6874 unsigned int full_timestamp_flag;
6875 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6876 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6877 skip_bits(&s->gb, 5); /* counting_type */
6878 full_timestamp_flag = get_bits(&s->gb, 1);
6879 skip_bits(&s->gb, 1); /* discontinuity_flag */
6880 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6881 skip_bits(&s->gb, 8); /* n_frames */
/* Full timestamp: fixed-size seconds/minutes/hours fields. */
6882 if(full_timestamp_flag){
6883 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6884 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6885 skip_bits(&s->gb, 5); /* hours_value 0..23 */
/* Otherwise each component is preceded by its own presence flag. */
6887 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6888 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6889 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6890 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6891 if(get_bits(&s->gb, 1)) /* hours_flag */
6892 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6896 if(h->sps.time_offset_length > 0)
6897 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6901 if(s->avctx->debug & FF_DEBUG_PICT_INFO)
6902 av_log(s->avctx, AV_LOG_DEBUG, "ct_type:%X pic_struct:%d\n", h->sei_ct_type, h->sei_pic_struct);
/* Parse an unregistered user-data SEI message: copy up to 16+255 payload
 * bytes, then scan for an x264 version banner to set h->x264_build
 * (used for bug workarounds elsewhere). Remaining payload bytes are skipped.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps) — in particular the NUL-termination of user_data
 * before the sscanf must happen in one of the missing lines; verify. */
6907 static int decode_unregistered_user_data(H264Context *h, int size){
6908 MpegEncContext * const s = &h->s;
6909 uint8_t user_data[16+256];
6915 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6916 user_data[i]= get_bits(&s->gb, 8);
/* Payload starts with a 16-byte UUID; the text begins at offset 16. */
6920 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6921 if(e==1 && build>=0)
6922 h->x264_build= build;
6924 if(s->avctx->debug & FF_DEBUG_BUGS)
6925 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
/* Skip any payload bytes that did not fit the local buffer. */
6928 skip_bits(&s->gb, 8);
/* Parse a recovery point SEI message: store the recovery frame count and
 * skip the four flag bits. (Return statement lies in lines missing from
 * this extract.) */
6933 static int decode_recovery_point(H264Context *h){
6934 MpegEncContext * const s = &h->s;
6936 h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6937 skip_bits(&s->gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
/* Parse a buffering period SEI message (H.264 Annex D.1.1): look up the
 * referenced SPS, then read initial_cpb_removal_delay for every scheduler
 * selection index under the NAL and/or VCL HRD, and mark the message as
 * present. Errors on a reference to a non-existing SPS. */
6942 static int decode_buffering_period(H264Context *h){
6943 MpegEncContext * const s = &h->s;
6944 unsigned int sps_id;
6948 sps_id = get_ue_golomb_31(&s->gb);
6949 if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6950 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6953 sps = h->sps_buffers[sps_id];
6955 // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6956 if (sps->nal_hrd_parameters_present_flag) {
6957 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6958 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6959 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6962 if (sps->vcl_hrd_parameters_present_flag) {
6963 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6964 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6965 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6969 h->sei_buffering_period_present = 1;
/* Top-level SEI NAL parser: iterate over SEI messages in the bitstream.
 * Each message's type and size are coded as sequences of 0xFF-extended
 * bytes; dispatch known types to their decoders and skip unknown payloads.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so the switch/loop structure below is incomplete. */
6973 int ff_h264_decode_sei(H264Context *h){
6974 MpegEncContext * const s = &h->s;
6976 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
/* type/size accumulate 255 per 0xFF byte, plus the final byte. */
6981 type+= show_bits(&s->gb, 8);
6982 }while(get_bits(&s->gb, 8) == 255);
6986 size+= show_bits(&s->gb, 8);
6987 }while(get_bits(&s->gb, 8) == 255);
6990 case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6991 if(decode_picture_timing(h) < 0)
6994 case SEI_TYPE_USER_DATA_UNREGISTERED:
6995 if(decode_unregistered_user_data(h, size) < 0)
6998 case SEI_TYPE_RECOVERY_POINT:
6999 if(decode_recovery_point(h) < 0)
7002 case SEI_BUFFERING_PERIOD:
7003 if(decode_buffering_period(h) < 0)
/* Unknown message type: skip the whole payload. */
7007 skip_bits(&s->gb, 8*size);
7010 //FIXME check bits here
7011 align_get_bits(&s->gb);
/* Parse HRD parameters (H.264 Annex E.1.2) into the SPS: cpb_cnt and the
 * bit lengths of the various removal/output delay fields. Per-CPB rate and
 * size values are read and discarded. Returns an error on cpb_cnt > 32. */
7017 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7018 MpegEncContext * const s = &h->s;
7020 cpb_count = get_ue_golomb_31(&s->gb) + 1;
7022 if(cpb_count > 32U){
7023 av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7027 get_bits(&s->gb, 4); /* bit_rate_scale */
7028 get_bits(&s->gb, 4); /* cpb_size_scale */
7029 for(i=0; i<cpb_count; i++){
7030 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7031 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7032 get_bits1(&s->gb); /* cbr_flag */
/* Field lengths are coded minus one except time_offset_length. */
7034 sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7035 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7036 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7037 sps->time_offset_length = get_bits(&s->gb, 5);
7038 sps->cpb_cnt = cpb_count;
/* Parse VUI parameters (H.264 Annex E.1.1) into the SPS: sample aspect
 * ratio, video signal description, chroma sample location, timing info,
 * NAL/VCL HRD parameters, pic_struct flag and bitstream restrictions.
 * Fields the decoder does not use are read and discarded.
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so some branches/returns are not visible. */
7042 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7043 MpegEncContext * const s = &h->s;
7044 int aspect_ratio_info_present_flag;
7045 unsigned int aspect_ratio_idc;
7047 aspect_ratio_info_present_flag= get_bits1(&s->gb);
7049 if( aspect_ratio_info_present_flag ) {
7050 aspect_ratio_idc= get_bits(&s->gb, 8);
/* EXTENDED_SAR carries an explicit 16+16 bit num/den pair; other idc
 * values index the predefined pixel_aspect table. */
7051 if( aspect_ratio_idc == EXTENDED_SAR ) {
7052 sps->sar.num= get_bits(&s->gb, 16);
7053 sps->sar.den= get_bits(&s->gb, 16);
7054 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7055 sps->sar= pixel_aspect[aspect_ratio_idc];
7057 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7064 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7066 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7067 get_bits1(&s->gb); /* overscan_appropriate_flag */
7070 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7071 get_bits(&s->gb, 3); /* video_format */
7072 get_bits1(&s->gb); /* video_full_range_flag */
7073 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7074 get_bits(&s->gb, 8); /* colour_primaries */
7075 get_bits(&s->gb, 8); /* transfer_characteristics */
7076 get_bits(&s->gb, 8); /* matrix_coefficients */
7080 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7081 s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1; /* chroma_sample_location_type_top_field */
7082 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7085 sps->timing_info_present_flag = get_bits1(&s->gb);
7086 if(sps->timing_info_present_flag){
7087 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7088 sps->time_scale = get_bits_long(&s->gb, 32);
/* Reject zero/overflowing timing values before they are used as a
 * time base elsewhere. */
7089 if(sps->num_units_in_tick-1 > 0x7FFFFFFEU || sps->time_scale-1 > 0x7FFFFFFEU){
7090 av_log(h->s.avctx, AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
7093 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7096 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7097 if(sps->nal_hrd_parameters_present_flag)
7098 if(decode_hrd_parameters(h, sps) < 0)
7100 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7101 if(sps->vcl_hrd_parameters_present_flag)
7102 if(decode_hrd_parameters(h, sps) < 0)
7104 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7105 get_bits1(&s->gb); /* low_delay_hrd_flag */
7106 sps->pic_struct_present_flag = get_bits1(&s->gb);
7108 sps->bitstream_restriction_flag = get_bits1(&s->gb);
7109 if(sps->bitstream_restriction_flag){
7110 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7111 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7112 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7113 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7114 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7115 sps->num_reorder_frames= get_ue_golomb(&s->gb);
7116 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7118 if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7119 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
/* Parse one quantization scaling list of `size` (16 or 64) entries.
 * If the present flag is 0, copy fallback_list (previous list or SPS/default
 * per the spec's fallback rules). Otherwise decode delta-coded values in
 * zigzag order; a first delta yielding 0 selects the JVT default list.
 * NOTE(review): the `else` branch and loop close are in lines missing from
 * this extract (the embedded numbering has gaps). */
7127 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7128 const uint8_t *jvt_list, const uint8_t *fallback_list){
7129 MpegEncContext * const s = &h->s;
7130 int i, last = 8, next = 8;
7131 const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7132 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7133 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7135 for(i=0;i<size;i++){
/* Deltas are signed Exp-Golomb, accumulated modulo 256. */
7137 next = (last + get_se_golomb(&s->gb)) & 0xff;
7138 if(!i && !next){ /* matrix not written, we use the preset one */
7139 memcpy(factors, jvt_list, size*sizeof(uint8_t));
/* next==0 means "repeat the previous value" for the rest of the list. */
7142 last = factors[scan[i]] = next ? next : last;
/* Parse the full set of scaling matrices for an SPS or PPS: six 4x4 lists
 * and (for 8x8 transform) two 8x8 lists. The fallback for each list is the
 * previously decoded list, or the SPS matrix / JVT default when decoding a
 * PPS whose SPS carried matrices (is_sps/fallback_sps logic). */
7146 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7147 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7148 MpegEncContext * const s = &h->s;
7149 int fallback_sps = !is_sps && sps->scaling_matrix_present;
/* Fallback sources for the first list of each group (Intra-Y 4x4,
 * Inter-Y 4x4, Intra-Y 8x8, Inter-Y 8x8). */
7150 const uint8_t *fallback[4] = {
7151 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7152 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7153 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7154 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7156 if(get_bits1(&s->gb)){
7157 sps->scaling_matrix_present |= is_sps;
7158 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7159 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7160 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7161 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7162 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7163 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
/* 8x8 lists are only present when the 8x8 transform can be used. */
7164 if(is_sps || pps->transform_8x8_mode){
7165 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7166 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
/* Parse a sequence parameter set NAL: profile/level, chroma format and
 * scaling matrices (high profile), POC parameters, reference frame count,
 * picture dimensions, AFF/cropping flags and optional VUI parameters.
 * The decoded SPS is validated and stored in h->sps_buffers[sps_id].
 * NOTE(review): this extract is missing some original lines (the embedded
 * numbering has gaps), so some branches/returns are not visible. */
7171 int ff_h264_decode_seq_parameter_set(H264Context *h){
7172 MpegEncContext * const s = &h->s;
7173 int profile_idc, level_idc;
7174 unsigned int sps_id;
7178 profile_idc= get_bits(&s->gb, 8);
7179 get_bits1(&s->gb); //constraint_set0_flag
7180 get_bits1(&s->gb); //constraint_set1_flag
7181 get_bits1(&s->gb); //constraint_set2_flag
7182 get_bits1(&s->gb); //constraint_set3_flag
7183 get_bits(&s->gb, 4); // reserved
7184 level_idc= get_bits(&s->gb, 8);
7185 sps_id= get_ue_golomb_31(&s->gb);
7187 if(sps_id >= MAX_SPS_COUNT) {
7188 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7191 sps= av_mallocz(sizeof(SPS));
7195 sps->profile_idc= profile_idc;
7196 sps->level_idc= level_idc;
/* Flat default: all scaling factors 16 (i.e. no scaling). */
7198 memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7199 memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7200 sps->scaling_matrix_present = 0;
7202 if(sps->profile_idc >= 100){ //high profile
7203 sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7204 if(sps->chroma_format_idc == 3)
7205 sps->residual_color_transform_flag = get_bits1(&s->gb);
7206 sps->bit_depth_luma = get_ue_golomb(&s->gb) + 8;
7207 sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7208 sps->transform_bypass = get_bits1(&s->gb);
7209 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
/* Non-high profiles are always 4:2:0. */
7211 sps->chroma_format_idc= 1;
7214 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7215 sps->poc_type= get_ue_golomb_31(&s->gb);
7217 if(sps->poc_type == 0){ //FIXME #define
7218 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7219 } else if(sps->poc_type == 1){//FIXME #define
7220 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7221 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7222 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7223 sps->poc_cycle_length = get_ue_golomb(&s->gb);
7225 if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7226 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7230 for(i=0; i<sps->poc_cycle_length; i++)
7231 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7232 }else if(sps->poc_type != 2){
7233 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7237 sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7238 if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7239 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7242 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
/* Dimensions are coded in macroblocks, minus one. */
7243 sps->mb_width = get_ue_golomb(&s->gb) + 1;
7244 sps->mb_height= get_ue_golomb(&s->gb) + 1;
7245 if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7246 avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7247 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7251 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7252 if(!sps->frame_mbs_only_flag)
7253 sps->mb_aff= get_bits1(&s->gb);
7257 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7259 #ifndef ALLOW_INTERLACE
7261 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7263 sps->crop= get_bits1(&s->gb);
7265 sps->crop_left = get_ue_golomb(&s->gb);
7266 sps->crop_right = get_ue_golomb(&s->gb);
7267 sps->crop_top = get_ue_golomb(&s->gb);
7268 sps->crop_bottom= get_ue_golomb(&s->gb);
7269 if(sps->crop_left || sps->crop_top){
7270 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7272 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7273 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7279 sps->crop_bottom= 0;
7282 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7283 if( sps->vui_parameters_present_flag )
7284 if (decode_vui_parameters(h, sps) < 0)
7287 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7288 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7289 sps_id, sps->profile_idc, sps->level_idc,
7291 sps->ref_frame_count,
7292 sps->mb_width, sps->mb_height,
7293 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7294 sps->direct_8x8_inference_flag ? "8B8" : "",
7295 sps->crop_left, sps->crop_right,
7296 sps->crop_top, sps->crop_bottom,
7297 sps->vui_parameters_present_flag ? "VUI" : "",
7298 ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7299 sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7300 sps->timing_info_present_flag ? sps->time_scale : 0
/* Replace any previously stored SPS with the same id. */
7304 av_free(h->sps_buffers[sps_id]);
7305 h->sps_buffers[sps_id]= sps;
/**
 * Precompute the chroma QP lookup table for one chroma component.
 * For every luma QP value 0..51, stores the chroma QP obtained by applying
 * the per-component offset and clipping to [0,51] before indexing the
 * spec-defined chroma_qp[] mapping table.
 *
 * @param pps   PPS whose chroma_qp_table[t] is filled in
 * @param t     chroma component index (0 = Cb offset, 1 = Cr offset)
 * @param index chroma_qp_index_offset to add before the table lookup
 *
 * NOTE(review): this chunk is elided — the return type, local declarations
 * and braces of this function are not visible here.
 */
7314 build_qp_table(PPS *pps, int t, int index)
7317     for(i = 0; i < 52; i++)
7318         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
/**
 * Decode a picture parameter set (PPS) NAL unit from s->gb.
 * Allocates a new PPS, parses its fields, inherits scaling matrices from the
 * referenced SPS, builds the chroma QP tables and stores the PPS in
 * h->pps_buffers[pps_id] (replacing any previous one).
 *
 * @param h          decoder context (bitstream is read from h->s.gb)
 * @param bit_length number of bits in the RBSP, used to detect the optional
 *                   trailing high-profile fields
 *
 * NOTE(review): this chunk is elided — several error-return paths and the
 * final return are not visible here.
 */
7321 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7322     MpegEncContext * const s = &h->s;
7323     unsigned int pps_id= get_ue_golomb(&s->gb);
/* reject out-of-range pps_id before allocating anything */
7326     if(pps_id >= MAX_PPS_COUNT) {
7327         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7331     pps= av_mallocz(sizeof(PPS));
/* the referenced SPS must already have been decoded */
7334     pps->sps_id= get_ue_golomb_31(&s->gb);
7335     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7336         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7340     pps->cabac= get_bits1(&s->gb);
7341     pps->pic_order_present= get_bits1(&s->gb);
7342     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
/* slice groups (FMO) are parsed but not supported by this decoder;
 * the commented tables below are the relevant syntax from the H.264 spec */
7343     if(pps->slice_group_count > 1 ){
7344         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7345         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7346         switch(pps->mb_slice_group_map_type){
7349 |           for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7350 |               run_length[ i ]                             |1  |ue(v)   |
7355 |           for( i = 0; i < num_slice_groups_minus1; i++ ) |   |        |
7357 |               top_left_mb[ i ]                            |1  |ue(v)   |
7358 |               bottom_right_mb[ i ]                        |1  |ue(v)   |
7366 |           slice_group_change_direction_flag               |1  |u(1)    |
7367 |           slice_group_change_rate_minus1                  |1  |ue(v)   |
7372 |           slice_group_id_cnt_minus1                       |1  |ue(v)   |
7373 |           for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7375 |               slice_group_id[ i ]                         |1  |u(v)    |
/* default L0/L1 reference counts, capped at 32 per list */
7380     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7381     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7382     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7383         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7387     pps->weighted_pred= get_bits1(&s->gb);
7388     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
/* QP fields are coded as signed offsets from 26 */
7389     pps->init_qp= get_se_golomb(&s->gb) + 26;
7390     pps->init_qs= get_se_golomb(&s->gb) + 26;
7391     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7392     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7393     pps->constrained_intra_pred= get_bits1(&s->gb);
7394     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7396     pps->transform_8x8_mode= 0;
7397     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
/* start from the SPS scaling matrices; may be overridden below */
7398     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7399     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
/* optional trailing fields (present only if bits remain in the RBSP) */
7401     if(get_bits_count(&s->gb) < bit_length){
7402         pps->transform_8x8_mode= get_bits1(&s->gb);
7403         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7404         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
/* no second offset coded: Cr reuses the Cb offset */
7406         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7409     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7410     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7411     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7412         h->pps.chroma_qp_diff= 1;
7414     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7415         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7416                pps_id, pps->sps_id,
7417                pps->cabac ? "CABAC" : "CAVLC",
7418                pps->slice_group_count,
7419                pps->ref_count[0], pps->ref_count[1],
7420                pps->weighted_pred ? "weighted" : "",
7421                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7422                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7423                pps->constrained_intra_pred ? "CONSTR" : "",
7424                pps->redundant_pic_cnt_present ? "REDU" : "",
7425                pps->transform_8x8_mode ? "8x8DCT" : ""
/* replace any previously stored PPS with the same id */
7429     av_free(h->pps_buffers[pps_id]);
7430     h->pps_buffers[pps_id]= pps;
7438 * Call decode_slice() for each context.
7440 * @param h h264 master context
7441 * @param context_count number of contexts to execute
7443 static void execute_decode_slices(H264Context *h, int context_count){
7444     MpegEncContext * const s = &h->s;
7445     AVCodecContext * const avctx= s->avctx;
/* hwaccel / VDPAU paths bypass software slice decoding
 * (NOTE(review): their branch bodies are elided in this chunk) */
7449     if (s->avctx->hwaccel)
7451     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
/* single-context fast path: decode directly on the master context */
7453     if(context_count == 1) {
7454         decode_slice(avctx, &h);
/* multi-context: propagate error settings to every thread context... */
7456         for(i = 1; i < context_count; i++) {
7457             hx = h->thread_context[i];
7458             hx->s.error_recognition = avctx->error_recognition;
7459             hx->s.error_count = 0;
/* ...then run all slice decodes through the execute() callback */
7462         avctx->execute(avctx, (void *)decode_slice,
7463                        h->thread_context, NULL, context_count, sizeof(void*));
7465         /* pull back stuff from slices to master context */
7466         hx = h->thread_context[context_count - 1];
7467         s->mb_x = hx->s.mb_x;
7468         s->mb_y = hx->s.mb_y;
7469         s->dropable = hx->s.dropable;
7470         s->picture_structure = hx->s.picture_structure;
/* accumulate per-thread error counts into the master context */
7471         for(i = 1; i < context_count; i++)
7472             h->s.error_count += h->thread_context[i]->s.error_count;
/**
 * Split the input buffer into NAL units and dispatch each by type
 * (slices, DPA/DPB/DPC partitions, SEI, SPS, PPS, ...). Supports both
 * length-prefixed AVC ("avcC") and Annex-B start-code framing, selected by
 * h->is_avc. Slices are queued into thread contexts and executed in batches
 * of h->max_contexts via execute_decode_slices().
 *
 * NOTE(review): this chunk is elided — the main loop header, several error
 * paths and the final return are not visible here.
 */
7477 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7478     MpegEncContext * const s = &h->s;
7479     AVCodecContext * const avctx= s->avctx;
7481     H264Context *hx; ///< thread context
7482     int context_count = 0;
/* for Annex-B the whole buffer is one scan range; for AVC it is set per NAL */
7483     int next_avc= h->is_avc ? 0 : buf_size;
7485     h->max_contexts = avctx->thread_count;
/* debug hexdump of the buffer head (elided guard around this loop) */
7488     for(i=0; i<50; i++){
7489         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
/* unless decoding in chunks, start a fresh access unit */
7492     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7493         h->current_slice = 0;
7494         if (!s->first_field)
7495             s->current_picture_ptr= NULL;
/* --- per-NAL framing: read a big-endian length prefix (AVC) ... --- */
7507         if(buf_index >= next_avc) {
7508             if(buf_index >= buf_size) break;
7510             for(i = 0; i < h->nal_length_size; i++)
7511                 nalsize = (nalsize << 8) | buf[buf_index++];
7512             if(nalsize <= 1 || nalsize > buf_size - buf_index){
7517                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7521             next_avc= buf_index + nalsize;
7523             // start code prefix search
7524             for(; buf_index + 3 < next_avc; buf_index++){
7525                 // This should always succeed in the first iteration.
7526                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7530             if(buf_index+3 >= buf_size) break;
7533         if(buf_index >= next_avc) continue;
/* unescape the NAL (remove emulation-prevention bytes) into ptr */
7536         hx = h->thread_context[context_count];
7538         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7539         if (ptr==NULL || dst_length < 0){
/* strip trailing zero bytes and compute the exact RBSP bit length */
7542         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7544         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7546         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7547             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7550         if (h->is_avc && (nalsize != consumed) && nalsize){
7551             av_log(h->s.avctx, AV_LOG_DEBUG, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7554         buf_index += consumed;
/* optionally skip non-reference NALs when hurrying / frame skipping */
7556         if( (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7557            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
/* --- dispatch by NAL type (case labels partly elided in this chunk) --- */
7562         switch(hx->nal_unit_type){
7564             if (h->nal_unit_type != NAL_IDR_SLICE) {
7565                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7568             idr(h); //FIXME ensure we don't loose some frames if there is reordering
/* regular (non-partitioned) slice */
7570             init_get_bits(&hx->s.gb, ptr, bit_length);
7572             hx->inter_gb_ptr= &hx->s.gb;
7573             hx->s.data_partitioning = 0;
7575             if((err = decode_slice_header(hx, h)))
7578             if (s->avctx->hwaccel && h->current_slice == 1) {
7579                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7583             s->current_picture_ptr->key_frame |=
7584                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7585                     (h->sei_recovery_frame_cnt >= 0);
/* queue the slice for decoding unless skip settings discard it */
7586             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7587                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7588                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7589                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7590                && avctx->skip_frame < AVDISCARD_ALL){
7591                 if(avctx->hwaccel) {
7592                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
/* VDPAU path: hand the raw slice (re-prefixed with a start code) to the hw */
7595                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7596                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7597                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7598                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
/* DPA: slice header of a partitioned slice */
7604             init_get_bits(&hx->s.gb, ptr, bit_length);
7606             hx->inter_gb_ptr= NULL;
7608             if ((err = decode_slice_header(hx, h)) < 0)
7611             hx->s.data_partitioning = 1;
/* DPB: intra residual partition */
7615             init_get_bits(&hx->intra_gb, ptr, bit_length);
7616             hx->intra_gb_ptr= &hx->intra_gb;
/* DPC: inter residual partition */
7619             init_get_bits(&hx->inter_gb, ptr, bit_length);
7620             hx->inter_gb_ptr= &hx->inter_gb;
7622             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7623                && s->context_initialized
7625                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7626                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7627                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7628                && avctx->skip_frame < AVDISCARD_ALL)
/* SEI messages */
7632             init_get_bits(&s->gb, ptr, bit_length);
7633             ff_h264_decode_sei(h);
/* SPS */
7636             init_get_bits(&s->gb, ptr, bit_length);
7637             ff_h264_decode_seq_parameter_set(h);
7639             if(s->flags& CODEC_FLAG_LOW_DELAY)
7642             if(avctx->has_b_frames < 2)
7643                 avctx->has_b_frames= !s->low_delay;
/* PPS */
7646             init_get_bits(&s->gb, ptr, bit_length);
7648             ff_h264_decode_picture_parameter_set(h, bit_length);
7652         case NAL_END_SEQUENCE:
7653         case NAL_END_STREAM:
7654         case NAL_FILLER_DATA:
7656         case NAL_AUXILIARY_SLICE:
7659             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
/* flush a full batch of queued slice contexts */
7662         if(context_count == h->max_contexts) {
7663             execute_decode_slices(h, context_count);
7668             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7670             /* Slice could not be decoded in parallel mode, copy down
7671              * NAL unit stuff to context 0 and restart. Note that
7672              * rbsp_buffer is not transferred, but since we no longer
7673              * run in parallel mode this should not be an issue. */
7674             h->nal_unit_type = hx->nal_unit_type;
7675             h->nal_ref_idc   = hx->nal_ref_idc;
/* decode any remaining queued slices before returning */
7681         execute_decode_slices(h, context_count);
7686 * returns the number of bytes consumed for building the current frame
/**
 * Return the number of input bytes consumed for the current frame,
 * clamped so the caller makes progress: a zero position is bumped to 1,
 * and a position within 10 bytes of the end is rounded up to buf_size.
 * NOTE(review): the final return statement is elided in this chunk.
 */
7688 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7689     if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7690     if(pos+10>buf_size) pos=buf_size; // oops ;)
/**
 * AVCodec.decode callback: decode one packet of H.264 data and, when a
 * complete picture is available, output it in display order via *pict.
 * Handles end-of-stream flushing, one-time avcC extradata parsing,
 * interlacing signalling derived from picture-timing SEI, and B-frame
 * reordering through the delayed_pic[] queue.
 *
 * NOTE(review): this chunk is elided — several branches, error paths and
 * intermediate statements are not visible here.
 */
7695 static int decode_frame(AVCodecContext *avctx,
7696                         void *data, int *data_size,
7699     const uint8_t *buf = avpkt->data;
7700     int buf_size = avpkt->size;
7701     H264Context *h = avctx->priv_data;
7702     MpegEncContext *s = &h->s;
7703     AVFrame *pict = data;
7706     s->flags= avctx->flags;
7707     s->flags2= avctx->flags2;
7709    /* end of stream, output what is still in the buffers */
7710     if (buf_size == 0) {
7714 //FIXME factorize this with the output code below
/* pick the lowest-poc delayed picture up to the next keyframe/mmco reset */
7715         out = h->delayed_pic[0];
7717         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7718             if(h->delayed_pic[i]->poc < out->poc){
7719                 out = h->delayed_pic[i];
/* remove the chosen picture from the delayed queue */
7723             for(i=out_idx; h->delayed_pic[i]; i++)
7724                 h->delayed_pic[i] = h->delayed_pic[i+1];
7727             *data_size = sizeof(AVFrame);
7728             *pict= *(AVFrame*)out;
/* one-time parse of avcC extradata (SPS/PPS with 2-byte length prefixes) */
7734     if(h->is_avc && !h->got_avcC) {
7735         int i, cnt, nalsize;
7736         unsigned char *p = avctx->extradata;
7737         if(avctx->extradata_size < 7) {
7738             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7742             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7745         /* sps and pps in the avcC always have length coded with 2 bytes,
7746            so put a fake nal_length_size = 2 while parsing them */
7747         h->nal_length_size = 2;
7748         // Decode sps from avcC
7749         cnt = *(p+5) & 0x1f; // Number of sps
7751         for (i = 0; i < cnt; i++) {
7752             nalsize = AV_RB16(p) + 2;
7753             if(decode_nal_units(h, p, nalsize) < 0) {
7754                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7759         // Decode pps from avcC
7760         cnt = *(p++); // Number of pps
7761         for (i = 0; i < cnt; i++) {
7762             nalsize = AV_RB16(p) + 2;
7763             if(decode_nal_units(h, p, nalsize) != nalsize) {
7764                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7769         // Now store right nal length size, that will be use to parse all other nals
7770         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7771         // Do not reparse avcC
/* Annex-B extradata: feed it through the NAL parser once */
7775     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7776         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7781     buf_index=decode_nal_units(h, buf, buf_size);
7785     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7786         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7787         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
/* a picture is complete: derive output flags and reorder for display */
7791     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7792         Picture *out = s->current_picture_ptr;
7793         Picture *cur = s->current_picture_ptr;
7794         int i, pics, out_of_order, out_idx;
7798         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7799             /* Wait for second field. */
7803         cur->interlaced_frame = 0;
7804         cur->repeat_pict = 0;
7806         /* Signal interlacing information externally. */
7807         /* Prioritize picture timing SEI information over used decoding process if it exists. */
7809         if(h->sps.pic_struct_present_flag){
7810             switch (h->sei_pic_struct)
7812             case SEI_PIC_STRUCT_FRAME:
7814             case SEI_PIC_STRUCT_TOP_FIELD:
7815             case SEI_PIC_STRUCT_BOTTOM_FIELD:
7816                 cur->interlaced_frame = 1;
7818             case SEI_PIC_STRUCT_TOP_BOTTOM:
7819             case SEI_PIC_STRUCT_BOTTOM_TOP:
7820                 if (FIELD_OR_MBAFF_PICTURE)
7821                     cur->interlaced_frame = 1;
7823                     // try to flag soft telecine progressive
7824                     cur->interlaced_frame = h->prev_interlaced_frame;
7826             case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7827             case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7828                 // Signal the possibility of telecined film externally (pic_struct 5,6)
7829                 // From these hints, let the applications decide if they apply deinterlacing.
7830                 cur->repeat_pict = 1;
7832             case SEI_PIC_STRUCT_FRAME_DOUBLING:
7833                 // Force progressive here, as doubling interlaced frame is a bad idea.
7834                 cur->repeat_pict = 2;
7836             case SEI_PIC_STRUCT_FRAME_TRIPLING:
7837                 cur->repeat_pict = 4;
7841             if ((h->sei_ct_type & 3) && h->sei_pic_struct <= SEI_PIC_STRUCT_BOTTOM_TOP)
7842                 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7844             /* Derive interlacing flag from used decoding process. */
7845             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7847         h->prev_interlaced_frame = cur->interlaced_frame;
7849         if (cur->field_poc[0] != cur->field_poc[1]){
7850             /* Derive top_field_first from field pocs. */
7851             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7853             if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7854                 /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7855                 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7856                   || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7857                     cur->top_field_first = 1;
7859                     cur->top_field_first = 0;
7861                 /* Most likely progressive */
7862                 cur->top_field_first = 0;
7866         //FIXME do something with unavailable reference frames
7868         /* Sort B-frames into display order */
7870         if(h->sps.bitstream_restriction_flag
7871            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7872             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7876         if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7877            && !h->sps.bitstream_restriction_flag){
7878             s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
/* append the current picture to the delayed-output queue */
7883         while(h->delayed_pic[pics]) pics++;
7885         assert(pics <= MAX_DELAYED_PIC_COUNT);
7887         h->delayed_pic[pics++] = cur;
7888         if(cur->reference == 0)
7889             cur->reference = DELAYED_PIC_REF;
/* select the lowest-poc candidate for output */
7891         out = h->delayed_pic[0];
7893         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7894             if(h->delayed_pic[i]->poc < out->poc){
7895                 out = h->delayed_pic[i];
7898         if(s->avctx->has_b_frames == 0 && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset))
7899             h->outputed_poc= INT_MIN;
7900         out_of_order = out->poc < h->outputed_poc;
/* grow has_b_frames if reordering evidence shows the delay is too small */
7902         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7904         else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7906                  ((h->outputed_poc != INT_MIN && out->poc > h->outputed_poc + 2)
7907                   || cur->pict_type == FF_B_TYPE)))
7910             s->avctx->has_b_frames++;
/* emit (or drop, if out of order) the selected picture */
7913         if(out_of_order || pics > s->avctx->has_b_frames){
7914             out->reference &= ~DELAYED_PIC_REF;
7915             for(i=out_idx; h->delayed_pic[i]; i++)
7916                 h->delayed_pic[i] = h->delayed_pic[i+1];
7918         if(!out_of_order && pics > s->avctx->has_b_frames){
7919             *data_size = sizeof(AVFrame);
7921             if(out_idx==0 && h->delayed_pic[0] && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset)) {
7922                 h->outputed_poc = INT_MIN;
7924                 h->outputed_poc = out->poc;
7925             *pict= *(AVFrame*)out;
7927             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7932     assert(pict->data[0] || !*data_size);
7933     ff_print_debug_info(s, pict);
7934 //printf("out %d\n", (int)pict->data[0]);
7936     return get_consumed_bytes(s, buf_index, buf_size);
/**
 * Fill h->mb_avail[] with availability flags for the macroblocks
 * neighbouring the current one (same slice required):
 * [0]=top-left, [1]=top, [2]=top-right, [3]=left; [4]/[5] are fixed.
 * NOTE(review): this chunk is elided — the guard around the top-row checks
 * and the closing brace are not visible here.
 */
7939 static inline void fill_mb_avail(H264Context *h){
7940     MpegEncContext * const s = &h->s;
7941     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7944         h->mb_avail[0]= s->mb_x                  && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7945         h->mb_avail[1]=                             h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7946         h->mb_avail[2]= s->mb_x+1 < s->mb_width  && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7952     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7953     h->mb_avail[4]= 1; //FIXME move out
7954     h->mb_avail[5]= 0; //FIXME move out
/* Self-test harness (presumably compiled under a TEST-style ifdef).
 * Exercises unsigned/signed exp-Golomb coding round-trips, the 4x4
 * (I)DCT, the quantizer and the NAL escaping/unescaping layer.
 * NOTE(review): heavily elided in this chunk — the main() declaration,
 * many statements and all closing braces are not visible here. */
7962 #define SIZE (COUNT*40)
7968 //    int int_temp[10000];
7970     AVCodecContext avctx;
7972     dsputil_init(&dsp, &avctx);
/* round-trip test: write COUNT unsigned exp-Golomb codes... */
7974     init_put_bits(&pb, temp, SIZE);
7975     printf("testing unsigned exp golomb\n");
7976     for(i=0; i<COUNT; i++){
7978         set_ue_golomb(&pb, i);
7979         STOP_TIMER("set_ue_golomb");
7981     flush_put_bits(&pb);
/* ...then read them back and verify each value */
7983     init_get_bits(&gb, temp, 8*SIZE);
7984     for(i=0; i<COUNT; i++){
7987         s= show_bits(&gb, 24);
7990         j= get_ue_golomb(&gb);
7992             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7995         STOP_TIMER("get_ue_golomb");
/* same round-trip for signed exp-Golomb, centered around 0 */
7999     init_put_bits(&pb, temp, SIZE);
8000     printf("testing signed exp golomb\n");
8001     for(i=0; i<COUNT; i++){
8003         set_se_golomb(&pb, i - COUNT/2);
8004         STOP_TIMER("set_se_golomb");
8006     flush_put_bits(&pb);
8008     init_get_bits(&gb, temp, 8*SIZE);
8009     for(i=0; i<COUNT; i++){
8012         s= show_bits(&gb, 24);
8015         j= get_se_golomb(&gb);
8016         if(j != i - COUNT/2){
8017             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8020         STOP_TIMER("get_se_golomb");
/* forward DCT + quant/dequant + IDCT round-trip on random 4x4 blocks,
 * accumulating the reconstruction error */
8024     printf("testing 4x4 (I)DCT\n");
8027         uint8_t src[16], ref[16];
8028         uint64_t error= 0, max_error=0;
8030         for(i=0; i<COUNT; i++){
8032 //            printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8033             for(j=0; j<16; j++){
8034                 ref[j]= random()%255;
8035                 src[j]= random()%255;
8038             h264_diff_dct_c(block, src, ref, 4);
8041             for(j=0; j<16; j++){
8042 //                printf("%d ", block[j]);
8043                 block[j]= block[j]*4;
8044                 if(j&1) block[j]= (block[j]*4 + 2)/5;
8045                 if(j&4) block[j]= (block[j]*4 + 2)/5;
8049             s->dsp.h264_idct_add(ref, block, 4);
8050 /*            for(j=0; j<16; j++){
8051                 printf("%d ", ref[j]);
8055             for(j=0; j<16; j++){
8056                 int diff= FFABS(src[j] - ref[j]);
8059                 max_error= FFMAX(max_error, diff);
8062         printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8063         printf("testing quantizer\n");
8064         for(qp=0; qp<52; qp++){
8066                 src1_block[i]= src2_block[i]= random()%255;
/* NAL layer: encode a random bitstream with forced zero runs, then
 * decode it back and compare content, length and consumed bytes */
8069         printf("Testing NAL layer\n");
8071         uint8_t bitstream[COUNT];
8072         uint8_t nal[COUNT*2];
8074         memset(&h, 0, sizeof(H264Context));
8076         for(i=0; i<COUNT; i++){
8084             for(j=0; j<COUNT; j++){
8085                 bitstream[j]= (random() % 255) + 1;
8088             for(j=0; j<zeros; j++){
8089                 int pos= random() % COUNT;
8090                 while(bitstream[pos] == 0){
8099             nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8101                 printf("encoding failed\n");
8105             out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8109             if(out_length != COUNT){
8110                 printf("incorrect length %d %d\n", out_length, COUNT);
8114             if(consumed != nal_length){
8115                 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8119             if(memcmp(bitstream, out, COUNT)){
8120                 printf("mismatch\n");
8126     printf("Testing RBSP\n");
/**
 * Free all H264Context-owned allocations: the decoder tables plus every
 * stored SPS and PPS buffer (av_freep also NULLs the slots).
 * NOTE(review): the opening/closing braces and the loop index declaration
 * are elided in this chunk.
 */
8134 av_cold void ff_h264_free_context(H264Context *h)
8138     free_tables(h); //FIXME cleanup init stuff perhaps
8140     for(i = 0; i < MAX_SPS_COUNT; i++)
8141         av_freep(h->sps_buffers + i);
8143     for(i = 0; i < MAX_PPS_COUNT; i++)
8144         av_freep(h->pps_buffers + i);
/**
 * AVCodec.close callback: release the H.264 decoder state via
 * ff_h264_free_context().
 * NOTE(review): further teardown and the return are elided in this chunk.
 */
8147 static av_cold int decode_end(AVCodecContext *avctx)
8149     H264Context *h = avctx->priv_data;
8150     MpegEncContext *s = &h->s;
8152     ff_h264_free_context(h);
8156 //    memset(h, 0, sizeof(H264Context));
/* Registration entry for the software H.264 decoder.
 * NOTE(review): several positional fields (name, type, codec id, init/close/
 * decode callbacks) are elided in this chunk. */
8162 AVCodec h264_decoder = {
8166     sizeof(H264Context),
8171     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8173     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8174     .pix_fmts= ff_hwaccel_pixfmt_list_420,
/* Registration entry for the VDPAU-accelerated H.264 decoder variant;
 * only compiled in when the VDPAU decoder is configured.
 * NOTE(review): several positional fields are elided in this chunk. */
8177 #if CONFIG_H264_VDPAU_DECODER
8178 AVCodec h264_vdpau_decoder = {
8182     sizeof(H264Context),
8187     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8189     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8190     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_VDPAU_H264, PIX_FMT_NONE},
8194 #if CONFIG_SVQ3_DECODER