git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value stored in Picture.reference for a picture that is not a reference
  50  * picture but is still being held back for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
  /*
   * VLC objects together with their statically allocated backing tables.
   * Each *_size constant repeats the number of entries of the matching table
   * (the first array dimension, or for coeff_token the four parts that are
   * summed up in the dimension of coeff_token_vlc_tables).
   * NOTE(review): presumably these serve the CAVLC residual decoder; the code
   * that initializes and reads them is not visible in this part of the file.
   */
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61
     /* 15 VLCs, each backed by its own 512-entry table */
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  64 static const int total_zeros_vlc_tables_size = 512;
  65
     /* 3 VLCs, each backed by its own 8-entry table */
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69
     /* 6 VLCs, each backed by its own 8-entry table */
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2];
  72 static const int run_vlc_tables_size = 8;
  73
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77
  /*
   * Prototypes of file-local (static) functions, declared here ahead of their
   * definitions; the definitions are not visible in this part of the file.
   */
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  /** rem6[q] == q % 6 for q = 0..51. */
  static const uint8_t rem6[52] = {
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  95
  /** div6[q] == q / 6 for q = 0..51. */
  static const uint8_t div6[52] = {
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  99
 /**
  * Index sets used to pick entries of the left neighbour(s) in fill_caches().
  * Row 0 is the default; row 3 is selected when the current macroblock is
  * field coded and the left pair is not; rows 2 and 1 are selected for the
  * top and bottom macroblock of a frame coded pair next to a field coded
  * left pair.
  */
 static const uint8_t left_block_options[4][8] = {
     { 0, 1, 2, 3,   7, 10,  8, 11 }, /* row 0 */
     { 2, 2, 3, 3,   8, 11,  8, 11 }, /* row 1 */
     { 0, 0, 1, 1,   7, 10,  7, 10 }, /* row 2 */
     { 0, 2, 0, 2,   7, 10,  7, 10 }, /* row 3 */
 };
 106
 /*
  * Lookup table with 7 sub-tables of (1 << LEVEL_TAB_BITS) entries, each
  * entry holding two int8_t values.
  * NOTE(review): presumably a precomputed CAVLC level decoding table indexed
  * by the next LEVEL_TAB_BITS bits of the bitstream; its initialization is not
  * visible in this part of the file, so confirm before relying on this.
  */
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
 /**
  * Fills the per-macroblock neighbour caches of the H264Context for the
  * macroblock at h->mb_xy.
  *
  * Determines the addresses of the top, top-left, top-right and left
  * neighbours (with extra handling when FRAME_MBAFF is set), stores the top
  * and left ones in h->top_mb_xy / h->left_mb_xy[], and then loads from the
  * per-picture tables: the intra sample availability masks and the intra4x4
  * prediction mode cache (intra macroblocks, decoding path only),
  * non_zero_count_cache, the top/left cbp (CABAC only), the mv/ref caches,
  * the mvd and direct caches (CABAC only) and h->neighbor_transform_size.
  *
  * @param mb_type     type flags of the current macroblock
  * @param for_deblock non-zero when the caches are filled for deblocking.
  *                    In that case the function may return early without
  *                    touching anything, neighbours are accepted whenever
  *                    their slice_table entry is below 0xFFFF instead of
  *                    having to belong to the current slice, the intra
  *                    prediction setup is skipped, and only the top and left
  *                    motion data is loaded.
  */
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
         /* Deblocking shortcut: return without refreshing the caches when
          * slice_num is 1 or the top neighbour has the same slice_table entry
          * as the current macroblock, unless the frame is MBAFF. */
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
         /* Default neighbour addresses; adjusted below for MBAFF frames. */
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
             /* Neighbour selection depends on whether the current macroblock
              * and the neighbouring macroblock pairs are interlaced. */
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
         /* Export the resolved top / left neighbour addresses. */
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
             /* Deblocking path: the top-left / top-right types are forced to 0
              * (their use further down is skipped for for_deblock); top and
              * left count as present when their slice_table entry is below
              * 0xFFFF. */
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values were changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
             /* Decoding path: a neighbour is only usable when its slice_table
              * entry equals the current slice number; otherwise its type is 0. */
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
 202     if(IS_INTRA(mb_type)){
             /* With constrained_intra_pred only intra neighbours pass the mask;
              * otherwise every non-zero neighbour type does. */
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
 246         if(IS_INTRA4x4(mb_type)){
                 /* Row 0 of the cache receives the modes stored by the top
                  * neighbour; if that neighbour is not intra4x4 the entries
                  * become -1 (neighbour rejected by type_mask) or 2. */
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
                 /* Same for column 3 of the cache, two rows per left neighbour. */
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
             /* No top neighbour: every top entry becomes 64, or 0 for a
              * non-intra macroblock when CABAC is in use. */
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
                 /* Same fallback values as for a missing top neighbour. */
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
             /* Bits 1 and 3 of left_cbp are taken from the cbp bit of the left
              * neighbour selected through left_block[0] / left_block[2]. */
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
                 /* A list the macroblock does not use is skipped, except for
                  * direct macroblocks or when the deblocking filter is on. */
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
                 /* Top neighbour: copy the motion vectors / reference indices
                  * of its last row, or clear them and mark the references as
                  * LIST_NOT_USED / PART_NOT_AVAILABLE. */
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
                 /* Left neighbours: two cache rows per entry of left_xy[]. */
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
                 /* Deblocking, and direct macroblocks without spatial MV
                  * prediction in non-MBAFF frames, stop after top and left. */
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
                         /* B slices: clear the direct flags of the current
                          * macroblock, then load those of top and left. */
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
 510             if(FRAME_MBAFF){
                     /* MAP_MVS applies MAP_F2F to every neighbour slot of the
                      * caches. MAP_F2F is defined twice below: when MB_FIELD
                      * is set it rescales the entries of non-interlaced
                      * neighbours (ref <<= 1, vertical mv/mvd halved),
                      * otherwise it rescales the entries of interlaced
                      * neighbours (ref >>= 1, vertical mv/mvd doubled). */
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
         /* Number (0..2) of the top and left[0] neighbours whose type has the
          * IS_8x8DCT flag set. */
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
/**
 * Fetches the motion vector and reference index of the diagonal neighbour
 * used for motion vector prediction.
 * Normally this is the top-right neighbour of the partition; if that one is
 * marked PART_NOT_AVAILABLE the top-left neighbour is used instead.
 * With FRAME_MBAFF the neighbour may be read directly from the current
 * picture (see SET_DIAG_MV), with its vertical mv component and ref index
 * adjusted.
 * @param C set to point at the selected motion vector
 * @param i cache position (scan8 value) of the partition's first block
 * @param part_width width of the partition in 4x4 blocks
 * @return the reference index belonging to *C; the MBAFF paths may return
 *         LIST_NOT_USED
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* scratch slot in the cache: zeroed here, filled by SET_DIAG_MV */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the mv and ref of the 4x4 block at (X4,Y4) from the
 * current picture, applies MV_OP to the vertical mv component and REF_OP to
 * the ref index, stores the mv in the scratch slot and RETURNS from
 * fetch_diagonal_mv (LIST_NOT_USED if that macroblock does not use list). */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right missing, partition in the leftmost cache column ((i&7)==4)
         * and left neighbour present: the fallback lies in the left macroblock */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: take the neighbour straight from the caches */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        /* fall back to the top-left neighbour */
        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 997
 998     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 999         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1000             int cur_poc = s->current_picture_ptr->poc;
1001             int *col_poc = h->ref_list[1]->field_poc;
1002             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1003             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1004             b8_stride = 0;
1005         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1006             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1007             mb_xy += s->mb_stride*fieldoff;
1008         }
1009         goto single_col;
1010     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1011         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1012             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1013             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1014             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1015             b8_stride *= 3;
1016             b4_stride *= 6;
1017             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1018             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1019                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1020                 && !is_b8x8){
1021                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1022                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1023             }else{
1024                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1025                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1026             }
1027         }else{                                           //     AFR/FR    -> AFR/FR
1028 single_col:
1029             mb_type_col[0] =
1030             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1031             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1032                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1033                 * so we know exactly what block size to use */
1034                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1035                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1036             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1037                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1038                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1039             }else{
1040                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }
1043         }
1044     }
1045
1046     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1047     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1048     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1049     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1050     if(!b8_stride){
1051         if(s->mb_y&1){
1052             l1ref0 += h->b8_stride;
1053             l1ref1 += h->b8_stride;
1054             l1mv0  +=  2*b4_stride;
1055             l1mv1  +=  2*b4_stride;
1056         }
1057     }
1058
1059     if(h->direct_spatial_mv_pred){
1060         int ref[2];
1061         int mv[2][2];
1062         int list;
1063
1064         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1065
1066         /* ref = min(neighbors) */
1067         for(list=0; list<2; list++){
1068             int refa = h->ref_cache[list][scan8[0] - 1];
1069             int refb = h->ref_cache[list][scan8[0] - 8];
1070             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1071             if(refc == PART_NOT_AVAILABLE)
1072                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1073             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1074             if(ref[list] < 0)
1075                 ref[list] = -1;
1076         }
1077
1078         if(ref[0] < 0 && ref[1] < 0){
1079             ref[0] = ref[1] = 0;
1080             mv[0][0] = mv[0][1] =
1081             mv[1][0] = mv[1][1] = 0;
1082         }else{
1083             for(list=0; list<2; list++){
1084                 if(ref[list] >= 0)
1085                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1086                 else
1087                     mv[list][0] = mv[list][1] = 0;
1088             }
1089         }
1090
1091         if(ref[1] < 0){
1092             if(!is_b8x8)
1093                 *mb_type &= ~MB_TYPE_L1;
1094             sub_mb_type &= ~MB_TYPE_L1;
1095         }else if(ref[0] < 0){
1096             if(!is_b8x8)
1097                 *mb_type &= ~MB_TYPE_L0;
1098             sub_mb_type &= ~MB_TYPE_L0;
1099         }
1100
1101         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1102             for(i8=0; i8<4; i8++){
1103                 int x8 = i8&1;
1104                 int y8 = i8>>1;
1105                 int xy8 = x8+y8*b8_stride;
1106                 int xy4 = 3*x8+y8*b4_stride;
1107                 int a=0, b=0;
1108
1109                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1110                     continue;
1111                 h->sub_mb_type[i8] = sub_mb_type;
1112
1113                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1114                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1115                 if(!IS_INTRA(mb_type_col[y8])
1116                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1117                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1118                     if(ref[0] > 0)
1119                         a= pack16to32(mv[0][0],mv[0][1]);
1120                     if(ref[1] > 0)
1121                         b= pack16to32(mv[1][0],mv[1][1]);
1122                 }else{
1123                     a= pack16to32(mv[0][0],mv[0][1]);
1124                     b= pack16to32(mv[1][0],mv[1][1]);
1125                 }
1126                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1127                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1128             }
1129         }else if(IS_16X16(*mb_type)){
1130             int a=0, b=0;
1131
1132             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1133             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1134             if(!IS_INTRA(mb_type_col[0])
1135                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1136                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1137                        && (h->x264_build>33 || !h->x264_build)))){
1138                 if(ref[0] > 0)
1139                     a= pack16to32(mv[0][0],mv[0][1]);
1140                 if(ref[1] > 0)
1141                     b= pack16to32(mv[1][0],mv[1][1]);
1142             }else{
1143                 a= pack16to32(mv[0][0],mv[0][1]);
1144                 b= pack16to32(mv[1][0],mv[1][1]);
1145             }
1146             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1147             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1148         }else{
1149             for(i8=0; i8<4; i8++){
1150                 const int x8 = i8&1;
1151                 const int y8 = i8>>1;
1152
1153                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1154                     continue;
1155                 h->sub_mb_type[i8] = sub_mb_type;
1156
1157                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1158                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1159                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1160                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1161
1162                 /* col_zero_flag */
1163                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1164                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1165                                                   && (h->x264_build>33 || !h->x264_build)))){
1166                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1167                     if(IS_SUB_8X8(sub_mb_type)){
1168                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1169                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1170                             if(ref[0] == 0)
1171                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1172                             if(ref[1] == 0)
1173                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                         }
1175                     }else
1176                     for(i4=0; i4<4; i4++){
1177                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1178                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1179                             if(ref[0] == 0)
1180                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1181                             if(ref[1] == 0)
1182                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1183                         }
1184                     }
1185                 }
1186             }
1187         }
1188     }else{ /* direct temporal mv pred */
1189         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1190         const int *dist_scale_factor = h->dist_scale_factor;
1191         int ref_offset= 0;
1192
1193         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1194             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1195             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1196             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1197         }
1198         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1199             ref_offset += 16;
1200
1201         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1202             /* FIXME assumes direct_8x8_inference == 1 */
1203             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1204
1205             for(i8=0; i8<4; i8++){
1206                 const int x8 = i8&1;
1207                 const int y8 = i8>>1;
1208                 int ref0, scale;
1209                 const int16_t (*l1mv)[2]= l1mv0;
1210
1211                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1212                     continue;
1213                 h->sub_mb_type[i8] = sub_mb_type;
1214
1215                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1216                 if(IS_INTRA(mb_type_col[y8])){
1217                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1220                     continue;
1221                 }
1222
1223                 ref0 = l1ref0[x8 + y8*b8_stride];
1224                 if(ref0 >= 0)
1225                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1226                 else{
1227                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1228                     l1mv= l1mv1;
1229                 }
1230                 scale = dist_scale_factor[ref0];
1231                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1232
1233                 {
1234                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1235                     int my_col = (mv_col[1]<<y_shift)/2;
1236                     int mx = (scale * mv_col[0] + 128) >> 8;
1237                     int my = (scale * my_col + 128) >> 8;
1238                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1239                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1240                 }
1241             }
1242             return;
1243         }
1244
1245         /* one-to-one mv scaling */
1246
1247         if(IS_16X16(*mb_type)){
1248             int ref, mv0, mv1;
1249
1250             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1251             if(IS_INTRA(mb_type_col[0])){
1252                 ref=mv0=mv1=0;
1253             }else{
1254                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1255                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1256                 const int scale = dist_scale_factor[ref0];
1257                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1258                 int mv_l0[2];
1259                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1260                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1261                 ref= ref0;
1262                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1263                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1264             }
1265             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1266             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1267             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1268         }else{
1269             for(i8=0; i8<4; i8++){
1270                 const int x8 = i8&1;
1271                 const int y8 = i8>>1;
1272                 int ref0, scale;
1273                 const int16_t (*l1mv)[2]= l1mv0;
1274
1275                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1276                     continue;
1277                 h->sub_mb_type[i8] = sub_mb_type;
1278                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1279                 if(IS_INTRA(mb_type_col[0])){
1280                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1283                     continue;
1284                 }
1285
1286                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1287                 if(ref0 >= 0)
1288                     ref0 = map_col_to_list0[0][ref0];
1289                 else{
1290                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1291                     l1mv= l1mv1;
1292                 }
1293                 scale = dist_scale_factor[ref0];
1294
1295                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1296                 if(IS_SUB_8X8(sub_mb_type)){
1297                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1298                     int mx = (scale * mv_col[0] + 128) >> 8;
1299                     int my = (scale * mv_col[1] + 128) >> 8;
1300                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1301                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1302                 }else
1303                 for(i4=0; i4<4; i4++){
1304                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1305                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1306                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1307                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1308                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1309                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1310                 }
1311             }
1312         }
1313     }
1314 }
1315
1316 static inline void write_back_motion(H264Context *h, int mb_type){
1317     MpegEncContext * const s = &h->s;
1318     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1319     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1320     int list;
1321
1322     if(!USES_LIST(mb_type, 0))
1323         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1324
1325     for(list=0; list<h->list_count; list++){
1326         int y;
1327         if(!USES_LIST(mb_type, list))
1328             continue;
1329
1330         for(y=0; y<4; y++){
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1332             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1333         }
1334         if( h->pps.cabac ) {
1335             if(IS_SKIP(mb_type))
1336                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1337             else
1338             for(y=0; y<4; y++){
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1340                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1341             }
1342         }
1343
1344         {
1345             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1346             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1347             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1348             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1349             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1350         }
1351     }
1352
1353     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1354         if(IS_8X8(mb_type)){
1355             uint8_t *direct_table = &h->direct_table[b8_xy];
1356             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1357             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1358             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1359         }
1360     }
1361 }
1362
1363 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1364     int i, si, di;
1365     uint8_t *dst;
1366     int bufidx;
1367
1368 //    src[0]&0x80;                //forbidden bit
1369     h->nal_ref_idc= src[0]>>5;
1370     h->nal_unit_type= src[0]&0x1F;
1371
1372     src++; length--;
1373 #if 0
1374     for(i=0; i<length; i++)
1375         printf("%2X ", src[i]);
1376 #endif
1377
1378 #if HAVE_FAST_UNALIGNED
1379 # if HAVE_FAST_64BIT
1380 #   define RS 7
1381     for(i=0; i+1<length; i+=9){
1382         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1383 # else
1384 #   define RS 3
1385     for(i=0; i+1<length; i+=5){
1386         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1387 # endif
1388             continue;
1389         if(i>0 && !src[i]) i--;
1390         while(src[i]) i++;
1391 #else
1392 #   define RS 0
1393     for(i=0; i+1<length; i+=2){
1394         if(src[i]) continue;
1395         if(i>0 && src[i-1]==0) i--;
1396 #endif
1397         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1398             if(src[i+2]!=3){
1399                 /* startcode, so we must be past the end */
1400                 length=i;
1401             }
1402             break;
1403         }
1404         i-= RS;
1405     }
1406
1407     if(i>=length-1){ //no escaped 0
1408         *dst_length= length;
1409         *consumed= length+1; //+1 for the header
1410         return src;
1411     }
1412
1413     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1414     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1415     dst= h->rbsp_buffer[bufidx];
1416
1417     if (dst == NULL){
1418         return NULL;
1419     }
1420
1421 //printf("decoding esc\n");
1422     memcpy(dst, src, i);
1423     si=di=i;
1424     while(si+2<length){
1425         //remove escapes (very rare 1:2^22)
1426         if(src[si+2]>3){
1427             dst[di++]= src[si++];
1428             dst[di++]= src[si++];
1429         }else if(src[si]==0 && src[si+1]==0){
1430             if(src[si+2]==3){ //escape
1431                 dst[di++]= 0;
1432                 dst[di++]= 0;
1433                 si+=3;
1434                 continue;
1435             }else //next start code
1436                 goto nsc;
1437         }
1438
1439         dst[di++]= src[si++];
1440     }
1441     while(si<length)
1442         dst[di++]= src[si++];
1443 nsc:
1444
1445     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1446
1447     *dst_length= di;
1448     *consumed= si + 1;//+1 for the header
1449 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1450     return dst;
1451 }
1452
1453 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1454     int v= *src;
1455     int r;
1456
1457     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1458
1459     for(r=1; r<9; r++){
1460         if(v&1) return r;
1461         v>>=1;
1462     }
1463     return 0;
1464 }
1465
1466 /**
1467  * IDCT transforms the 16 dc values and dequantizes them.
1468  * @param qp quantization parameter
1469  */
1470 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1471 #define stride 16
1472     int i;
1473     int temp[16]; //FIXME check if this is a good idea
1474     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1475     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1476
1477 //memset(block, 64, 2*256);
1478 //return;
1479     for(i=0; i<4; i++){
1480         const int offset= y_offset[i];
1481         const int z0= block[offset+stride*0] + block[offset+stride*4];
1482         const int z1= block[offset+stride*0] - block[offset+stride*4];
1483         const int z2= block[offset+stride*1] - block[offset+stride*5];
1484         const int z3= block[offset+stride*1] + block[offset+stride*5];
1485
1486         temp[4*i+0]= z0+z3;
1487         temp[4*i+1]= z1+z2;
1488         temp[4*i+2]= z1-z2;
1489         temp[4*i+3]= z0-z3;
1490     }
1491
1492     for(i=0; i<4; i++){
1493         const int offset= x_offset[i];
1494         const int z0= temp[4*0+i] + temp[4*2+i];
1495         const int z1= temp[4*0+i] - temp[4*2+i];
1496         const int z2= temp[4*1+i] - temp[4*3+i];
1497         const int z3= temp[4*1+i] + temp[4*3+i];
1498
1499         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1500         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1501         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1502         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1503     }
1504 }
1505
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the stride macro being defined as 16.
 * @param block coefficient buffer, transformed in place; every output is
 *              halved with an arithmetic shift
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies on four input values, results kept in temp */
    for (i = 0; i < 4; i++) {
        const DCTELEM *in = block + y_offset[i];
        int *out = temp + 4*i;
        const int sum04  = in[stride*0] + in[stride*4];
        const int diff04 = in[stride*0] - in[stride*4];
        const int diff15 = in[stride*1] - in[stride*5];
        const int sum15  = in[stride*1] + in[stride*5];

        out[0] = sum04  + sum15;
        out[1] = diff04 + diff15;
        out[2] = diff04 - diff15;
        out[3] = sum04  - sum15;
    }

    /* second pass: butterflies across temp, results halved */
    for (i = 0; i < 4; i++) {
        DCTELEM *out = block + x_offset[i];
        const int *in = temp + i;
        const int sum02  = in[4*0] + in[4*2];
        const int diff02 = in[4*0] - in[4*2];
        const int diff13 = in[4*1] - in[4*3];
        const int sum13  = in[4*1] + in[4*3];

        out[stride*0 ] = (sum02  + sum13 )>>1;
        out[stride*2 ] = (diff02 + diff13)>>1;
        out[stride*8 ] = (diff02 - diff13)>>1;
        out[stride*10] = (sum02  - sum13 )>>1;
    }
}
#endif
1545
1546 #undef xStride
1547 #undef stride
1548
1549 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1550     const int stride= 16*2;
1551     const int xStride= 16;
1552     int a,b,c,d,e;
1553
1554     a= block[stride*0 + xStride*0];
1555     b= block[stride*0 + xStride*1];
1556     c= block[stride*1 + xStride*0];
1557     d= block[stride*1 + xStride*1];
1558
1559     e= a-b;
1560     a= a+b;
1561     b= c-d;
1562     c= c+d;
1563
1564     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1565     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1566     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1567     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1568 }
1569
#if 0
/**
 * Transforms the 2x2 chroma dc values without any scaling.
 * Disabled code.
 * @param block coefficient buffer, transformed in place
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step = 16*2;
    const int col_step = 16;
    DCTELEM * const top    = block;
    DCTELEM * const bottom = block + row_step;
    const int top_sum     = top[0]    + top[col_step];
    const int top_diff    = top[0]    - top[col_step];
    const int bottom_sum  = bottom[0] + bottom[col_step];
    const int bottom_diff = bottom[0] - bottom[col_step];

    top[0]           = (top_sum  + bottom_sum );
    top[col_step]    = (top_diff + bottom_diff);
    bottom[0]        = (top_sum  - bottom_sum );
    bottom[col_step] = (top_diff - bottom_diff);
}
#endif
1592
1593 /**
1594  * gets the chroma qp.
1595  */
1596 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1597     return h->pps.chroma_qp_table[t][qscale];
1598 }
1599
1600 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1601                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1602                            int src_x_offset, int src_y_offset,
1603                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1604     MpegEncContext * const s = &h->s;
1605     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1606     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1607     const int luma_xy= (mx&3) + ((my&3)<<2);
1608     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1609     uint8_t * src_cb, * src_cr;
1610     int extra_width= h->emu_edge_width;
1611     int extra_height= h->emu_edge_height;
1612     int emu=0;
1613     const int full_mx= mx>>2;
1614     const int full_my= my>>2;
1615     const int pic_width  = 16*s->mb_width;
1616     const int pic_height = 16*s->mb_height >> MB_FIELD;
1617
1618     if(mx&7) extra_width -= 3;
1619     if(my&7) extra_height -= 3;
1620
1621     if(   full_mx < 0-extra_width
1622        || full_my < 0-extra_height
1623        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1624        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1625         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1626             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1627         emu=1;
1628     }
1629
1630     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1631     if(!square){
1632         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1633     }
1634
1635     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1636
1637     if(MB_FIELD){
1638         // chroma offset when predicting from a field of opposite parity
1639         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1640         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1641     }
1642     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1643     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1644
1645     if(emu){
1646         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1647             src_cb= s->edge_emu_buffer;
1648     }
1649     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1650
1651     if(emu){
1652         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1653             src_cr= s->edge_emu_buffer;
1654     }
1655     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1656 }
1657
1658 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1659                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1660                            int x_offset, int y_offset,
1661                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1662                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1663                            int list0, int list1){
1664     MpegEncContext * const s = &h->s;
1665     qpel_mc_func *qpix_op=  qpix_put;
1666     h264_chroma_mc_func chroma_op= chroma_put;
1667
1668     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1669     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1670     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1671     x_offset += 8*s->mb_x;
1672     y_offset += 8*(s->mb_y >> MB_FIELD);
1673
1674     if(list0){
1675         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679
1680         qpix_op=  qpix_avg;
1681         chroma_op= chroma_avg;
1682     }
1683
1684     if(list1){
1685         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689     }
1690 }
1691
1692 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1693                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1694                            int x_offset, int y_offset,
1695                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1696                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1697                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1698                            int list0, int list1){
1699     MpegEncContext * const s = &h->s;
1700
1701     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1702     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1703     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1704     x_offset += 8*s->mb_x;
1705     y_offset += 8*(s->mb_y >> MB_FIELD);
1706
1707     if(list0 && list1){
1708         /* don't optimize for luma-only case, since B-frames usually
1709          * use implicit weights => chroma too. */
1710         uint8_t *tmp_cb = s->obmc_scratchpad;
1711         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1712         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1713         int refn0 = h->ref_cache[0][ scan8[n] ];
1714         int refn1 = h->ref_cache[1][ scan8[n] ];
1715
1716         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1717                     dest_y, dest_cb, dest_cr,
1718                     x_offset, y_offset, qpix_put, chroma_put);
1719         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1720                     tmp_y, tmp_cb, tmp_cr,
1721                     x_offset, y_offset, qpix_put, chroma_put);
1722
1723         if(h->use_weight == 2){
1724             int weight0 = h->implicit_weight[refn0][refn1];
1725             int weight1 = 64 - weight0;
1726             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1727             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1729         }else{
1730             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1731                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1732                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1733             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1734                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1735                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1736             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1737                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1738                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1739         }
1740     }else{
1741         int list = list1 ? 1 : 0;
1742         int refn = h->ref_cache[list][ scan8[n] ];
1743         Picture *ref= &h->ref_list[list][refn];
1744         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1745                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1746                     qpix_put, chroma_put);
1747
1748         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1749                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1750         if(h->use_weight_chroma){
1751             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1752                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1753             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1755         }
1756     }
1757 }
1758
1759 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1760                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1761                            int x_offset, int y_offset,
1762                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1763                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1764                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1765                            int list0, int list1){
1766     if((h->use_weight==2 && list0 && list1
1767         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1768        || h->use_weight==1)
1769         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1770                          x_offset, y_offset, qpix_put, chroma_put,
1771                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1772     else
1773         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1774                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1775 }
1776
1777 static inline void prefetch_motion(H264Context *h, int list){
1778     /* fetch pixels for estimated mv 4 macroblocks ahead
1779      * optimized for 64byte cache lines */
1780     MpegEncContext * const s = &h->s;
1781     const int refn = h->ref_cache[list][scan8[0]];
1782     if(refn >= 0){
1783         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1784         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1785         uint8_t **src= h->ref_list[list][refn].data;
1786         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1787         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1788         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1789         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1790     }
1791 }
1792
1793 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1794                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1795                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1796                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1797     MpegEncContext * const s = &h->s;
1798     const int mb_xy= h->mb_xy;
1799     const int mb_type= s->current_picture.mb_type[mb_xy];
1800
1801     assert(IS_INTER(mb_type));
1802
1803     prefetch_motion(h, 0);
1804
1805     if(IS_16X16(mb_type)){
1806         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1807                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1808                 &weight_op[0], &weight_avg[0],
1809                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1810     }else if(IS_16X8(mb_type)){
1811         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1812                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1813                 &weight_op[1], &weight_avg[1],
1814                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1815         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1816                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1817                 &weight_op[1], &weight_avg[1],
1818                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1819     }else if(IS_8X16(mb_type)){
1820         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1821                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1822                 &weight_op[2], &weight_avg[2],
1823                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1824         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1825                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1826                 &weight_op[2], &weight_avg[2],
1827                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1828     }else{
1829         int i;
1830
1831         assert(IS_8X8(mb_type));
1832
1833         for(i=0; i<4; i++){
1834             const int sub_mb_type= h->sub_mb_type[i];
1835             const int n= 4*i;
1836             int x_offset= (i&1)<<2;
1837             int y_offset= (i&2)<<1;
1838
1839             if(IS_SUB_8X8(sub_mb_type)){
1840                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1841                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1842                     &weight_op[3], &weight_avg[3],
1843                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1844             }else if(IS_SUB_8X4(sub_mb_type)){
1845                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1846                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1847                     &weight_op[4], &weight_avg[4],
1848                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1849                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1850                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1851                     &weight_op[4], &weight_avg[4],
1852                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1853             }else if(IS_SUB_4X8(sub_mb_type)){
1854                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1855                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1856                     &weight_op[5], &weight_avg[5],
1857                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1858                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1859                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1860                     &weight_op[5], &weight_avg[5],
1861                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1862             }else{
1863                 int j;
1864                 assert(IS_SUB_4X4(sub_mb_type));
1865                 for(j=0; j<4; j++){
1866                     int sub_x_offset= x_offset + 2*(j&1);
1867                     int sub_y_offset= y_offset +   (j&2);
1868                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1869                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1870                         &weight_op[6], &weight_avg[6],
1871                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1872                 }
1873             }
1874         }
1875     }
1876
1877     prefetch_motion(h, 1);
1878 }
1879
1880 static av_cold void init_cavlc_level_tab(void){
1881     int suffix_length, mask;
1882     unsigned int i;
1883
1884     for(suffix_length=0; suffix_length<7; suffix_length++){
1885         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1886             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1887             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1888
1889             mask= -(level_code&1);
1890             level_code= (((2+level_code)>>1) ^ mask) - mask;
1891             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1892                 cavlc_level_tab[suffix_length][i][0]= level_code;
1893                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1894             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1895                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1896                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1897             }else{
1898                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1899                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1900             }
1901         }
1902     }
1903 }
1904
1905 static av_cold void decode_init_vlc(void){
1906     static int done = 0;
1907
1908     if (!done) {
1909         int i;
1910         int offset;
1911         done = 1;
1912
1913         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1914         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1915         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1916                  &chroma_dc_coeff_token_len [0], 1, 1,
1917                  &chroma_dc_coeff_token_bits[0], 1, 1,
1918                  INIT_VLC_USE_NEW_STATIC);
1919
1920         offset = 0;
1921         for(i=0; i<4; i++){
1922             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1923             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1924             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1925                      &coeff_token_len [i][0], 1, 1,
1926                      &coeff_token_bits[i][0], 1, 1,
1927                      INIT_VLC_USE_NEW_STATIC);
1928             offset += coeff_token_vlc_tables_size[i];
1929         }
1930         /*
1931          * This is a one time safety check to make sure that
1932          * the packed static coeff_token_vlc table sizes
1933          * were initialized correctly.
1934          */
1935         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1936
1937         for(i=0; i<3; i++){
1938             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1939             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1940             init_vlc(&chroma_dc_total_zeros_vlc[i],
1941                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1942                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1943                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1944                      INIT_VLC_USE_NEW_STATIC);
1945         }
1946         for(i=0; i<15; i++){
1947             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1948             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1949             init_vlc(&total_zeros_vlc[i],
1950                      TOTAL_ZEROS_VLC_BITS, 16,
1951                      &total_zeros_len [i][0], 1, 1,
1952                      &total_zeros_bits[i][0], 1, 1,
1953                      INIT_VLC_USE_NEW_STATIC);
1954         }
1955
1956         for(i=0; i<6; i++){
1957             run_vlc[i].table = run_vlc_tables[i];
1958             run_vlc[i].table_allocated = run_vlc_tables_size;
1959             init_vlc(&run_vlc[i],
1960                      RUN_VLC_BITS, 7,
1961                      &run_len [i][0], 1, 1,
1962                      &run_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965         run7_vlc.table = run7_vlc_table,
1966         run7_vlc.table_allocated = run7_vlc_table_size;
1967         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1968                  &run_len [6][0], 1, 1,
1969                  &run_bits[6][0], 1, 1,
1970                  INIT_VLC_USE_NEW_STATIC);
1971
1972         init_cavlc_level_tab();
1973     }
1974 }
1975
1976 static void free_tables(H264Context *h){
1977     int i;
1978     H264Context *hx;
1979     av_freep(&h->intra4x4_pred_mode);
1980     av_freep(&h->chroma_pred_mode_table);
1981     av_freep(&h->cbp_table);
1982     av_freep(&h->mvd_table[0]);
1983     av_freep(&h->mvd_table[1]);
1984     av_freep(&h->direct_table);
1985     av_freep(&h->non_zero_count);
1986     av_freep(&h->slice_table_base);
1987     h->slice_table= NULL;
1988
1989     av_freep(&h->mb2b_xy);
1990     av_freep(&h->mb2b8_xy);
1991
1992     for(i = 0; i < h->s.avctx->thread_count; i++) {
1993         hx = h->thread_context[i];
1994         if(!hx) continue;
1995         av_freep(&hx->top_borders[1]);
1996         av_freep(&hx->top_borders[0]);
1997         av_freep(&hx->s.obmc_scratchpad);
1998     }
1999 }
2000
2001 static void init_dequant8_coeff_table(H264Context *h){
2002     int i,q,x;
2003     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2004     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2005     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2006
2007     for(i=0; i<2; i++ ){
2008         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2009             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2010             break;
2011         }
2012
2013         for(q=0; q<52; q++){
2014             int shift = div6[q];
2015             int idx = rem6[q];
2016             for(x=0; x<64; x++)
2017                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2018                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2019                     h->pps.scaling_matrix8[i][x]) << shift;
2020         }
2021     }
2022 }
2023
2024 static void init_dequant4_coeff_table(H264Context *h){
2025     int i,j,q,x;
2026     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2027     for(i=0; i<6; i++ ){
2028         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2029         for(j=0; j<i; j++){
2030             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2031                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2032                 break;
2033             }
2034         }
2035         if(j<i)
2036             continue;
2037
2038         for(q=0; q<52; q++){
2039             int shift = div6[q] + 2;
2040             int idx = rem6[q];
2041             for(x=0; x<16; x++)
2042                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2043                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2044                     h->pps.scaling_matrix4[i][x]) << shift;
2045         }
2046     }
2047 }
2048
2049 static void init_dequant_tables(H264Context *h){
2050     int i,x;
2051     init_dequant4_coeff_table(h);
2052     if(h->pps.transform_8x8_mode)
2053         init_dequant8_coeff_table(h);
2054     if(h->sps.transform_bypass){
2055         for(i=0; i<6; i++)
2056             for(x=0; x<16; x++)
2057                 h->dequant4_coeff[i][0][x] = 1<<6;
2058         if(h->pps.transform_8x8_mode)
2059             for(i=0; i<2; i++)
2060                 for(x=0; x<64; x++)
2061                     h->dequant8_coeff[i][0][x] = 1<<6;
2062     }
2063 }
2064
2065
2066 /**
2067  * allocates tables.
2068  * needs width/height
2069  */
2070 static int alloc_tables(H264Context *h){
2071     MpegEncContext * const s = &h->s;
2072     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2073     int x,y;
2074
2075     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2076
2077     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2078     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2079     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2080
2081     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2082     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2083     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2084     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2085
2086     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2087     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2088
2089     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2090     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2091     for(y=0; y<s->mb_height; y++){
2092         for(x=0; x<s->mb_width; x++){
2093             const int mb_xy= x + y*s->mb_stride;
2094             const int b_xy = 4*x + 4*y*h->b_stride;
2095             const int b8_xy= 2*x + 2*y*h->b8_stride;
2096
2097             h->mb2b_xy [mb_xy]= b_xy;
2098             h->mb2b8_xy[mb_xy]= b8_xy;
2099         }
2100     }
2101
2102     s->obmc_scratchpad = NULL;
2103
2104     if(!h->dequant4_coeff[0])
2105         init_dequant_tables(h);
2106
2107     return 0;
2108 fail:
2109     free_tables(h);
2110     return -1;
2111 }
2112
2113 /**
2114  * Mimic alloc_tables(), but for every context thread.
2115  */
2116 static void clone_tables(H264Context *dst, H264Context *src){
2117     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2118     dst->non_zero_count           = src->non_zero_count;
2119     dst->slice_table              = src->slice_table;
2120     dst->cbp_table                = src->cbp_table;
2121     dst->mb2b_xy                  = src->mb2b_xy;
2122     dst->mb2b8_xy                 = src->mb2b8_xy;
2123     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2124     dst->mvd_table[0]             = src->mvd_table[0];
2125     dst->mvd_table[1]             = src->mvd_table[1];
2126     dst->direct_table             = src->direct_table;
2127
2128     dst->s.obmc_scratchpad = NULL;
2129     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2130 }
2131
2132 /**
2133  * Init context
2134  * Allocate buffers which are not shared amongst multiple threads.
2135  */
2136 static int context_init(H264Context *h){
2137     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2138     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2139
2140     return 0;
2141 fail:
2142     return -1; // free_tables will clean up for us
2143 }
2144
2145 static av_cold void common_init(H264Context *h){
2146     MpegEncContext * const s = &h->s;
2147
2148     s->width = s->avctx->width;
2149     s->height = s->avctx->height;
2150     s->codec_id= s->avctx->codec->id;
2151
2152     ff_h264_pred_init(&h->hpc, s->codec_id);
2153
2154     h->dequant_coeff_pps= -1;
2155     s->unrestricted_mv=1;
2156     s->decode=1; //FIXME
2157
2158     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2159
2160     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2161     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2162 }
2163
2164 /**
2165  * Reset SEI values at the beginning of the frame.
2166  *
2167  * @param h H.264 context.
2168  */
2169 static void reset_sei(H264Context *h) {
2170     h->sei_recovery_frame_cnt       = -1;
2171     h->sei_dpb_output_delay         =  0;
2172     h->sei_cpb_removal_delay        = -1;
2173     h->sei_buffering_period_present =  0;
2174 }
2175
2176 static av_cold int decode_init(AVCodecContext *avctx){
2177     H264Context *h= avctx->priv_data;
2178     MpegEncContext * const s = &h->s;
2179
2180     MPV_decode_defaults(s);
2181
2182     s->avctx = avctx;
2183     common_init(h);
2184
2185     s->out_format = FMT_H264;
2186     s->workaround_bugs= avctx->workaround_bugs;
2187
2188     // set defaults
2189 //    s->decode_mb= ff_h263_decode_mb;
2190     s->quarter_sample = 1;
2191     s->low_delay= 1;
2192
2193     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2194         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2195     else
2196         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2197     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2198
2199     decode_init_vlc();
2200
2201     if(avctx->extradata_size > 0 && avctx->extradata &&
2202        *(char *)avctx->extradata == 1){
2203         h->is_avc = 1;
2204         h->got_avcC = 0;
2205     } else {
2206         h->is_avc = 0;
2207     }
2208
2209     h->thread_context[0] = h;
2210     h->outputed_poc = INT_MIN;
2211     h->prev_poc_msb= 1<<16;
2212     reset_sei(h);
2213     if(avctx->codec_id == CODEC_ID_H264){
2214         if(avctx->ticks_per_frame == 1){
2215             s->avctx->time_base.den *=2;
2216         }
2217         avctx->ticks_per_frame = 2;
2218     }
2219     return 0;
2220 }
2221
2222 static int frame_start(H264Context *h){
2223     MpegEncContext * const s = &h->s;
2224     int i;
2225
2226     if(MPV_frame_start(s, s->avctx) < 0)
2227         return -1;
2228     ff_er_frame_start(s);
2229     /*
2230      * MPV_frame_start uses pict_type to derive key_frame.
2231      * This is incorrect for H.264; IDR markings must be used.
2232      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2233      * See decode_nal_units().
2234      */
2235     s->current_picture_ptr->key_frame= 0;
2236
2237     assert(s->linesize && s->uvlinesize);
2238
2239     for(i=0; i<16; i++){
2240         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2241         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2242     }
2243     for(i=0; i<4; i++){
2244         h->block_offset[16+i]=
2245         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2246         h->block_offset[24+16+i]=
2247         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2248     }
2249
2250     /* can't be in alloc_tables because linesize isn't known there.
2251      * FIXME: redo bipred weight to not require extra buffer? */
2252     for(i = 0; i < s->avctx->thread_count; i++)
2253         if(!h->thread_context[i]->s.obmc_scratchpad)
2254             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2255
2256     /* some macroblocks will be accessed before they're available */
2257     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2258         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2259
2260 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2261
2262     // We mark the current picture as non-reference after allocating it, so
2263     // that if we break out due to an error it can be released automatically
2264     // in the next MPV_frame_start().
2265     // SVQ3 as well as most other codecs have only last/next/current and thus
2266     // get released even with set reference, besides SVQ3 and others do not
2267     // mark frames as reference later "naturally".
2268     if(s->codec_id != CODEC_ID_SVQ3)
2269         s->current_picture_ptr->reference= 0;
2270
2271     s->current_picture_ptr->field_poc[0]=
2272     s->current_picture_ptr->field_poc[1]= INT_MAX;
2273     assert(s->current_picture_ptr->long_ref==0);
2274
2275     return 0;
2276 }
2277
/**
 * Save the last row and the last column of the current macroblock into
 * h->top_borders[] and h->left_border[].
 *
 * hl_decode_mb_internal() calls this after reconstruction and before
 * filter_mb()/filter_mb_fast(), so the stored samples are the ones present
 * before the loop filter runs on this macroblock. xchg_mb_border() reads
 * the same buffers back.
 *
 * Layout used below: luma entries of left_border start at index `offset`,
 * Cb entries at `uvoffset+34`, Cr entries at `uvoffset+34+18`; each
 * top_borders entry holds 16 luma bytes followed by 8 Cb and 8 Cr bytes.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize   luma stride to use for this macroblock
 * @param uvlinesize chroma stride to use for this macroblock
 * @param simple     nonzero on the simple decode path: the FRAME_MBAFF
 *                   handling is skipped and chroma is always saved
 */
2278 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2279     MpegEncContext * const s = &h->s;
2280     int i;
2281     int step    = 1;
2282     int offset  = 1;
2283     int uvoffset= 1;
2284     int top_idx = 1;
2285     int skiplast= 0;
2286
         // Step back one row: from here on row index i addresses row i-1 of
         // the macroblock, and row 0 is the row directly above it.
2287     src_y  -=   linesize;
2288     src_cb -= uvlinesize;
2289     src_cr -= uvlinesize;
2290
2291     if(!simple && FRAME_MBAFF){
2292         if(s->mb_y&1){
                 // odd mb_y
2293             offset  = MB_MBAFF ? 1 : 17;
2294             uvoffset= MB_MBAFF ? 1 : 9;
2295             if(!MB_MBAFF){
                     // additionally store the second-to-last luma row and the
                     // second-to-last chroma rows in top_borders[0]
2296                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2297                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2298                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2299                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2300                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2301                 }
2302             }
2303         }else{
                 // even mb_y
2304             if(!MB_MBAFF){
                     // corner samples come from top_borders[0]; the last row of
                     // the column loops below is skipped (skiplast)
2305                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2306                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2307                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2308                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2309                 }
2310                 skiplast= 1;
2311             }
2312             offset  =
2313             uvoffset=
2314             top_idx = MB_MBAFF ? 0 : 1;
2315         }
             // with MB_MBAFF set, consecutive rows go to every second left_border entry
2316         step= MB_MBAFF ? 2 : 1;
2317     }
2318
2319     // There are two lines saved, the line above the top macroblock of a pair,
2320     // and the line above the bottom macroblock
         // Corner sample: last luma byte of the previously stored top border,
         // read before that border is overwritten below.
2321     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
         // right-most luma column (x=15) of macroblock rows 0..15 (0..14 with skiplast)
2322     for(i=1; i<17 - skiplast; i++){
2323         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2324     }
2325
         // last luma row of the macroblock, 16 bytes as two 64-bit stores
2326     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2327     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2328
2329     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
             // same for chroma: corner samples, right-most column (x=7), last row
2330         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2331         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2332         for(i=1; i<9 - skiplast; i++){
2333             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2334             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2335         }
2336         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2337         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2338     }
2339 }
2340
/**
 * Exchange the samples bordering the current macroblock (the column to its
 * left and the row above it) between the picture and the copies held in
 * h->left_border[] / h->top_borders[].
 *
 * hl_decode_mb_internal() calls this around intra prediction when
 * h->deblocking_filter is set: with xchg=1 before predicting and with
 * xchg=0 afterwards.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize   luma stride to use for this macroblock
 * @param uvlinesize chroma stride to use for this macroblock
 * @param xchg   nonzero: swap stored and picture samples; zero: the picture
 *               receives the stored sample and the stored copy is kept.
 *               Entries handled with XCHG(..., 1) are swapped regardless.
 * @param simple nonzero on the simple decode path: the FRAME_MBAFF handling
 *               is skipped and chroma is always processed
 */
2341 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2342     MpegEncContext * const s = &h->s;
2343     int temp8, i;
2344     uint64_t temp64;
2345     int deblock_left;
2346     int deblock_top;
2347     int mb_xy;
2348     int step    = 1;
2349     int offset  = 1;
2350     int uvoffset= 1;
2351     int top_idx = 1;
2352
         // same offset/step/top_idx selection as in backup_mb_border()
2353     if(!simple && FRAME_MBAFF){
2354         if(s->mb_y&1){
2355             offset  = MB_MBAFF ? 1 : 17;
2356             uvoffset= MB_MBAFF ? 1 : 9;
2357         }else{
2358             offset  =
2359             uvoffset=
2360             top_idx = MB_MBAFF ? 0 : 1;
2361         }
2362         step= MB_MBAFF ? 2 : 1;
2363     }
2364
2365     if(h->deblocking_filter == 2) {
             // mode 2: a neighbour only counts when its slice_table entry
             // equals the one of the current macroblock
2366         mb_xy = h->mb_xy;
2367         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2368         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2369     } else {
             // otherwise: left unless mb_x is 0, top unless mb_y is 0
             // (or mb_y <= 1 when MB_FIELD is set)
2370         deblock_left = (s->mb_x > 0);
2371         deblock_top =  (s->mb_y > !!MB_FIELD);
2372     }
2373
         // move to the sample diagonally above-left of the macroblock
2374     src_y  -=   linesize + 1;
2375     src_cb -= uvlinesize + 1;
2376     src_cr -= uvlinesize + 1;
2377
     // XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
     // old value of b only when xchg is nonzero. t is scratch storage.
2378 #define XCHG(a,b,t,xchg)\
2379 t= a;\
2380 if(xchg)\
2381     a= b;\
2382 b= t;
2383
2384     if(deblock_left){
             // left luma column; i starts at 1 (skipping the corner sample)
             // when the top row is not used
2385         for(i = !deblock_top; i<16; i++){
2386             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2387         }
             // i == 16 here: the last entry is always swapped
2388         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2389     }
2390
2391     if(deblock_top){
             // row above: first 8 bytes follow xchg, the next 8 and the first
             // 8 bytes of the next macroblock column are always swapped
2392         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2393         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2394         if(s->mb_x+1 < s->mb_width){
2395             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2396         }
2397     }
2398
2399     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
             // chroma: same scheme with 8 rows per plane
2400         if(deblock_left){
2401             for(i = !deblock_top; i<8; i++){
2402                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2403                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2404             }
                 // i == 8 here: the last entries are always swapped
2405             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2406             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2407         }
2408         if(deblock_top){
2409             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2410             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2411         }
2412     }
2413 }
2414
/**
 * Reconstruct the current macroblock into the current picture and, when
 * h->deblocking_filter is set, run the loop filter on it.
 *
 * Steps, in order: compute destination pointers and strides; for intra-PCM
 * copy the samples from h->mb; otherwise intra-predict (with the saved
 * border samples swapped in while deblocking is on) or motion-compensate,
 * then add the luma and chroma residuals; clear h->mb; finally back up the
 * macroblock borders and call filter_mb() or filter_mb_fast().
 *
 * @param simple the wrappers in this file pass a literal 0 or 1. With
 *               simple=1 every "!simple && ..." branch (MB_FIELD and
 *               FRAME_MBAFF handling, intra-PCM, transform bypass) is
 *               skipped, the CODEC_FLAG_GRAY checks are bypassed and
 *               is_h264 is forced to 1.
 */
2415 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2416     MpegEncContext * const s = &h->s;
2417     const int mb_x= s->mb_x;
2418     const int mb_y= s->mb_y;
2419     const int mb_xy= h->mb_xy;
2420     const int mb_type= s->current_picture.mb_type[mb_xy];
2421     uint8_t  *dest_y, *dest_cb, *dest_cr;
2422     int linesize, uvlinesize /*dct_offset*/;
2423     int i;
2424     int *block_offset = &h->block_offset[0];
2425     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2426     /* is_h264 should always be true if SVQ3 is disabled. */
2427     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2428     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2429     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2430
         // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
2431     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2432     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2433     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2434
2435     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2436     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2437
2438     if (!simple && MB_FIELD) {
             // doubled strides (only every second picture row is addressed)
             // and the second half of block_offset[], which frame_start()
             // fills using doubled line sizes
2439         linesize   = h->mb_linesize   = s->linesize * 2;
2440         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2441         block_offset = &h->block_offset[24];
2442         if(mb_y&1){ //FIXME move out of this function?
                 // odd mb_y: start at picture row 16*(mb_y-1)+1 (luma) and
                 // 8*(mb_y-1)+1 (chroma)
2443             dest_y -= s->linesize*15;
2444             dest_cb-= s->uvlinesize*7;
2445             dest_cr-= s->uvlinesize*7;
2446         }
2447         if(FRAME_MBAFF) {
                 // NOTE(review): rewrites the cached reference indices of the
                 // lists this macroblock uses as (16+ref)^(mb_y&1); the meaning
                 // of that encoding is defined outside this function - confirm
2448             int list;
2449             for(list=0; list<h->list_count; list++){
2450                 if(!USES_LIST(mb_type, list))
2451                     continue;
2452                 if(IS_16X16(mb_type)){
2453                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2454                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2455                 }else{
2456                     for(i=0; i<16; i+=4){
2457                         int ref = h->ref_cache[list][scan8[i]];
2458                         if(ref >= 0)
2459                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2460                     }
2461                 }
2462             }
2463         }
2464     } else {
2465         linesize   = h->mb_linesize   = s->linesize;
2466         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2467 //        dct_offset = s->linesize * 16;
2468     }
2469
2470     if (!simple && IS_INTRA_PCM(mb_type)) {
             // intra-PCM: copy the sample bytes held in h->mb, row by row
2471         for (i=0; i<16; i++) {
2472             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2473         }
2474         for (i=0; i<8; i++) {
2475             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2476             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2477         }
2478     } else {
2479         if(IS_INTRA(mb_type)){
                 // swap the samples saved by backup_mb_border() into the
                 // picture for the duration of intra prediction
2480             if(h->deblocking_filter)
2481                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2482
2483             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2484                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2485                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2486             }
2487
2488             if(IS_INTRA4x4(mb_type)){
                     // predict each block and add its residual right away
2489                 if(simple || !s->encoding){
2490                     if(IS_8x8DCT(mb_type)){
2491                         if(transform_bypass){
2492                             idct_dc_add =
2493                             idct_add    = s->dsp.add_pixels8;
2494                         }else{
2495                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2496                             idct_add    = s->dsp.h264_idct8_add;
2497                         }
2498                         for(i=0; i<16; i+=4){
2499                             uint8_t * const ptr= dest_y + block_offset[i];
2500                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2501                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2502                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2503                             }else{
2504                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2505                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2506                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2507                                 if(nnz){
                                         // exactly one coefficient and it is the
                                         // first one: use the DC-only add
2508                                     if(nnz == 1 && h->mb[i*16])
2509                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2510                                     else
2511                                         idct_add   (ptr, h->mb + i*16, linesize);
2512                                 }
2513                             }
2514                         }
2515                     }else{
2516                         if(transform_bypass){
2517                             idct_dc_add =
2518                             idct_add    = s->dsp.add_pixels4;
2519                         }else{
2520                             idct_dc_add = s->dsp.h264_idct_dc_add;
2521                             idct_add    = s->dsp.h264_idct_add;
2522                         }
2523                         for(i=0; i<16; i++){
2524                             uint8_t * const ptr= dest_y + block_offset[i];
2525                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2526
2527                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2528                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2529                             }else{
2530                                 uint8_t *topright;
2531                                 int nnz, tr;
2532                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2533                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2534                                     assert(mb_y || linesize <= block_offset[i]);
2535                                     if(!topright_avail){
                                             // no top-right samples: replicate the last
                                             // sample of the row above into a 4-byte temporary
2536                                         tr= ptr[3 - linesize]*0x01010101;
2537                                         topright= (uint8_t*) &tr;
2538                                     }else
2539                                         topright= ptr + 4 - linesize;
2540                                 }else
2541                                     topright= NULL;
2542
2543                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2544                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2545                                 if(nnz){
2546                                     if(is_h264){
2547                                         if(nnz == 1 && h->mb[i*16])
2548                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2549                                         else
2550                                             idct_add   (ptr, h->mb + i*16, linesize);
2551                                     }else
2552                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2553                                 }
2554                             }
2555                         }
2556                     }
2557                 }
2558             }else{
                     // other intra types: predict the whole 16x16 luma block,
                     // then process the luma DC coefficients; the residual is
                     // added further down
2559                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2560                 if(is_h264){
2561                     if(!transform_bypass)
2562                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2563                 }else
2564                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2565             }
                 // undo the border swap done before prediction (xchg=0)
2566             if(h->deblocking_filter)
2567                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2568         }else if(is_h264){
                 // inter macroblock: motion compensation
2569             hl_motion(h, dest_y, dest_cb, dest_cr,
2570                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2571                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2572                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2573         }
2574
2575
             // luma residual for everything except intra 4x4 (handled above)
2576         if(!IS_INTRA4x4(mb_type)){
2577             if(is_h264){
2578                 if(IS_INTRA16x16(mb_type)){
2579                     if(transform_bypass){
2580                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2581                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2582                         }else{
2583                             for(i=0; i<16; i++){
2584                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2585                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2586                             }
2587                         }
2588                     }else{
2589                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2590                     }
2591                 }else if(h->cbp&15){
                         // low four cbp bits set: some luma residual to add
2592                     if(transform_bypass){
2593                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2594                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2595                         for(i=0; i<16; i+=di){
2596                             if(h->non_zero_count_cache[ scan8[i] ]){
2597                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2598                             }
2599                         }
2600                     }else{
2601                         if(IS_8x8DCT(mb_type)){
2602                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2603                         }else{
2604                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2605                         }
2606                     }
2607                 }
2608             }else{
2609                 for(i=0; i<16; i++){
2610                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2611                         uint8_t * const ptr= dest_y + block_offset[i];
2612                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2613                     }
2614                 }
2615             }
2616         }
2617
             // chroma residual: blocks 16..19 go to dest[0] (Cb), 20..23 to dest[1] (Cr)
2618         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2619             uint8_t *dest[2] = {dest_cb, dest_cr};
2620             if(transform_bypass){
2621                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2622                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2623                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2624                 }else{
2625                     idct_add = s->dsp.add_pixels4;
2626                     for(i=16; i<16+8; i++){
2627                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2628                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2629                     }
2630                 }
2631             }else{
                     // dequant table index: 1/2 for intra, 4/5 otherwise (Cb/Cr)
2632                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2633                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2634                 if(is_h264){
2635                     idct_add = s->dsp.h264_idct_add;
2636                     idct_dc_add = s->dsp.h264_idct_dc_add;
2637                     for(i=16; i<16+8; i++){
2638                         if(h->non_zero_count_cache[ scan8[i] ])
2639                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2640                         else if(h->mb[i*16])
2641                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2642                     }
2643                 }else{
2644                     for(i=16; i<16+8; i++){
2645                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2646                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2647                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2648                         }
2649                     }
2650                 }
2651             }
2652         }
2653     }
         // clear the coefficient buffer when it may have been written
2654     if(h->cbp || IS_INTRA(mb_type))
2655         s->dsp.clear_blocks(h->mb);
2656
2657     if(h->deblocking_filter) {
             // save the unfiltered borders first, then filter this macroblock
2658         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2659         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2660         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2661         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2662         if (!simple && FRAME_MBAFF) {
2663             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2664         } else {
2665             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2666         }
2667     }
2668 }
2669
2670 /**
2671  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2672  */
2673 static void hl_decode_mb_simple(H264Context *h){
         // simple=1 is a literal and hl_decode_mb_internal() is av_always_inline,
         // so its "!simple && ..." branches are presumably compiled out of this copy
2674     hl_decode_mb_internal(h, 1);
2675 }
2676
2677 /**
2678  * Process a macroblock; this handles edge cases, such as interlacing.
2679  */
2680 static void av_noinline hl_decode_mb_complex(H264Context *h){
         // simple=0: keeps every code path of hl_decode_mb_internal(); this wrapper
         // is av_noinline, so the full version exists as a separate function
2681     hl_decode_mb_internal(h, 0);
2682 }
2683
2684 static void hl_decode_mb(H264Context *h){
2685     MpegEncContext * const s = &h->s;
2686     const int mb_xy= h->mb_xy;
2687     const int mb_type= s->current_picture.mb_type[mb_xy];
2688     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2689
2690     if (is_complex)
2691         hl_decode_mb_complex(h);
2692     else hl_decode_mb_simple(h);
2693 }
2694
2695 static void pic_as_field(Picture *pic, const int parity){
2696     int i;
2697     for (i = 0; i < 4; ++i) {
2698         if (parity == PICT_BOTTOM_FIELD)
2699             pic->data[i] += pic->linesize[i];
2700         pic->reference = parity;
2701         pic->linesize[i] *= 2;
2702     }
2703     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2704 }
2705
2706 static int split_field_copy(Picture *dest, Picture *src,
2707                             int parity, int id_add){
2708     int match = !!(src->reference & parity);
2709
2710     if (match) {
2711         *dest = *src;
2712         if(parity != PICT_FRAME){
2713             pic_as_field(dest, parity);
2714             dest->pic_id *= 2;
2715             dest->pic_id += id_add;
2716         }
2717     }
2718
2719     return match;
2720 }
2721
2722 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2723     int i[2]={0};
2724     int index=0;
2725
2726     while(i[0]<len || i[1]<len){
2727         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2728             i[0]++;
2729         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2730             i[1]++;
2731         if(i[0] < len){
2732             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2733             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2734         }
2735         if(i[1] < len){
2736             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2737             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2738         }
2739     }
2740
2741     return index;
2742 }
2743
2744 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2745     int i, best_poc;
2746     int out_i= 0;
2747
2748     for(;;){
2749         best_poc= dir ? INT_MIN : INT_MAX;
2750
2751         for(i=0; i<len; i++){
2752             const int poc= src[i]->poc;
2753             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2754                 best_poc= poc;
2755                 sorted[out_i]= src[i];
2756             }
2757         }
2758         if(best_poc == (dir ? INT_MIN : INT_MAX))
2759             break;
2760         limit= sorted[out_i++]->poc - dir;
2761     }
2762     return out_i;
2763 }
2764
2765 /**
2766  * fills the default_ref_list.
2767  */
2768 static int fill_default_ref_list(H264Context *h){
2769     MpegEncContext * const s = &h->s;
2770     int i, len;
2771
2772     if(h->slice_type_nos==FF_B_TYPE){
2773         Picture *sorted[32];
2774         int cur_poc, list;
2775         int lens[2];
2776
2777         if(FIELD_PICTURE)
2778             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2779         else
2780             cur_poc= s->current_picture_ptr->poc;
2781
2782         for(list= 0; list<2; list++){
2783             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2784             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2785             assert(len<=32);
2786             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2787             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2788             assert(len<=32);
2789
2790             if(len < h->ref_count[list])
2791                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2792             lens[list]= len;
2793         }
2794
2795         if(lens[0] == lens[1] && lens[1] > 1){
2796             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2797             if(i == lens[0])
2798                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2799         }
2800     }else{
2801         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2802         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2803         assert(len <= 32);
2804         if(len < h->ref_count[0])
2805             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2806     }
2807 #ifdef TRACE
2808     for (i=0; i<h->ref_count[0]; i++) {
2809         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2810     }
2811     if(h->slice_type_nos==FF_B_TYPE){
2812         for (i=0; i<h->ref_count[1]; i++) {
2813             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2814         }
2815     }
2816 #endif
2817     return 0;
2818 }
2819
2820 static void print_short_term(H264Context *h);
2821 static void print_long_term(H264Context *h);
2822
2823 /**
2824  * Extract structure information about the picture described by pic_num in
2825  * the current decoding context (frame or field). Note that pic_num is
2826  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2827  * @param pic_num picture number for which to extract structure information
2828  * @param structure one of PICT_XXX describing structure of picture
2829  *                      with pic_num
2830  * @return frame number (short term) or long term index of picture
2831  *         described by pic_num
2832  */
2833 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2834     MpegEncContext * const s = &h->s;
2835
2836     *structure = s->picture_structure;
2837     if(FIELD_PICTURE){
2838         if (!(pic_num & 1))
2839             /* opposite field */
2840             *structure ^= PICT_FRAME;
2841         pic_num >>= 1;
2842     }
2843
2844     return pic_num;
2845 }
2846
2847 static int decode_ref_pic_list_reordering(H264Context *h){
2848     MpegEncContext * const s = &h->s;
2849     int list, index, pic_structure;
2850
2851     print_short_term(h);
2852     print_long_term(h);
2853
2854     for(list=0; list<h->list_count; list++){
2855         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2856
2857         if(get_bits1(&s->gb)){
2858             int pred= h->curr_pic_num;
2859
2860             for(index=0; ; index++){
2861                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2862                 unsigned int pic_id;
2863                 int i;
2864                 Picture *ref = NULL;
2865
2866                 if(reordering_of_pic_nums_idc==3)
2867                     break;
2868
2869                 if(index >= h->ref_count[list]){
2870                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2871                     return -1;
2872                 }
2873
2874                 if(reordering_of_pic_nums_idc<3){
2875                     if(reordering_of_pic_nums_idc<2){
2876                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2877                         int frame_num;
2878
2879                         if(abs_diff_pic_num > h->max_pic_num){
2880                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2881                             return -1;
2882                         }
2883
2884                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2885                         else                                pred+= abs_diff_pic_num;
2886                         pred &= h->max_pic_num - 1;
2887
2888                         frame_num = pic_num_extract(h, pred, &pic_structure);
2889
2890                         for(i= h->short_ref_count-1; i>=0; i--){
2891                             ref = h->short_ref[i];
2892                             assert(ref->reference);
2893                             assert(!ref->long_ref);
2894                             if(
2895                                    ref->frame_num == frame_num &&
2896                                    (ref->reference & pic_structure)
2897                               )
2898                                 break;
2899                         }
2900                         if(i>=0)
2901                             ref->pic_id= pred;
2902                     }else{
2903                         int long_idx;
2904                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2905
2906                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2907
2908                         if(long_idx>31){
2909                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2910                             return -1;
2911                         }
2912                         ref = h->long_ref[long_idx];
2913                         assert(!(ref && !ref->reference));
2914                         if(ref && (ref->reference & pic_structure)){
2915                             ref->pic_id= pic_id;
2916                             assert(ref->long_ref);
2917                             i=0;
2918                         }else{
2919                             i=-1;
2920                         }
2921                     }
2922
2923                     if (i < 0) {
2924                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2925                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2926                     } else {
2927                         for(i=index; i+1<h->ref_count[list]; i++){
2928                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2929                                 break;
2930                         }
2931                         for(; i > index; i--){
2932                             h->ref_list[list][i]= h->ref_list[list][i-1];
2933                         }
2934                         h->ref_list[list][index]= *ref;
2935                         if (FIELD_PICTURE){
2936                             pic_as_field(&h->ref_list[list][index], pic_structure);
2937                         }
2938                     }
2939                 }else{
2940                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2941                     return -1;
2942                 }
2943             }
2944         }
2945     }
2946     for(list=0; list<h->list_count; list++){
2947         for(index= 0; index < h->ref_count[list]; index++){
2948             if(!h->ref_list[list][index].data[0]){
2949                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2950                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2951             }
2952         }
2953     }
2954
2955     return 0;
2956 }
2957
2958 static void fill_mbaff_ref_list(H264Context *h){
2959     int list, i, j;
2960     for(list=0; list<2; list++){ //FIXME try list_count
2961         for(i=0; i<h->ref_count[list]; i++){
2962             Picture *frame = &h->ref_list[list][i];
2963             Picture *field = &h->ref_list[list][16+2*i];
2964             field[0] = *frame;
2965             for(j=0; j<3; j++)
2966                 field[0].linesize[j] <<= 1;
2967             field[0].reference = PICT_TOP_FIELD;
2968             field[0].poc= field[0].field_poc[0];
2969             field[1] = field[0];
2970             for(j=0; j<3; j++)
2971                 field[1].data[j] += frame->linesize[j];
2972             field[1].reference = PICT_BOTTOM_FIELD;
2973             field[1].poc= field[1].field_poc[1];
2974
2975             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2976             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2977             for(j=0; j<2; j++){
2978                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2979                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2980             }
2981         }
2982     }
2983     for(j=0; j<h->ref_count[1]; j++){
2984         for(i=0; i<h->ref_count[0]; i++)
2985             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2986         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2987         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2988     }
2989 }
2990
2991 static int pred_weight_table(H264Context *h){
2992     MpegEncContext * const s = &h->s;
2993     int list, i;
2994     int luma_def, chroma_def;
2995
2996     h->use_weight= 0;
2997     h->use_weight_chroma= 0;
2998     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2999     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3000     luma_def = 1<<h->luma_log2_weight_denom;
3001     chroma_def = 1<<h->chroma_log2_weight_denom;
3002
3003     for(list=0; list<2; list++){
3004         h->luma_weight_flag[list]   = 0;
3005         h->chroma_weight_flag[list] = 0;
3006         for(i=0; i<h->ref_count[list]; i++){
3007             int luma_weight_flag, chroma_weight_flag;
3008
3009             luma_weight_flag= get_bits1(&s->gb);
3010             if(luma_weight_flag){
3011                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3012                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3013                 if(   h->luma_weight[list][i] != luma_def
3014                    || h->luma_offset[list][i] != 0) {
3015                     h->use_weight= 1;
3016                     h->luma_weight_flag[list]= 1;
3017                 }
3018             }else{
3019                 h->luma_weight[list][i]= luma_def;
3020                 h->luma_offset[list][i]= 0;
3021             }
3022
3023             if(CHROMA){
3024                 chroma_weight_flag= get_bits1(&s->gb);
3025                 if(chroma_weight_flag){
3026                     int j;
3027                     for(j=0; j<2; j++){
3028                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3029                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3030                         if(   h->chroma_weight[list][i][j] != chroma_def
3031                            || h->chroma_offset[list][i][j] != 0) {
3032                             h->use_weight_chroma= 1;
3033                             h->chroma_weight_flag[list]= 1;
3034                         }
3035                     }
3036                 }else{
3037                     int j;
3038                     for(j=0; j<2; j++){
3039                         h->chroma_weight[list][i][j]= chroma_def;
3040                         h->chroma_offset[list][i][j]= 0;
3041                     }
3042                 }
3043             }
3044         }
3045         if(h->slice_type_nos != FF_B_TYPE) break;
3046     }
3047     h->use_weight= h->use_weight || h->use_weight_chroma;
3048     return 0;
3049 }
3050
3051 static void implicit_weight_table(H264Context *h){
3052     MpegEncContext * const s = &h->s;
3053     int ref0, ref1, i;
3054     int cur_poc = s->current_picture_ptr->poc;
3055
3056     for (i = 0; i < 2; i++) {
3057         h->luma_weight_flag[i]   = 0;
3058         h->chroma_weight_flag[i] = 0;
3059     }
3060
3061     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3062        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3063         h->use_weight= 0;
3064         h->use_weight_chroma= 0;
3065         return;
3066     }
3067
3068     h->use_weight= 2;
3069     h->use_weight_chroma= 2;
3070     h->luma_log2_weight_denom= 5;
3071     h->chroma_log2_weight_denom= 5;
3072
3073     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3074         int poc0 = h->ref_list[0][ref0].poc;
3075         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3076             int poc1 = h->ref_list[1][ref1].poc;
3077             int td = av_clip(poc1 - poc0, -128, 127);
3078             if(td){
3079                 int tb = av_clip(cur_poc - poc0, -128, 127);
3080                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3081                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3082                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3083                     h->implicit_weight[ref0][ref1] = 32;
3084                 else
3085                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3086             }else
3087                 h->implicit_weight[ref0][ref1] = 32;
3088         }
3089     }
3090 }
3091
3092 /**
3093  * Mark a picture as no longer needed for reference. The refmask
3094  * argument allows unreferencing of individual fields or the whole frame.
3095  * If the picture becomes entirely unreferenced, but is being held for
3096  * display purposes, it is marked as such.
3097  * @param refmask mask of fields to unreference; the mask is bitwise
3098  *                anded with the reference marking of pic
3099  * @return non-zero if pic becomes entirely unreferenced (except possibly
3100  *         for display purposes) zero if one of the fields remains in
3101  *         reference
3102  */
3103 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3104     int i;
3105     if (pic->reference &= refmask) {
3106         return 0;
3107     } else {
3108         for(i = 0; h->delayed_pic[i]; i++)
3109             if(pic == h->delayed_pic[i]){
3110                 pic->reference=DELAYED_PIC_REF;
3111                 break;
3112             }
3113         return 1;
3114     }
3115 }
3116
3117 /**
3118  * instantaneous decoder refresh.
3119  */
3120 static void idr(H264Context *h){
3121     int i;
3122
3123     for(i=0; i<16; i++){
3124         remove_long(h, i, 0);
3125     }
3126     assert(h->long_ref_count==0);
3127
3128     for(i=0; i<h->short_ref_count; i++){
3129         unreference_pic(h, h->short_ref[i], 0);
3130         h->short_ref[i]= NULL;
3131     }
3132     h->short_ref_count=0;
3133     h->prev_frame_num= 0;
3134     h->prev_frame_num_offset= 0;
3135     h->prev_poc_msb=
3136     h->prev_poc_lsb= 0;
3137 }
3138
3139 /* forget old pics after a seek */
3140 static void flush_dpb(AVCodecContext *avctx){
3141     H264Context *h= avctx->priv_data;
3142     int i;
3143     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3144         if(h->delayed_pic[i])
3145             h->delayed_pic[i]->reference= 0;
3146         h->delayed_pic[i]= NULL;
3147     }
3148     h->outputed_poc= INT_MIN;
3149     idr(h);
3150     if(h->s.current_picture_ptr)
3151         h->s.current_picture_ptr->reference= 0;
3152     h->s.first_field= 0;
3153     reset_sei(h);
3154     ff_mpeg_flush(avctx);
3155 }
3156
3157 /**
3158  * Find a Picture in the short term reference list by frame number.
3159  * @param frame_num frame number to search for
3160  * @param idx the index into h->short_ref where returned picture is found
3161  *            undefined if no picture found.
3162  * @return pointer to the found picture, or NULL if no pic with the provided
3163  *                 frame number is found
3164  */
3165 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3166     MpegEncContext * const s = &h->s;
3167     int i;
3168
3169     for(i=0; i<h->short_ref_count; i++){
3170         Picture *pic= h->short_ref[i];
3171         if(s->avctx->debug&FF_DEBUG_MMCO)
3172             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3173         if(pic->frame_num == frame_num) {
3174             *idx = i;
3175             return pic;
3176         }
3177     }
3178     return NULL;
3179 }
3180
3181 /**
3182  * Remove a picture from the short term reference list by its index in
3183  * that list.  This does no checking on the provided index; it is assumed
3184  * to be valid. Other list entries are shifted down.
3185  * @param i index into h->short_ref of picture to remove.
3186  */
3187 static void remove_short_at_index(H264Context *h, int i){
3188     assert(i >= 0 && i < h->short_ref_count);
3189     h->short_ref[i]= NULL;
3190     if (--h->short_ref_count)
3191         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3192 }
3193
3194 /**
3195  *
3196  * @return the removed picture or NULL if an error occurs
3197  */
3198 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3199     MpegEncContext * const s = &h->s;
3200     Picture *pic;
3201     int i;
3202
3203     if(s->avctx->debug&FF_DEBUG_MMCO)
3204         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3205
3206     pic = find_short(h, frame_num, &i);
3207     if (pic){
3208         if(unreference_pic(h, pic, ref_mask))
3209         remove_short_at_index(h, i);
3210     }
3211
3212     return pic;
3213 }
3214
3215 /**
3216  * Remove a picture from the long term reference list by its index in
3217  * that list.
3218  * @return the removed picture or NULL if an error occurs
3219  */
3220 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3221     Picture *pic;
3222
3223     pic= h->long_ref[i];
3224     if (pic){
3225         if(unreference_pic(h, pic, ref_mask)){
3226             assert(h->long_ref[i]->long_ref == 1);
3227             h->long_ref[i]->long_ref= 0;
3228             h->long_ref[i]= NULL;
3229             h->long_ref_count--;
3230         }
3231     }
3232
3233     return pic;
3234 }
3235
3236 /**
3237  * print short term list
3238  */
3239 static void print_short_term(H264Context *h) {
3240     uint32_t i;
3241     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3242         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3243         for(i=0; i<h->short_ref_count; i++){
3244             Picture *pic= h->short_ref[i];
3245             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3246         }
3247     }
3248 }
3249
3250 /**
3251  * print long term list
3252  */
3253 static void print_long_term(H264Context *h) {
3254     uint32_t i;
3255     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3256         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3257         for(i = 0; i < 16; i++){
3258             Picture *pic= h->long_ref[i];
3259             if (pic) {
3260                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3261             }
3262         }
3263     }
3264 }
3265
3266 /**
3267  * Executes the reference picture marking (memory management control operations).
3268  */
3269 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3270     MpegEncContext * const s = &h->s;
3271     int i, j;
3272     int current_ref_assigned=0;
3273     Picture *av_uninit(pic);
3274
3275     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3276         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3277
3278     for(i=0; i<mmco_count; i++){
3279         int structure, av_uninit(frame_num);
3280         if(s->avctx->debug&FF_DEBUG_MMCO)
3281             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3282
3283         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3284            || mmco[i].opcode == MMCO_SHORT2LONG){
3285             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3286             pic = find_short(h, frame_num, &j);
3287             if(!pic){
3288                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3289                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3290                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3291                 continue;
3292             }
3293         }
3294
3295         switch(mmco[i].opcode){
3296         case MMCO_SHORT2UNUSED:
3297             if(s->avctx->debug&FF_DEBUG_MMCO)
3298                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3299             remove_short(h, frame_num, structure ^ PICT_FRAME);
3300             break;
3301         case MMCO_SHORT2LONG:
3302                 if (h->long_ref[mmco[i].long_arg] != pic)
3303                     remove_long(h, mmco[i].long_arg, 0);
3304
3305                 remove_short_at_index(h, j);
3306                 h->long_ref[ mmco[i].long_arg ]= pic;
3307                 if (h->long_ref[ mmco[i].long_arg ]){
3308                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3309                     h->long_ref_count++;
3310                 }
3311             break;
3312         case MMCO_LONG2UNUSED:
3313             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3314             pic = h->long_ref[j];
3315             if (pic) {
3316                 remove_long(h, j, structure ^ PICT_FRAME);
3317             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3318                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3319             break;
3320         case MMCO_LONG:
3321                     // Comment below left from previous code as it is an interresting note.
3322                     /* First field in pair is in short term list or
3323                      * at a different long term index.
3324                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3325                      * Report the problem and keep the pair where it is,
3326                      * and mark this field valid.
3327                      */
3328
3329             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3330                 remove_long(h, mmco[i].long_arg, 0);
3331
3332                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3333                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3334                 h->long_ref_count++;
3335             }
3336
3337             s->current_picture_ptr->reference |= s->picture_structure;
3338             current_ref_assigned=1;
3339             break;
3340         case MMCO_SET_MAX_LONG:
3341             assert(mmco[i].long_arg <= 16);
3342             // just remove the long term which index is greater than new max
3343             for(j = mmco[i].long_arg; j<16; j++){
3344                 remove_long(h, j, 0);
3345             }
3346             break;
3347         case MMCO_RESET:
3348             while(h->short_ref_count){
3349                 remove_short(h, h->short_ref[0]->frame_num, 0);
3350             }
3351             for(j = 0; j < 16; j++) {
3352                 remove_long(h, j, 0);
3353             }
3354             s->current_picture_ptr->poc=
3355             s->current_picture_ptr->field_poc[0]=
3356             s->current_picture_ptr->field_poc[1]=
3357             h->poc_lsb=
3358             h->poc_msb=
3359             h->frame_num=
3360             s->current_picture_ptr->frame_num= 0;
3361             break;
3362         default: assert(0);
3363         }
3364     }
3365
3366     if (!current_ref_assigned) {
3367         /* Second field of complementary field pair; the first field of
3368          * which is already referenced. If short referenced, it
3369          * should be first entry in short_ref. If not, it must exist
3370          * in long_ref; trying to put it on the short list here is an
3371          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3372          */
3373         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3374             /* Just mark the second field valid */
3375             s->current_picture_ptr->reference = PICT_FRAME;
3376         } else if (s->current_picture_ptr->long_ref) {
3377             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3378                                              "assignment for second field "
3379                                              "in complementary field pair "
3380                                              "(first field is long term)\n");
3381         } else {
3382             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3383             if(pic){
3384                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3385             }
3386
3387             if(h->short_ref_count)
3388                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3389
3390             h->short_ref[0]= s->current_picture_ptr;
3391             h->short_ref_count++;
3392             s->current_picture_ptr->reference |= s->picture_structure;
3393         }
3394     }
3395
3396     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3397
3398         /* We have too many reference frames, probably due to corrupted
3399          * stream. Need to discard one frame. Prevents overrun of the
3400          * short_ref and long_ref buffers.
3401          */
3402         av_log(h->s.avctx, AV_LOG_ERROR,
3403                "number of reference frames exceeds max (probably "
3404                "corrupt input), discarding one\n");
3405
3406         if (h->long_ref_count && !h->short_ref_count) {
3407             for (i = 0; i < 16; ++i)
3408                 if (h->long_ref[i])
3409                     break;
3410
3411             assert(i < 16);
3412             remove_long(h, i, 0);
3413         } else {
3414             pic = h->short_ref[h->short_ref_count - 1];
3415             remove_short(h, pic->frame_num, 0);
3416         }
3417     }
3418
3419     print_short_term(h);
3420     print_long_term(h);
3421     return 0;
3422 }
3423
3424 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3425     MpegEncContext * const s = &h->s;
3426     int i;
3427
3428     h->mmco_index= 0;
3429     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3430         s->broken_link= get_bits1(gb) -1;
3431         if(get_bits1(gb)){
3432             h->mmco[0].opcode= MMCO_LONG;
3433             h->mmco[0].long_arg= 0;
3434             h->mmco_index= 1;
3435         }
3436     }else{
3437         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3438             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3439                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3440
3441                 h->mmco[i].opcode= opcode;
3442                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3443                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3444 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3445                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3446                         return -1;
3447                     }*/
3448                 }
3449                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3450                     unsigned int long_arg= get_ue_golomb_31(gb);
3451                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3452                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3453                         return -1;
3454                     }
3455                     h->mmco[i].long_arg= long_arg;
3456                 }
3457
3458                 if(opcode > (unsigned)MMCO_LONG){
3459                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3460                     return -1;
3461                 }
3462                 if(opcode == MMCO_END)
3463                     break;
3464             }
3465             h->mmco_index= i;
3466         }else{
3467             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3468
3469             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3470                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3471                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3472                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3473                 h->mmco_index= 1;
3474                 if (FIELD_PICTURE) {
3475                     h->mmco[0].short_pic_num *= 2;
3476                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3477                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3478                     h->mmco_index= 2;
3479                 }
3480             }
3481         }
3482     }
3483
3484     return 0;
3485 }
3486
3487 static int init_poc(H264Context *h){
3488     MpegEncContext * const s = &h->s;
3489     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3490     int field_poc[2];
3491     Picture *cur = s->current_picture_ptr;
3492
3493     h->frame_num_offset= h->prev_frame_num_offset;
3494     if(h->frame_num < h->prev_frame_num)
3495         h->frame_num_offset += max_frame_num;
3496
3497     if(h->sps.poc_type==0){
3498         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3499
3500         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3501             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3502         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3503             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3504         else
3505             h->poc_msb = h->prev_poc_msb;
3506 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3507         field_poc[0] =
3508         field_poc[1] = h->poc_msb + h->poc_lsb;
3509         if(s->picture_structure == PICT_FRAME)
3510             field_poc[1] += h->delta_poc_bottom;
3511     }else if(h->sps.poc_type==1){
3512         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3513         int i;
3514
3515         if(h->sps.poc_cycle_length != 0)
3516             abs_frame_num = h->frame_num_offset + h->frame_num;
3517         else
3518             abs_frame_num = 0;
3519
3520         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3521             abs_frame_num--;
3522
3523         expected_delta_per_poc_cycle = 0;
3524         for(i=0; i < h->sps.poc_cycle_length; i++)
3525             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3526
3527         if(abs_frame_num > 0){
3528             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3529             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3530
3531             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3532             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3533                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3534         } else
3535             expectedpoc = 0;
3536
3537         if(h->nal_ref_idc == 0)
3538             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3539
3540         field_poc[0] = expectedpoc + h->delta_poc[0];
3541         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3542
3543         if(s->picture_structure == PICT_FRAME)
3544             field_poc[1] += h->delta_poc[1];
3545     }else{
3546         int poc= 2*(h->frame_num_offset + h->frame_num);
3547
3548         if(!h->nal_ref_idc)
3549             poc--;
3550
3551         field_poc[0]= poc;
3552         field_poc[1]= poc;
3553     }
3554
3555     if(s->picture_structure != PICT_BOTTOM_FIELD)
3556         s->current_picture_ptr->field_poc[0]= field_poc[0];
3557     if(s->picture_structure != PICT_TOP_FIELD)
3558         s->current_picture_ptr->field_poc[1]= field_poc[1];
3559     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3560
3561     return 0;
3562 }
3563
3564
3565 /**
3566  * initialize scan tables
3567  */
3568 static void init_scan_tables(H264Context *h){
3569     MpegEncContext * const s = &h->s;
3570     int i;
3571     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3572         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3573         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3574     }else{
3575         for(i=0; i<16; i++){
3576 #define T(x) (x>>2) | ((x<<2) & 0xF)
3577             h->zigzag_scan[i] = T(zigzag_scan[i]);
3578             h-> field_scan[i] = T( field_scan[i]);
3579 #undef T
3580         }
3581     }
3582     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3583         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3584         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3585         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3586         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3587     }else{
3588         for(i=0; i<64; i++){
3589 #define T(x) (x>>3) | ((x&7)<<3)
3590             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3591             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3592             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3593             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3594 #undef T
3595         }
3596     }
3597     if(h->sps.transform_bypass){ //FIXME same ugly
3598         h->zigzag_scan_q0          = zigzag_scan;
3599         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3600         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3601         h->field_scan_q0           = field_scan;
3602         h->field_scan8x8_q0        = field_scan8x8;
3603         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3604     }else{
3605         h->zigzag_scan_q0          = h->zigzag_scan;
3606         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3607         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3608         h->field_scan_q0           = h->field_scan;
3609         h->field_scan8x8_q0        = h->field_scan8x8;
3610         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3611     }
3612 }
3613
3614 /**
3615  * Replicates H264 "master" context to thread contexts.
3616  */
3617 static void clone_slice(H264Context *dst, H264Context *src)
3618 {
3619     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3620     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3621     dst->s.current_picture      = src->s.current_picture;
3622     dst->s.linesize             = src->s.linesize;
3623     dst->s.uvlinesize           = src->s.uvlinesize;
3624     dst->s.first_field          = src->s.first_field;
3625
3626     dst->prev_poc_msb           = src->prev_poc_msb;
3627     dst->prev_poc_lsb           = src->prev_poc_lsb;
3628     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3629     dst->prev_frame_num         = src->prev_frame_num;
3630     dst->short_ref_count        = src->short_ref_count;
3631
3632     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3633     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3634     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3635     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3636
3637     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3638     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3639 }
3640
3641 /**
3642  * decodes a slice header.
3643  * This will also call MPV_common_init() and frame_start() as needed.
3644  *
3645  * @param h h264context
3646  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3647  *
3648  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3649  */
3650 static int decode_slice_header(H264Context *h, H264Context *h0){
3651     MpegEncContext * const s = &h->s;
3652     MpegEncContext * const s0 = &h0->s;
3653     unsigned int first_mb_in_slice;
3654     unsigned int pps_id;
3655     int num_ref_idx_active_override_flag;
3656     unsigned int slice_type, tmp, i, j;
3657     int default_ref_list_done = 0;
3658     int last_pic_structure;
3659
3660     s->dropable= h->nal_ref_idc == 0;
3661
3662     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3663         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3664         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3665     }else{
3666         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3667         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3668     }
3669
3670     first_mb_in_slice= get_ue_golomb(&s->gb);
3671
3672     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3673         h0->current_slice = 0;
3674         if (!s0->first_field)
3675             s->current_picture_ptr= NULL;
3676     }
3677
3678     slice_type= get_ue_golomb_31(&s->gb);
3679     if(slice_type > 9){
3680         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3681         return -1;
3682     }
3683     if(slice_type > 4){
3684         slice_type -= 5;
3685         h->slice_type_fixed=1;
3686     }else
3687         h->slice_type_fixed=0;
3688
3689     slice_type= golomb_to_pict_type[ slice_type ];
3690     if (slice_type == FF_I_TYPE
3691         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3692         default_ref_list_done = 1;
3693     }
3694     h->slice_type= slice_type;
3695     h->slice_type_nos= slice_type & 3;
3696
3697     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3698     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3699         av_log(h->s.avctx, AV_LOG_ERROR,
3700                "B picture before any references, skipping\n");
3701         return -1;
3702     }
3703
3704     pps_id= get_ue_golomb(&s->gb);
3705     if(pps_id>=MAX_PPS_COUNT){
3706         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3707         return -1;
3708     }
3709     if(!h0->pps_buffers[pps_id]) {
3710         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3711         return -1;
3712     }
3713     h->pps= *h0->pps_buffers[pps_id];
3714
3715     if(!h0->sps_buffers[h->pps.sps_id]) {
3716         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3717         return -1;
3718     }
3719     h->sps = *h0->sps_buffers[h->pps.sps_id];
3720
3721     if(h == h0 && h->dequant_coeff_pps != pps_id){
3722         h->dequant_coeff_pps = pps_id;
3723         init_dequant_tables(h);
3724     }
3725
3726     s->mb_width= h->sps.mb_width;
3727     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3728
3729     h->b_stride=  s->mb_width*4;
3730     h->b8_stride= s->mb_width*2;
3731
3732     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3733     if(h->sps.frame_mbs_only_flag)
3734         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3735     else
3736         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3737
3738     if (s->context_initialized
3739         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3740         if(h != h0)
3741             return -1;   // width / height changed during parallelized decoding
3742         free_tables(h);
3743         flush_dpb(s->avctx);
3744         MPV_common_end(s);
3745     }
3746     if (!s->context_initialized) {
3747         if(h != h0)
3748             return -1;  // we cant (re-)initialize context during parallel decoding
3749         if (MPV_common_init(s) < 0)
3750             return -1;
3751         s->first_field = 0;
3752
3753         init_scan_tables(h);
3754         alloc_tables(h);
3755
3756         for(i = 1; i < s->avctx->thread_count; i++) {
3757             H264Context *c;
3758             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3759             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3760             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3761             c->sps = h->sps;
3762             c->pps = h->pps;
3763             init_scan_tables(c);
3764             clone_tables(c, h);
3765         }
3766
3767         for(i = 0; i < s->avctx->thread_count; i++)
3768             if(context_init(h->thread_context[i]) < 0)
3769                 return -1;
3770
3771         s->avctx->width = s->width;
3772         s->avctx->height = s->height;
3773         s->avctx->sample_aspect_ratio= h->sps.sar;
3774         if(!s->avctx->sample_aspect_ratio.den)
3775             s->avctx->sample_aspect_ratio.den = 1;
3776
3777         if(h->sps.timing_info_present_flag){
3778             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3779             if(h->x264_build > 0 && h->x264_build < 44)
3780                 s->avctx->time_base.den *= 2;
3781             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3782                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3783         }
3784     }
3785
3786     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3787
3788     h->mb_mbaff = 0;
3789     h->mb_aff_frame = 0;
3790     last_pic_structure = s0->picture_structure;
3791     if(h->sps.frame_mbs_only_flag){
3792         s->picture_structure= PICT_FRAME;
3793     }else{
3794         if(get_bits1(&s->gb)) { //field_pic_flag
3795             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3796         } else {
3797             s->picture_structure= PICT_FRAME;
3798             h->mb_aff_frame = h->sps.mb_aff;
3799         }
3800     }
3801     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3802
3803     if(h0->current_slice == 0){
3804         while(h->frame_num !=  h->prev_frame_num &&
3805               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3806             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3807             if (frame_start(h) < 0)
3808                 return -1;
3809             h->prev_frame_num++;
3810             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3811             s->current_picture_ptr->frame_num= h->prev_frame_num;
3812             execute_ref_pic_marking(h, NULL, 0);
3813         }
3814
3815         /* See if we have a decoded first field looking for a pair... */
3816         if (s0->first_field) {
3817             assert(s0->current_picture_ptr);
3818             assert(s0->current_picture_ptr->data[0]);
3819             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3820
3821             /* figure out if we have a complementary field pair */
3822             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3823                 /*
3824                  * Previous field is unmatched. Don't display it, but let it
3825                  * remain for reference if marked as such.
3826                  */
3827                 s0->current_picture_ptr = NULL;
3828                 s0->first_field = FIELD_PICTURE;
3829
3830             } else {
3831                 if (h->nal_ref_idc &&
3832                         s0->current_picture_ptr->reference &&
3833                         s0->current_picture_ptr->frame_num != h->frame_num) {
3834                     /*
3835                      * This and previous field were reference, but had
3836                      * different frame_nums. Consider this field first in
3837                      * pair. Throw away previous field except for reference
3838                      * purposes.
3839                      */
3840                     s0->first_field = 1;
3841                     s0->current_picture_ptr = NULL;
3842
3843                 } else {
3844                     /* Second field in complementary pair */
3845                     s0->first_field = 0;
3846                 }
3847             }
3848
3849         } else {
3850             /* Frame or first field in a potentially complementary pair */
3851             assert(!s0->current_picture_ptr);
3852             s0->first_field = FIELD_PICTURE;
3853         }
3854
3855         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3856             s0->first_field = 0;
3857             return -1;
3858         }
3859     }
3860     if(h != h0)
3861         clone_slice(h, h0);
3862
3863     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3864
3865     assert(s->mb_num == s->mb_width * s->mb_height);
3866     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3867        first_mb_in_slice                    >= s->mb_num){
3868         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3869         return -1;
3870     }
3871     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3872     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3873     if (s->picture_structure == PICT_BOTTOM_FIELD)
3874         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3875     assert(s->mb_y < s->mb_height);
3876
3877     if(s->picture_structure==PICT_FRAME){
3878         h->curr_pic_num=   h->frame_num;
3879         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3880     }else{
3881         h->curr_pic_num= 2*h->frame_num + 1;
3882         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3883     }
3884
3885     if(h->nal_unit_type == NAL_IDR_SLICE){
3886         get_ue_golomb(&s->gb); /* idr_pic_id */
3887     }
3888
3889     if(h->sps.poc_type==0){
3890         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3891
3892         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3893             h->delta_poc_bottom= get_se_golomb(&s->gb);
3894         }
3895     }
3896
3897     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3898         h->delta_poc[0]= get_se_golomb(&s->gb);
3899
3900         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3901             h->delta_poc[1]= get_se_golomb(&s->gb);
3902     }
3903
3904     init_poc(h);
3905
3906     if(h->pps.redundant_pic_cnt_present){
3907         h->redundant_pic_count= get_ue_golomb(&s->gb);
3908     }
3909
3910     //set defaults, might be overridden a few lines later
3911     h->ref_count[0]= h->pps.ref_count[0];
3912     h->ref_count[1]= h->pps.ref_count[1];
3913
3914     if(h->slice_type_nos != FF_I_TYPE){
3915         if(h->slice_type_nos == FF_B_TYPE){
3916             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3917         }
3918         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3919
3920         if(num_ref_idx_active_override_flag){
3921             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3922             if(h->slice_type_nos==FF_B_TYPE)
3923                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3924
3925             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3926                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3927                 h->ref_count[0]= h->ref_count[1]= 1;
3928                 return -1;
3929             }
3930         }
3931         if(h->slice_type_nos == FF_B_TYPE)
3932             h->list_count= 2;
3933         else
3934             h->list_count= 1;
3935     }else
3936         h->list_count= 0;
3937
3938     if(!default_ref_list_done){
3939         fill_default_ref_list(h);
3940     }
3941
3942     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3943         return -1;
3944
3945     if(h->slice_type_nos!=FF_I_TYPE){
3946         s->last_picture_ptr= &h->ref_list[0][0];
3947         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3948     }
3949     if(h->slice_type_nos==FF_B_TYPE){
3950         s->next_picture_ptr= &h->ref_list[1][0];
3951         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3952     }
3953
3954     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3955        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3956         pred_weight_table(h);
3957     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3958         implicit_weight_table(h);
3959     else {
3960         h->use_weight = 0;
3961         for (i = 0; i < 2; i++) {
3962             h->luma_weight_flag[i]   = 0;
3963             h->chroma_weight_flag[i] = 0;
3964         }
3965     }
3966
3967     if(h->nal_ref_idc)
3968         decode_ref_pic_marking(h0, &s->gb);
3969
3970     if(FRAME_MBAFF)
3971         fill_mbaff_ref_list(h);
3972
3973     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3974         direct_dist_scale_factor(h);
3975     direct_ref_list_init(h);
3976
3977     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3978         tmp = get_ue_golomb_31(&s->gb);
3979         if(tmp > 2){
3980             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3981             return -1;
3982         }
3983         h->cabac_init_idc= tmp;
3984     }
3985
3986     h->last_qscale_diff = 0;
3987     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3988     if(tmp>51){
3989         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3990         return -1;
3991     }
3992     s->qscale= tmp;
3993     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3994     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3995     //FIXME qscale / qp ... stuff
3996     if(h->slice_type == FF_SP_TYPE){
3997         get_bits1(&s->gb); /* sp_for_switch_flag */
3998     }
3999     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4000         get_se_golomb(&s->gb); /* slice_qs_delta */
4001     }
4002
4003     h->deblocking_filter = 1;
4004     h->slice_alpha_c0_offset = 0;
4005     h->slice_beta_offset = 0;
4006     if( h->pps.deblocking_filter_parameters_present ) {
4007         tmp= get_ue_golomb_31(&s->gb);
4008         if(tmp > 2){
4009             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4010             return -1;
4011         }
4012         h->deblocking_filter= tmp;
4013         if(h->deblocking_filter < 2)
4014             h->deblocking_filter^= 1; // 1<->0
4015
4016         if( h->deblocking_filter ) {
4017             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4018             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4019         }
4020     }
4021
4022     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4023        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4024        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4025        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4026         h->deblocking_filter= 0;
4027
4028     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4029         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4030             /* Cheat slightly for speed:
4031                Do not bother to deblock across slices. */
4032             h->deblocking_filter = 2;
4033         } else {
4034             h0->max_contexts = 1;
4035             if(!h0->single_decode_warning) {
4036                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4037                 h0->single_decode_warning = 1;
4038             }
4039             if(h != h0)
4040                 return 1; // deblocking switched inside frame
4041         }
4042     }
4043
4044 #if 0 //FMO
4045     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4046         slice_group_change_cycle= get_bits(&s->gb, ?);
4047 #endif
4048
4049     h0->last_slice_type = slice_type;
4050     h->slice_num = ++h0->current_slice;
4051     if(h->slice_num >= MAX_SLICES){
4052         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4053     }
4054
4055     for(j=0; j<2; j++){
4056         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4057         ref2frm[0]=
4058         ref2frm[1]= -1;
4059         for(i=0; i<16; i++)
4060             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4061                           +(h->ref_list[j][i].reference&3);
4062         ref2frm[18+0]=
4063         ref2frm[18+1]= -1;
4064         for(i=16; i<48; i++)
4065             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4066                           +(h->ref_list[j][i].reference&3);
4067     }
4068
4069     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4070     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4071
4072     s->avctx->refs= h->sps.ref_frame_count;
4073
4074     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4075         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4076                h->slice_num,
4077                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4078                first_mb_in_slice,
4079                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4080                pps_id, h->frame_num,
4081                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4082                h->ref_count[0], h->ref_count[1],
4083                s->qscale,
4084                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4085                h->use_weight,
4086                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4087                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4088                );
4089     }
4090
4091     return 0;
4092 }
4093
4094 /**
4095  *
4096  */
4097 static inline int get_level_prefix(GetBitContext *gb){
4098     unsigned int buf;
4099     int log;
4100
4101     OPEN_READER(re, gb);
4102     UPDATE_CACHE(re, gb);
4103     buf=GET_CACHE(re, gb);
4104
4105     log= 32 - av_log2(buf);
4106 #ifdef TRACE
4107     print_bin(buf>>(32-log), log);
4108     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4109 #endif
4110
4111     LAST_SKIP_BITS(re, gb, log);
4112     CLOSE_READER(re, gb);
4113
4114     return log-1;
4115 }
4116
4117 static inline int get_dct8x8_allowed(H264Context *h){
4118     if(h->sps.direct_8x8_inference_flag)
4119         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4120     else
4121         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4122 }
4123
4124 /**
4125  * decodes a residual block.
      *
      * Reads, in this order, the coefficient token (total coefficient count and
      * number of trailing ones), the sign bits of the trailing ones, the
      * remaining levels, total_zeros and the run_before values, and writes the
      * resulting coefficients into block[] at the positions given by scantable.
      * For n <= 24 every level is stored as (level * qmul[j] + 32) >> 6; for
      * n > 24 the level is stored unscaled and qmul is not read.
      *
      * @param h h264 context
      * @param gb bitstream reader the block is parsed from
      * @param block destination coefficient array, indexed through scantable
4126  * @param n block index
4127  * @param scantable scantable
      * @param qmul per-position dequantization multipliers (only used for n <= 24)
4128  * @param max_coeff number of coefficients in the block
4129  * @return <0 if an error occurred
4130  */
4131 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4132     MpegEncContext * const s = &h->s;
         /* maps a predicted non-zero count (0..16) to one of the 4 coeff_token VLC tables */
4133     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4134     int level[16];
4135     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4136
4137     //FIXME put trailing_onex into the context
4138
         /* coeff_token packs total_coeff in its upper bits and trailing_ones in
            the low 2 bits; the VLC table depends on the block kind and, for
            non chroma DC blocks, on the predicted non-zero count */
4139     if(n == CHROMA_DC_BLOCK_INDEX){
4140         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4141         total_coeff= coeff_token>>2;
4142     }else{
4143         if(n == LUMA_DC_BLOCK_INDEX){
4144             total_coeff= pred_non_zero_count(h, 0);
4145             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4146             total_coeff= coeff_token>>2;
4147         }else{
4148             total_coeff= pred_non_zero_count(h, n);
4149             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4150             total_coeff= coeff_token>>2;
                 /* only this branch records the count in the non-zero count cache */
4151             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4152         }
4153     }
4154
4155     //FIXME set last_non_zero?
4156
4157     if(total_coeff==0)
4158         return 0;
         /* the unsigned comparison also rejects a negative total_coeff */
4159     if(total_coeff > (unsigned)max_coeff) {
4160         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4161         return -1;
4162     }
4163
4164     trailing_ones= coeff_token&3;
4165     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4166     assert(total_coeff<=16);
4167
         /* peek 3 sign bits but consume only trailing_ones of them; level[0..2]
            become -1 where the bit is set and +1 otherwise (entries beyond
            trailing_ones are overwritten below if they are used) */
4168     i = show_bits(gb, 3);
4169     skip_bits(gb, trailing_ones);
4170     level[0] = 1-((i&4)>>1);
4171     level[1] = 1-((i&2)   );
4172     level[2] = 1-((i&1)<<1);
4173
4174     if(trailing_ones<total_coeff) {
4175         int mask, prefix;
             /* initial suffix_length is 1 for more than 10 coefficients with fewer than 3 trailing ones, else 0 */
4176         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4177         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4178         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4179
             /* table entry: [0] is the level or an escape marker, [1] the number of bits to consume */
4180         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
             /* values >= 100 are escapes carrying prefix+100; the level is then assembled by hand */
4181         if(level_code >= 100){
4182             prefix= level_code - 100;
4183             if(prefix == LEVEL_TAB_BITS)
4184                 prefix += get_level_prefix(gb);
4185
4186             //first coefficient has suffix_length equal to 0 or 1
4187             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4188                 if(suffix_length)
4189                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4190                 else
4191                     level_code= (prefix<<suffix_length); //part
4192             }else if(prefix==14){
4193                 if(suffix_length)
4194                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4195                 else
4196                     level_code= prefix + get_bits(gb, 4); //part
4197             }else{
4198                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4199                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4200                 if(prefix>=16)
4201                     level_code += (1<<(prefix-3))-4096;
4202             }
4203
4204             if(trailing_ones < 3) level_code += 2;
4205
4206             suffix_length = 2;
                 /* odd level_code -> negative level; magnitude is (level_code+2)>>1
                    (mask is 0 or -1, so (x ^ mask) - mask is x or -x) */
4207             mask= -(level_code&1);
4208             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4209         }else{
                 /* table hit: level_code already is the signed level; with fewer
                    than 3 trailing ones it is moved one step away from zero */
4210             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4211
4212             suffix_length = 1;
                 /* unsigned trick: true when level_code is outside [-3, 3] */
4213             if(level_code + 3U > 6U)
4214                 suffix_length++;
4215             level[trailing_ones]= level_code;
4216         }
4217
4218         //remaining coefficients have suffix_length > 0
4219         for(i=trailing_ones+1;i<total_coeff;i++) {
4220             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4221             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4222             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4223
4224             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4225             if(level_code >= 100){
4226                 prefix= level_code - 100;
4227                 if(prefix == LEVEL_TAB_BITS){
4228                     prefix += get_level_prefix(gb);
4229                 }
4230                 if(prefix<15){
4231                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4232                 }else{
4233                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4234                     if(prefix>=16)
4235                         level_code += (1<<(prefix-3))-4096;
4236                 }
4237                 mask= -(level_code&1);
4238                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4239             }
4240             level[i]= level_code;
4241
                 /* unsigned trick: true when |level_code| exceeds suffix_limit[suffix_length] */
4242             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4243                 suffix_length++;
4244         }
4245     }
4246
         /* a completely filled block cannot contain zeros; otherwise total_zeros is coded */
4247     if(total_coeff == max_coeff)
4248         zeros_left=0;
4249     else{
4250         if(n == CHROMA_DC_BLOCK_INDEX)
4251             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4252         else
4253             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4254     }
4255
         /* level[0] sits at the highest scan position; every further level is
            placed 1 + run_before positions earlier in scan order.
            NOTE(review): zeros_left < 0 is only checked after the loops, while
            scantable[coeff_num] is indexed inside them — confirm coeff_num
            cannot become negative on corrupt input. */
4256     coeff_num = zeros_left + total_coeff - 1;
4257     j = scantable[coeff_num];
         /* n > 24: levels are stored unscaled (presumably the DC block indices — TODO confirm) */
4258     if(n > 24){
4259         block[j] = level[0];
4260         for(i=1;i<total_coeff;i++) {
4261             if(zeros_left <= 0)
4262                 run_before = 0;
4263             else if(zeros_left < 7){
4264                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4265             }else{
4266                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4267             }
4268             zeros_left -= run_before;
4269             coeff_num -= 1 + run_before;
4270             j= scantable[ coeff_num ];
4271
4272             block[j]= level[i];
4273         }
4274     }else{
             /* same placement as above, with each level scaled by qmul and rounded */
4275         block[j] = (level[0] * qmul[j] + 32)>>6;
4276         for(i=1;i<total_coeff;i++) {
4277             if(zeros_left <= 0)
4278                 run_before = 0;
4279             else if(zeros_left < 7){
4280                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4281             }else{
4282                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4283             }
4284             zeros_left -= run_before;
4285             coeff_num -= 1 + run_before;
4286             j= scantable[ coeff_num ];
4287
4288             block[j]= (level[i] * qmul[j] + 32)>>6;
4289         }
4290     }
4291
         /* more zeros consumed by the runs than total_zeros announced */
4292     if(zeros_left<0){
4293         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4294         return -1;
4295     }
4296
4297     return 0;
4298 }
4299
4300 static void predict_field_decoding_flag(H264Context *h){
4301     MpegEncContext * const s = &h->s;
4302     const int mb_xy= h->mb_xy;
4303     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4304                 ? s->current_picture.mb_type[mb_xy-1]
4305                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4306                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4307                 : 0;
4308     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4309 }
4310
4311 /**
4312  * decodes a P_SKIP or B_SKIP macroblock
4313  */
4314 static void decode_mb_skip(H264Context *h){
4315     MpegEncContext * const s = &h->s;
4316     const int mb_xy= h->mb_xy;
4317     int mb_type=0;
4318
4319     memset(h->non_zero_count[mb_xy], 0, 16);
4320     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4321
4322     if(MB_FIELD)
4323         mb_type|= MB_TYPE_INTERLACED;
4324
4325     if( h->slice_type_nos == FF_B_TYPE )
4326     {
4327         // just for fill_caches. pred_direct_motion will set the real mb_type
4328         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4329
4330         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4331         pred_direct_motion(h, &mb_type);
4332         mb_type|= MB_TYPE_SKIP;
4333     }
4334     else
4335     {
4336         int mx, my;
4337         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4338
4339         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4340         pred_pskip_motion(h, &mx, &my);
4341         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4342         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4343     }
4344
4345     write_back_motion(h, mb_type);
4346     s->current_picture.mb_type[mb_xy]= mb_type;
4347     s->current_picture.qscale_table[mb_xy]= s->qscale;
4348     h->slice_table[ mb_xy ]= h->slice_num;
4349     h->prev_mb_skipped= 1;
4350 }
4351
4352 /**
4353  * decodes a macroblock
4354  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4355  */
4356 static int decode_mb_cavlc(H264Context *h){
4357     MpegEncContext * const s = &h->s;
4358     int mb_xy;
4359     int partition_count;
4360     unsigned int mb_type, cbp;
4361     int dct8x8_allowed= h->pps.transform_8x8_mode;
4362
4363     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4364
4365     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4366     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4367                 down the code */
4368     if(h->slice_type_nos != FF_I_TYPE){
4369         if(s->mb_skip_run==-1)
4370             s->mb_skip_run= get_ue_golomb(&s->gb);
4371
4372         if (s->mb_skip_run--) {
4373             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4374                 if(s->mb_skip_run==0)
4375                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4376                 else
4377                     predict_field_decoding_flag(h);
4378             }
4379             decode_mb_skip(h);
4380             return 0;
4381         }
4382     }
4383     if(FRAME_MBAFF){
4384         if( (s->mb_y&1) == 0 )
4385             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4386     }
4387
4388     h->prev_mb_skipped= 0;
4389
4390     mb_type= get_ue_golomb(&s->gb);
4391     if(h->slice_type_nos == FF_B_TYPE){
4392         if(mb_type < 23){
4393             partition_count= b_mb_type_info[mb_type].partition_count;
4394             mb_type=         b_mb_type_info[mb_type].type;
4395         }else{
4396             mb_type -= 23;
4397             goto decode_intra_mb;
4398         }
4399     }else if(h->slice_type_nos == FF_P_TYPE){
4400         if(mb_type < 5){
4401             partition_count= p_mb_type_info[mb_type].partition_count;
4402             mb_type=         p_mb_type_info[mb_type].type;
4403         }else{
4404             mb_type -= 5;
4405             goto decode_intra_mb;
4406         }
4407     }else{
4408        assert(h->slice_type_nos == FF_I_TYPE);
4409         if(h->slice_type == FF_SI_TYPE && mb_type)
4410             mb_type--;
4411 decode_intra_mb:
4412         if(mb_type > 25){
4413             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4414             return -1;
4415         }
4416         partition_count=0;
4417         cbp= i_mb_type_info[mb_type].cbp;
4418         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4419         mb_type= i_mb_type_info[mb_type].type;
4420     }
4421
4422     if(MB_FIELD)
4423         mb_type |= MB_TYPE_INTERLACED;
4424
4425     h->slice_table[ mb_xy ]= h->slice_num;
4426
4427     if(IS_INTRA_PCM(mb_type)){
4428         unsigned int x;
4429
4430         // We assume these blocks are very rare so we do not optimize it.
4431         align_get_bits(&s->gb);
4432
4433         // The pixels are stored in the same order as levels in h->mb array.
4434         for(x=0; x < (CHROMA ? 384 : 256); x++){
4435             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4436         }
4437
4438         // In deblocking, the quantizer is 0
4439         s->current_picture.qscale_table[mb_xy]= 0;
4440         // All coeffs are present
4441         memset(h->non_zero_count[mb_xy], 16, 16);
4442
4443         s->current_picture.mb_type[mb_xy]= mb_type;
4444         return 0;
4445     }
4446
4447     if(MB_MBAFF){
4448         h->ref_count[0] <<= 1;
4449         h->ref_count[1] <<= 1;
4450     }
4451
4452     fill_caches(h, mb_type, 0);
4453
4454     //mb_pred
4455     if(IS_INTRA(mb_type)){
4456         int pred_mode;
4457 //            init_top_left_availability(h);
4458         if(IS_INTRA4x4(mb_type)){
4459             int i;
4460             int di = 1;
4461             if(dct8x8_allowed && get_bits1(&s->gb)){
4462                 mb_type |= MB_TYPE_8x8DCT;
4463                 di = 4;
4464             }
4465
4466 //                fill_intra4x4_pred_table(h);
4467             for(i=0; i<16; i+=di){
4468                 int mode= pred_intra_mode(h, i);
4469
4470                 if(!get_bits1(&s->gb)){
4471                     const int rem_mode= get_bits(&s->gb, 3);
4472                     mode = rem_mode + (rem_mode >= mode);
4473                 }
4474
4475                 if(di==4)
4476                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4477                 else
4478                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4479             }
4480             write_back_intra_pred_mode(h);
4481             if( check_intra4x4_pred_mode(h) < 0)
4482                 return -1;
4483         }else{
4484             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4485             if(h->intra16x16_pred_mode < 0)
4486                 return -1;
4487         }
4488         if(CHROMA){
4489             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4490             if(pred_mode < 0)
4491                 return -1;
4492             h->chroma_pred_mode= pred_mode;
4493         }
4494     }else if(partition_count==4){
4495         int i, j, sub_partition_count[4], list, ref[2][4];
4496
4497         if(h->slice_type_nos == FF_B_TYPE){
4498             for(i=0; i<4; i++){
4499                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4500                 if(h->sub_mb_type[i] >=13){
4501                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4502                     return -1;
4503                 }
4504                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4505                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4506             }
4507             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4508                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4509                 pred_direct_motion(h, &mb_type);
4510                 h->ref_cache[0][scan8[4]] =
4511                 h->ref_cache[1][scan8[4]] =
4512                 h->ref_cache[0][scan8[12]] =
4513                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4514             }
4515         }else{
4516             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4517             for(i=0; i<4; i++){
4518                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4519                 if(h->sub_mb_type[i] >=4){
4520                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4521                     return -1;
4522                 }
4523                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4524                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4525             }
4526         }
4527
4528         for(list=0; list<h->list_count; list++){
4529             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4530             for(i=0; i<4; i++){
4531                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4532                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4533                     unsigned int tmp;
4534                     if(ref_count == 1){
4535                         tmp= 0;
4536                     }else if(ref_count == 2){
4537                         tmp= get_bits1(&s->gb)^1;
4538                     }else{
4539                         tmp= get_ue_golomb_31(&s->gb);
4540                         if(tmp>=ref_count){
4541                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4542                             return -1;
4543                         }
4544                     }
4545                     ref[list][i]= tmp;
4546                 }else{
4547                  //FIXME
4548                     ref[list][i] = -1;
4549                 }
4550             }
4551         }
4552
4553         if(dct8x8_allowed)
4554             dct8x8_allowed = get_dct8x8_allowed(h);
4555
4556         for(list=0; list<h->list_count; list++){
4557             for(i=0; i<4; i++){
4558                 if(IS_DIRECT(h->sub_mb_type[i])) {
4559                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4560                     continue;
4561                 }
4562                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4563                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4564
4565                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4566                     const int sub_mb_type= h->sub_mb_type[i];
4567                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4568                     for(j=0; j<sub_partition_count[i]; j++){
4569                         int mx, my;
4570                         const int index= 4*i + block_width*j;
4571                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4572                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4573                         mx += get_se_golomb(&s->gb);
4574                         my += get_se_golomb(&s->gb);
4575                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4576
4577                         if(IS_SUB_8X8(sub_mb_type)){
4578                             mv_cache[ 1 ][0]=
4579                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4580                             mv_cache[ 1 ][1]=
4581                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4582                         }else if(IS_SUB_8X4(sub_mb_type)){
4583                             mv_cache[ 1 ][0]= mx;
4584                             mv_cache[ 1 ][1]= my;
4585                         }else if(IS_SUB_4X8(sub_mb_type)){
4586                             mv_cache[ 8 ][0]= mx;
4587                             mv_cache[ 8 ][1]= my;
4588                         }
4589                         mv_cache[ 0 ][0]= mx;
4590                         mv_cache[ 0 ][1]= my;
4591                     }
4592                 }else{
4593                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4594                     p[0] = p[1]=
4595                     p[8] = p[9]= 0;
4596                 }
4597             }
4598         }
4599     }else if(IS_DIRECT(mb_type)){
4600         pred_direct_motion(h, &mb_type);
4601         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4602     }else{
4603         int list, mx, my, i;
4604          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4605         if(IS_16X16(mb_type)){
4606             for(list=0; list<h->list_count; list++){
4607                     unsigned int val;
4608                     if(IS_DIR(mb_type, 0, list)){
4609                         if(h->ref_count[list]==1){
4610                             val= 0;
4611                         }else if(h->ref_count[list]==2){
4612                             val= get_bits1(&s->gb)^1;
4613                         }else{
4614                             val= get_ue_golomb_31(&s->gb);
4615                             if(val >= h->ref_count[list]){
4616                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4617                                 return -1;
4618                             }
4619                         }
4620                     }else
4621                         val= LIST_NOT_USED&0xFF;
4622                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4623             }
4624             for(list=0; list<h->list_count; list++){
4625                 unsigned int val;
4626                 if(IS_DIR(mb_type, 0, list)){
4627                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4628                     mx += get_se_golomb(&s->gb);
4629                     my += get_se_golomb(&s->gb);
4630                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4631
4632                     val= pack16to32(mx,my);
4633                 }else
4634                     val=0;
4635                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4636             }
4637         }
4638         else if(IS_16X8(mb_type)){
4639             for(list=0; list<h->list_count; list++){
4640                     for(i=0; i<2; i++){
4641                         unsigned int val;
4642                         if(IS_DIR(mb_type, i, list)){
4643                             if(h->ref_count[list] == 1){
4644                                 val= 0;
4645                             }else if(h->ref_count[list] == 2){
4646                                 val= get_bits1(&s->gb)^1;
4647                             }else{
4648                                 val= get_ue_golomb_31(&s->gb);
4649                                 if(val >= h->ref_count[list]){
4650                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4651                                     return -1;
4652                                 }
4653                             }
4654                         }else
4655                             val= LIST_NOT_USED&0xFF;
4656                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4657                     }
4658             }
4659             for(list=0; list<h->list_count; list++){
4660                 for(i=0; i<2; i++){
4661                     unsigned int val;
4662                     if(IS_DIR(mb_type, i, list)){
4663                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4664                         mx += get_se_golomb(&s->gb);
4665                         my += get_se_golomb(&s->gb);
4666                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4667
4668                         val= pack16to32(mx,my);
4669                     }else
4670                         val=0;
4671                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4672                 }
4673             }
4674         }else{
4675             assert(IS_8X16(mb_type));
4676             for(list=0; list<h->list_count; list++){
4677                     for(i=0; i<2; i++){
4678                         unsigned int val;
4679                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4680                             if(h->ref_count[list]==1){
4681                                 val= 0;
4682                             }else if(h->ref_count[list]==2){
4683                                 val= get_bits1(&s->gb)^1;
4684                             }else{
4685                                 val= get_ue_golomb_31(&s->gb);
4686                                 if(val >= h->ref_count[list]){
4687                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4688                                     return -1;
4689                                 }
4690                             }
4691                         }else
4692                             val= LIST_NOT_USED&0xFF;
4693                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4694                     }
4695             }
4696             for(list=0; list<h->list_count; list++){
4697                 for(i=0; i<2; i++){
4698                     unsigned int val;
4699                     if(IS_DIR(mb_type, i, list)){
4700                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4701                         mx += get_se_golomb(&s->gb);
4702                         my += get_se_golomb(&s->gb);
4703                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4704
4705                         val= pack16to32(mx,my);
4706                     }else
4707                         val=0;
4708                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4709                 }
4710             }
4711         }
4712     }
4713
4714     if(IS_INTER(mb_type))
4715         write_back_motion(h, mb_type);
4716
4717     if(!IS_INTRA16x16(mb_type)){
4718         cbp= get_ue_golomb(&s->gb);
4719         if(cbp > 47){
4720             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4721             return -1;
4722         }
4723
4724         if(CHROMA){
4725             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4726             else                     cbp= golomb_to_inter_cbp   [cbp];
4727         }else{
4728             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4729             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4730         }
4731     }
4732     h->cbp = cbp;
4733
4734     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4735         if(get_bits1(&s->gb)){
4736             mb_type |= MB_TYPE_8x8DCT;
4737             h->cbp_table[mb_xy]= cbp;
4738         }
4739     }
4740     s->current_picture.mb_type[mb_xy]= mb_type;
4741
4742     if(cbp || IS_INTRA16x16(mb_type)){
4743         int i8x8, i4x4, chroma_idx;
4744         int dquant;
4745         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4746         const uint8_t *scan, *scan8x8, *dc_scan;
4747
4748 //        fill_non_zero_count_cache(h);
4749
4750         if(IS_INTERLACED(mb_type)){
4751             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4752             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4753             dc_scan= luma_dc_field_scan;
4754         }else{
4755             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4756             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4757             dc_scan= luma_dc_zigzag_scan;
4758         }
4759
4760         dquant= get_se_golomb(&s->gb);
4761
4762         if( dquant > 25 || dquant < -26 ){
4763             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4764             return -1;
4765         }
4766
4767         s->qscale += dquant;
4768         if(((unsigned)s->qscale) > 51){
4769             if(s->qscale<0) s->qscale+= 52;
4770             else            s->qscale-= 52;
4771         }
4772
4773         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4774         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4775         if(IS_INTRA16x16(mb_type)){
4776             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4777                 return -1; //FIXME continue if partitioned and other return -1 too
4778             }
4779
4780             assert((cbp&15) == 0 || (cbp&15) == 15);
4781
4782             if(cbp&15){
4783                 for(i8x8=0; i8x8<4; i8x8++){
4784                     for(i4x4=0; i4x4<4; i4x4++){
4785                         const int index= i4x4 + 4*i8x8;
4786                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4787                             return -1;
4788                         }
4789                     }
4790                 }
4791             }else{
4792                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4793             }
4794         }else{
4795             for(i8x8=0; i8x8<4; i8x8++){
4796                 if(cbp & (1<<i8x8)){
4797                     if(IS_8x8DCT(mb_type)){
4798                         DCTELEM *buf = &h->mb[64*i8x8];
4799                         uint8_t *nnz;
4800                         for(i4x4=0; i4x4<4; i4x4++){
4801                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4802                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4803                                 return -1;
4804                         }
4805                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4806                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4807                     }else{
4808                         for(i4x4=0; i4x4<4; i4x4++){
4809                             const int index= i4x4 + 4*i8x8;
4810
4811                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4812                                 return -1;
4813                             }
4814                         }
4815                     }
4816                 }else{
4817                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4818                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4819                 }
4820             }
4821         }
4822
4823         if(cbp&0x30){
4824             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4825                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4826                     return -1;
4827                 }
4828         }
4829
4830         if(cbp&0x20){
4831             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4832                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4833                 for(i4x4=0; i4x4<4; i4x4++){
4834                     const int index= 16 + 4*chroma_idx + i4x4;
4835                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4836                         return -1;
4837                     }
4838                 }
4839             }
4840         }else{
4841             uint8_t * const nnz= &h->non_zero_count_cache[0];
4842             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4843             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4844         }
4845     }else{
4846         uint8_t * const nnz= &h->non_zero_count_cache[0];
4847         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4848         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4849         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4850     }
4851     s->current_picture.qscale_table[mb_xy]= s->qscale;
4852     write_back_non_zero_count(h);
4853
4854     if(MB_MBAFF){
4855         h->ref_count[0] >>= 1;
4856         h->ref_count[1] >>= 1;
4857     }
4858
4859     return 0;
4860 }
4861
4862 static int decode_cabac_field_decoding_flag(H264Context *h) {
4863     MpegEncContext * const s = &h->s;
4864     const int mb_x = s->mb_x;
4865     const int mb_y = s->mb_y & ~1;
4866     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4867     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4868
4869     unsigned int ctx = 0;
4870
4871     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4872         ctx += 1;
4873     }
4874     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4875         ctx += 1;
4876     }
4877
4878     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4879 }
4880
4881 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4882     uint8_t *state= &h->cabac_state[ctx_base];
4883     int mb_type;
4884
4885     if(intra_slice){
4886         MpegEncContext * const s = &h->s;
4887         const int mba_xy = h->left_mb_xy[0];
4888         const int mbb_xy = h->top_mb_xy;
4889         int ctx=0;
4890         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4891             ctx++;
4892         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4893             ctx++;
4894         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4895             return 0;   /* I4x4 */
4896         state += 2;
4897     }else{
4898         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4899             return 0;   /* I4x4 */
4900     }
4901
4902     if( get_cabac_terminate( &h->cabac ) )
4903         return 25;  /* PCM */
4904
4905     mb_type = 1; /* I16x16 */
4906     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4907     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4908         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4909     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4910     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4911     return mb_type;
4912 }
4913
4914 static int decode_cabac_mb_type_b( H264Context *h ) {
4915     MpegEncContext * const s = &h->s;
4916
4917         const int mba_xy = h->left_mb_xy[0];
4918         const int mbb_xy = h->top_mb_xy;
4919         int ctx = 0;
4920         int bits;
4921         assert(h->slice_type_nos == FF_B_TYPE);
4922
4923         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4924             ctx++;
4925         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4926             ctx++;
4927
4928         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4929             return 0; /* B_Direct_16x16 */
4930
4931         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4932             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4933         }
4934
4935         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4936         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4937         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4938         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4939         if( bits < 8 )
4940             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4941         else if( bits == 13 ) {
4942             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4943         } else if( bits == 14 )
4944             return 11; /* B_L1_L0_8x16 */
4945         else if( bits == 15 )
4946             return 22; /* B_8x8 */
4947
4948         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4949         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4950 }
4951
4952 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4953     MpegEncContext * const s = &h->s;
4954     int mba_xy, mbb_xy;
4955     int ctx = 0;
4956
4957     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4958         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4959         mba_xy = mb_xy - 1;
4960         if( (mb_y&1)
4961             && h->slice_table[mba_xy] == h->slice_num
4962             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4963             mba_xy += s->mb_stride;
4964         if( MB_FIELD ){
4965             mbb_xy = mb_xy - s->mb_stride;
4966             if( !(mb_y&1)
4967                 && h->slice_table[mbb_xy] == h->slice_num
4968                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4969                 mbb_xy -= s->mb_stride;
4970         }else
4971             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4972     }else{
4973         int mb_xy = h->mb_xy;
4974         mba_xy = mb_xy - 1;
4975         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4976     }
4977
4978     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4979         ctx++;
4980     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4981         ctx++;
4982
4983     if( h->slice_type_nos == FF_B_TYPE )
4984         ctx += 13;
4985     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4986 }
4987
4988 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4989     int mode = 0;
4990
4991     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4992         return pred_mode;
4993
4994     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4995     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4996     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4997
4998     if( mode >= pred_mode )
4999         return mode + 1;
5000     else
5001         return mode;
5002 }
5003
5004 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5005     const int mba_xy = h->left_mb_xy[0];
5006     const int mbb_xy = h->top_mb_xy;
5007
5008     int ctx = 0;
5009
5010     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5011     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5012         ctx++;
5013
5014     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5015         ctx++;
5016
5017     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5018         return 0;
5019
5020     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5021         return 1;
5022     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5023         return 2;
5024     else
5025         return 3;
5026 }
5027
5028 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5029     int cbp_b, cbp_a, ctx, cbp = 0;
5030
5031     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5032     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5033
5034     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5035     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5036     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5037     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5038     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5039     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5040     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5041     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5042     return cbp;
5043 }
5044 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5045     int ctx;
5046     int cbp_a, cbp_b;
5047
5048     cbp_a = (h->left_cbp>>4)&0x03;
5049     cbp_b = (h-> top_cbp>>4)&0x03;
5050
5051     ctx = 0;
5052     if( cbp_a > 0 ) ctx++;
5053     if( cbp_b > 0 ) ctx += 2;
5054     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5055         return 0;
5056
5057     ctx = 4;
5058     if( cbp_a == 2 ) ctx++;
5059     if( cbp_b == 2 ) ctx += 2;
5060     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5061 }
5062 static int decode_cabac_mb_dqp( H264Context *h) {
5063     int   ctx= h->last_qscale_diff != 0;
5064     int   val = 0;
5065
5066     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5067         ctx= 2+(ctx>>1);
5068         val++;
5069         if(val > 102) //prevent infinite loop
5070             return INT_MIN;
5071     }
5072
5073     if( val&0x01 )
5074         return   (val + 1)>>1 ;
5075     else
5076         return -((val + 1)>>1);
5077 }
5078 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5079     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5080         return 0;   /* 8x8 */
5081     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5082         return 1;   /* 8x4 */
5083     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5084         return 2;   /* 4x8 */
5085     return 3;       /* 4x4 */
5086 }
5087 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5088     int type;
5089     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5090         return 0;   /* B_Direct_8x8 */
5091     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5092         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5093     type = 3;
5094     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5095         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5096             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5097         type += 4;
5098     }
5099     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5100     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5101     return type;
5102 }
5103
5104 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5105     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5106 }
5107
5108 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5109     int refa = h->ref_cache[list][scan8[n] - 1];
5110     int refb = h->ref_cache[list][scan8[n] - 8];
5111     int ref  = 0;
5112     int ctx  = 0;
5113
5114     if( h->slice_type_nos == FF_B_TYPE) {
5115         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5116             ctx++;
5117         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5118             ctx += 2;
5119     } else {
5120         if( refa > 0 )
5121             ctx++;
5122         if( refb > 0 )
5123             ctx += 2;
5124     }
5125
5126     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5127         ref++;
5128         ctx = (ctx>>2)+4;
5129         if(ref >= 32 /*h->ref_list[list]*/){
5130             return -1;
5131         }
5132     }
5133     return ref;
5134 }
5135
5136 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5137     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5138                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5139     int ctxbase = (l == 0) ? 40 : 47;
5140     int mvd;
5141     int ctx = (amvd>2) + (amvd>32);
5142
5143     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5144         return 0;
5145
5146     mvd= 1;
5147     ctx= 3;
5148     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5149         mvd++;
5150         if( ctx < 6 )
5151             ctx++;
5152     }
5153
5154     if( mvd >= 9 ) {
5155         int k = 3;
5156         while( get_cabac_bypass( &h->cabac ) ) {
5157             mvd += 1 << k;
5158             k++;
5159             if(k>24){
5160                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5161                 return INT_MIN;
5162             }
5163         }
5164         while( k-- ) {
5165             if( get_cabac_bypass( &h->cabac ) )
5166                 mvd += 1 << k;
5167         }
5168     }
5169     return get_cabac_bypass_sign( &h->cabac, -mvd );
5170 }
5171
5172 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5173     int nza, nzb;
5174     int ctx = 0;
5175
5176     if( is_dc ) {
5177         if( cat == 0 ) {
5178             nza = h->left_cbp&0x100;
5179             nzb = h-> top_cbp&0x100;
5180         } else {
5181             nza = (h->left_cbp>>(6+idx))&0x01;
5182             nzb = (h-> top_cbp>>(6+idx))&0x01;
5183         }
5184     } else {
5185         assert(cat == 1 || cat == 2 || cat == 4);
5186         nza = h->non_zero_count_cache[scan8[idx] - 1];
5187         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5188     }
5189
5190     if( nza > 0 )
5191         ctx++;
5192
5193     if( nzb > 0 )
5194         ctx += 2;
5195
5196     return ctx + 4 * cat;
5197 }
5198
5199 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5200     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5201     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5202     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5203     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5204 };
5205
5206 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5207     static const int significant_coeff_flag_offset[2][6] = {
5208       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5209       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5210     };
5211     static const int last_coeff_flag_offset[2][6] = {
5212       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5213       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5214     };
5215     static const int coeff_abs_level_m1_offset[6] = {
5216         227+0, 227+10, 227+20, 227+30, 227+39, 426
5217     };
5218     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5219       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5220         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5221         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5222        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5223       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5224         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5225         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5226         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5227     };
5228     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5229      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5230      * map node ctx => cabac ctx for level=1 */
5231     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5232     /* map node ctx => cabac ctx for level>1 */
5233     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5234     static const uint8_t coeff_abs_level_transition[2][8] = {
5235     /* update node ctx after decoding a level=1 */
5236         { 1, 2, 3, 3, 4, 5, 6, 7 },
5237     /* update node ctx after decoding a level>1 */
5238         { 4, 4, 4, 4, 5, 6, 7, 7 }
5239     };
5240
5241     int index[64];
5242
5243     int av_unused last;
5244     int coeff_count = 0;
5245     int node_ctx = 0;
5246
5247     uint8_t *significant_coeff_ctx_base;
5248     uint8_t *last_coeff_ctx_base;
5249     uint8_t *abs_level_m1_ctx_base;
5250
5251 #if !ARCH_X86
5252 #define CABAC_ON_STACK
5253 #endif
5254 #ifdef CABAC_ON_STACK
5255 #define CC &cc
5256     CABACContext cc;
5257     cc.range     = h->cabac.range;
5258     cc.low       = h->cabac.low;
5259     cc.bytestream= h->cabac.bytestream;
5260 #else
5261 #define CC &h->cabac
5262 #endif
5263
5264
5265     /* cat: 0-> DC 16x16  n = 0
5266      *      1-> AC 16x16  n = luma4x4idx
5267      *      2-> Luma4x4   n = luma4x4idx
5268      *      3-> DC Chroma n = iCbCr
5269      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5270      *      5-> Luma8x8   n = 4 * luma8x8idx
5271      */
5272
5273     /* read coded block flag */
5274     if( is_dc || cat != 5 ) {
5275         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5276             if( !is_dc )
5277                 h->non_zero_count_cache[scan8[n]] = 0;
5278
5279 #ifdef CABAC_ON_STACK
5280             h->cabac.range     = cc.range     ;
5281             h->cabac.low       = cc.low       ;
5282             h->cabac.bytestream= cc.bytestream;
5283 #endif
5284             return;
5285         }
5286     }
5287
5288     significant_coeff_ctx_base = h->cabac_state
5289         + significant_coeff_flag_offset[MB_FIELD][cat];
5290     last_coeff_ctx_base = h->cabac_state
5291         + last_coeff_flag_offset[MB_FIELD][cat];
5292     abs_level_m1_ctx_base = h->cabac_state
5293         + coeff_abs_level_m1_offset[cat];
5294
5295     if( !is_dc && cat == 5 ) {
5296 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5297         for(last= 0; last < coefs; last++) { \
5298             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5299             if( get_cabac( CC, sig_ctx )) { \
5300                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5301                 index[coeff_count++] = last; \
5302                 if( get_cabac( CC, last_ctx ) ) { \
5303                     last= max_coeff; \
5304                     break; \
5305                 } \
5306             } \
5307         }\
5308         if( last == max_coeff -1 ) {\
5309             index[coeff_count++] = last;\
5310         }
5311         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5312 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5313         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5314     } else {
5315         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5316 #else
5317         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5318     } else {
5319         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5320 #endif
5321     }
5322     assert(coeff_count > 0);
5323
5324     if( is_dc ) {
5325         if( cat == 0 )
5326             h->cbp_table[h->mb_xy] |= 0x100;
5327         else
5328             h->cbp_table[h->mb_xy] |= 0x40 << n;
5329     } else {
5330         if( cat == 5 )
5331             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5332         else {
5333             assert( cat == 1 || cat == 2 || cat == 4 );
5334             h->non_zero_count_cache[scan8[n]] = coeff_count;
5335         }
5336     }
5337
5338     do {
5339         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5340
5341         int j= scantable[index[--coeff_count]];
5342
5343         if( get_cabac( CC, ctx ) == 0 ) {
5344             node_ctx = coeff_abs_level_transition[0][node_ctx];
5345             if( is_dc ) {
5346                 block[j] = get_cabac_bypass_sign( CC, -1);
5347             }else{
5348                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5349             }
5350         } else {
5351             int coeff_abs = 2;
5352             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5353             node_ctx = coeff_abs_level_transition[1][node_ctx];
5354
5355             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5356                 coeff_abs++;
5357             }
5358
5359             if( coeff_abs >= 15 ) {
5360                 int j = 0;
5361                 while( get_cabac_bypass( CC ) ) {
5362                     j++;
5363                 }
5364
5365                 coeff_abs=1;
5366                 while( j-- ) {
5367                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5368                 }
5369                 coeff_abs+= 14;
5370             }
5371
5372             if( is_dc ) {
5373                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5374             }else{
5375                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5376             }
5377         }
5378     } while( coeff_count );
5379 #ifdef CABAC_ON_STACK
5380             h->cabac.range     = cc.range     ;
5381             h->cabac.low       = cc.low       ;
5382             h->cabac.bytestream= cc.bytestream;
5383 #endif
5384
5385 }
5386
5387 #if !CONFIG_SMALL
5388 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5389     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5390 }
5391
5392 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5393     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5394 }
5395 #endif
5396
5397 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5398 #if CONFIG_SMALL
5399     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5400 #else
5401     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5402     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5403 #endif
5404 }
5405
5406 static inline void compute_mb_neighbors(H264Context *h)
5407 {
5408     MpegEncContext * const s = &h->s;
5409     const int mb_xy  = h->mb_xy;
5410     h->top_mb_xy     = mb_xy - s->mb_stride;
5411     h->left_mb_xy[0] = mb_xy - 1;
5412     if(FRAME_MBAFF){
5413         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5414         const int top_pair_xy      = pair_xy     - s->mb_stride;
5415         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5416         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5417         const int curr_mb_field_flag = MB_FIELD;
5418         const int bottom = (s->mb_y & 1);
5419
5420         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5421             h->top_mb_xy -= s->mb_stride;
5422         }
5423         if (!left_mb_field_flag == curr_mb_field_flag) {
5424             h->left_mb_xy[0] = pair_xy - 1;
5425         }
5426     } else if (FIELD_PICTURE) {
5427         h->top_mb_xy -= s->mb_stride;
5428     }
5429     return;
5430 }
5431
5432 /**
5433  * decodes a macroblock
5434  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5435  */
5436 static int decode_mb_cabac(H264Context *h) {
5437     MpegEncContext * const s = &h->s;
5438     int mb_xy;
5439     int mb_type, partition_count, cbp = 0;
5440     int dct8x8_allowed= h->pps.transform_8x8_mode;
5441
5442     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5443
5444     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5445     if( h->slice_type_nos != FF_I_TYPE ) {
5446         int skip;
5447         /* a skipped mb needs the aff flag from the following mb */
5448         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5449             predict_field_decoding_flag(h);
5450         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5451             skip = h->next_mb_skipped;
5452         else
5453             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5454         /* read skip flags */
5455         if( skip ) {
5456             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5457                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5458                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5459                 if(!h->next_mb_skipped)
5460                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5461             }
5462
5463             decode_mb_skip(h);
5464
5465             h->cbp_table[mb_xy] = 0;
5466             h->chroma_pred_mode_table[mb_xy] = 0;
5467             h->last_qscale_diff = 0;
5468
5469             return 0;
5470
5471         }
5472     }
5473     if(FRAME_MBAFF){
5474         if( (s->mb_y&1) == 0 )
5475             h->mb_mbaff =
5476             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5477     }
5478
5479     h->prev_mb_skipped = 0;
5480
5481     compute_mb_neighbors(h);
5482
5483     if( h->slice_type_nos == FF_B_TYPE ) {
5484         mb_type = decode_cabac_mb_type_b( h );
5485         if( mb_type < 23 ){
5486             partition_count= b_mb_type_info[mb_type].partition_count;
5487             mb_type=         b_mb_type_info[mb_type].type;
5488         }else{
5489             mb_type -= 23;
5490             goto decode_intra_mb;
5491         }
5492     } else if( h->slice_type_nos == FF_P_TYPE ) {
5493         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5494             /* P-type */
5495             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5496                 /* P_L0_D16x16, P_8x8 */
5497                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5498             } else {
5499                 /* P_L0_D8x16, P_L0_D16x8 */
5500                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5501             }
5502             partition_count= p_mb_type_info[mb_type].partition_count;
5503             mb_type=         p_mb_type_info[mb_type].type;
5504         } else {
5505             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5506             goto decode_intra_mb;
5507         }
5508     } else {
5509         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5510         if(h->slice_type == FF_SI_TYPE && mb_type)
5511             mb_type--;
5512         assert(h->slice_type_nos == FF_I_TYPE);
5513 decode_intra_mb:
5514         partition_count = 0;
5515         cbp= i_mb_type_info[mb_type].cbp;
5516         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5517         mb_type= i_mb_type_info[mb_type].type;
5518     }
5519     if(MB_FIELD)
5520         mb_type |= MB_TYPE_INTERLACED;
5521
5522     h->slice_table[ mb_xy ]= h->slice_num;
5523
5524     if(IS_INTRA_PCM(mb_type)) {
5525         const uint8_t *ptr;
5526
5527         // We assume these blocks are very rare so we do not optimize it.
5528         // FIXME The two following lines get the bitstream position in the cabac
5529         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5530         ptr= h->cabac.bytestream;
5531         if(h->cabac.low&0x1) ptr--;
5532         if(CABAC_BITS==16){
5533             if(h->cabac.low&0x1FF) ptr--;
5534         }
5535
5536         // The pixels are stored in the same order as levels in h->mb array.
5537         memcpy(h->mb, ptr, 256); ptr+=256;
5538         if(CHROMA){
5539             memcpy(h->mb+128, ptr, 128); ptr+=128;
5540         }
5541
5542         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5543
5544         // All blocks are present
5545         h->cbp_table[mb_xy] = 0x1ef;
5546         h->chroma_pred_mode_table[mb_xy] = 0;
5547         // In deblocking, the quantizer is 0
5548         s->current_picture.qscale_table[mb_xy]= 0;
5549         // All coeffs are present
5550         memset(h->non_zero_count[mb_xy], 16, 16);
5551         s->current_picture.mb_type[mb_xy]= mb_type;
5552         h->last_qscale_diff = 0;
5553         return 0;
5554     }
5555
5556     if(MB_MBAFF){
5557         h->ref_count[0] <<= 1;
5558         h->ref_count[1] <<= 1;
5559     }
5560
5561     fill_caches(h, mb_type, 0);
5562
5563     if( IS_INTRA( mb_type ) ) {
5564         int i, pred_mode;
5565         if( IS_INTRA4x4( mb_type ) ) {
5566             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5567                 mb_type |= MB_TYPE_8x8DCT;
5568                 for( i = 0; i < 16; i+=4 ) {
5569                     int pred = pred_intra_mode( h, i );
5570                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5571                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5572                 }
5573             } else {
5574                 for( i = 0; i < 16; i++ ) {
5575                     int pred = pred_intra_mode( h, i );
5576                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5577
5578                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5579                 }
5580             }
5581             write_back_intra_pred_mode(h);
5582             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5583         } else {
5584             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5585             if( h->intra16x16_pred_mode < 0 ) return -1;
5586         }
5587         if(CHROMA){
5588             h->chroma_pred_mode_table[mb_xy] =
5589             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5590
5591             pred_mode= check_intra_pred_mode( h, pred_mode );
5592             if( pred_mode < 0 ) return -1;
5593             h->chroma_pred_mode= pred_mode;
5594         }
5595     } else if( partition_count == 4 ) {
5596         int i, j, sub_partition_count[4], list, ref[2][4];
5597
5598         if( h->slice_type_nos == FF_B_TYPE ) {
5599             for( i = 0; i < 4; i++ ) {
5600                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5601                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5602                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5603             }
5604             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5605                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5606                 pred_direct_motion(h, &mb_type);
5607                 h->ref_cache[0][scan8[4]] =
5608                 h->ref_cache[1][scan8[4]] =
5609                 h->ref_cache[0][scan8[12]] =
5610                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5611                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5612                     for( i = 0; i < 4; i++ )
5613                         if( IS_DIRECT(h->sub_mb_type[i]) )
5614                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5615                 }
5616             }
5617         } else {
5618             for( i = 0; i < 4; i++ ) {
5619                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5620                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5621                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5622             }
5623         }
5624
5625         for( list = 0; list < h->list_count; list++ ) {
5626                 for( i = 0; i < 4; i++ ) {
5627                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5628                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5629                         if( h->ref_count[list] > 1 ){
5630                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5631                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5632                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5633                                 return -1;
5634                             }
5635                         }else
5636                             ref[list][i] = 0;
5637                     } else {
5638                         ref[list][i] = -1;
5639                     }
5640                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5641                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5642                 }
5643         }
5644
5645         if(dct8x8_allowed)
5646             dct8x8_allowed = get_dct8x8_allowed(h);
5647
5648         for(list=0; list<h->list_count; list++){
5649             for(i=0; i<4; i++){
5650                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5651                 if(IS_DIRECT(h->sub_mb_type[i])){
5652                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5653                     continue;
5654                 }
5655
5656                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5657                     const int sub_mb_type= h->sub_mb_type[i];
5658                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5659                     for(j=0; j<sub_partition_count[i]; j++){
5660                         int mpx, mpy;
5661                         int mx, my;
5662                         const int index= 4*i + block_width*j;
5663                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5664                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5665                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5666
5667                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5668                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5669                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5670
5671                         if(IS_SUB_8X8(sub_mb_type)){
5672                             mv_cache[ 1 ][0]=
5673                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5674                             mv_cache[ 1 ][1]=
5675                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5676
5677                             mvd_cache[ 1 ][0]=
5678                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5679                             mvd_cache[ 1 ][1]=
5680                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5681                         }else if(IS_SUB_8X4(sub_mb_type)){
5682                             mv_cache[ 1 ][0]= mx;
5683                             mv_cache[ 1 ][1]= my;
5684
5685                             mvd_cache[ 1 ][0]= mx - mpx;
5686                             mvd_cache[ 1 ][1]= my - mpy;
5687                         }else if(IS_SUB_4X8(sub_mb_type)){
5688                             mv_cache[ 8 ][0]= mx;
5689                             mv_cache[ 8 ][1]= my;
5690
5691                             mvd_cache[ 8 ][0]= mx - mpx;
5692                             mvd_cache[ 8 ][1]= my - mpy;
5693                         }
5694                         mv_cache[ 0 ][0]= mx;
5695                         mv_cache[ 0 ][1]= my;
5696
5697                         mvd_cache[ 0 ][0]= mx - mpx;
5698                         mvd_cache[ 0 ][1]= my - mpy;
5699                     }
5700                 }else{
5701                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5702                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5703                     p[0] = p[1] = p[8] = p[9] = 0;
5704                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5705                 }
5706             }
5707         }
5708     } else if( IS_DIRECT(mb_type) ) {
5709         pred_direct_motion(h, &mb_type);
5710         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5711         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5712         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5713     } else {
5714         int list, mx, my, i, mpx, mpy;
5715         if(IS_16X16(mb_type)){
5716             for(list=0; list<h->list_count; list++){
5717                 if(IS_DIR(mb_type, 0, list)){
5718                     int ref;
5719                     if(h->ref_count[list] > 1){
5720                         ref= decode_cabac_mb_ref(h, list, 0);
5721                         if(ref >= (unsigned)h->ref_count[list]){
5722                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5723                             return -1;
5724                         }
5725                     }else
5726                         ref=0;
5727                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5728                 }else
5729                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5730             }
5731             for(list=0; list<h->list_count; list++){
5732                 if(IS_DIR(mb_type, 0, list)){
5733                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5734
5735                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5736                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5737                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5738
5739                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5740                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5741                 }else
5742                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5743             }
5744         }
5745         else if(IS_16X8(mb_type)){
5746             for(list=0; list<h->list_count; list++){
5747                     for(i=0; i<2; i++){
5748                         if(IS_DIR(mb_type, i, list)){
5749                             int ref;
5750                             if(h->ref_count[list] > 1){
5751                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5752                                 if(ref >= (unsigned)h->ref_count[list]){
5753                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5754                                     return -1;
5755                                 }
5756                             }else
5757                                 ref=0;
5758                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5759                         }else
5760                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5761                     }
5762             }
5763             for(list=0; list<h->list_count; list++){
5764                 for(i=0; i<2; i++){
5765                     if(IS_DIR(mb_type, i, list)){
5766                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5767                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5768                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5769                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5770
5771                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5772                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5773                     }else{
5774                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5775                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5776                     }
5777                 }
5778             }
5779         }else{
5780             assert(IS_8X16(mb_type));
5781             for(list=0; list<h->list_count; list++){
5782                     for(i=0; i<2; i++){
5783                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5784                             int ref;
5785                             if(h->ref_count[list] > 1){
5786                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5787                                 if(ref >= (unsigned)h->ref_count[list]){
5788                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5789                                     return -1;
5790                                 }
5791                             }else
5792                                 ref=0;
5793                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5794                         }else
5795                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5796                     }
5797             }
5798             for(list=0; list<h->list_count; list++){
5799                 for(i=0; i<2; i++){
5800                     if(IS_DIR(mb_type, i, list)){
5801                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5802                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5803                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5804
5805                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5806                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5807                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5808                     }else{
5809                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5810                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5811                     }
5812                 }
5813             }
5814         }
5815     }
5816
5817    if( IS_INTER( mb_type ) ) {
5818         h->chroma_pred_mode_table[mb_xy] = 0;
5819         write_back_motion( h, mb_type );
5820    }
5821
5822     if( !IS_INTRA16x16( mb_type ) ) {
5823         cbp  = decode_cabac_mb_cbp_luma( h );
5824         if(CHROMA)
5825             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5826     }
5827
5828     h->cbp_table[mb_xy] = h->cbp = cbp;
5829
5830     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5831         if( decode_cabac_mb_transform_size( h ) )
5832             mb_type |= MB_TYPE_8x8DCT;
5833     }
5834     s->current_picture.mb_type[mb_xy]= mb_type;
5835
5836     if( cbp || IS_INTRA16x16( mb_type ) ) {
5837         const uint8_t *scan, *scan8x8, *dc_scan;
5838         const uint32_t *qmul;
5839         int dqp;
5840
5841         if(IS_INTERLACED(mb_type)){
5842             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5843             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5844             dc_scan= luma_dc_field_scan;
5845         }else{
5846             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5847             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5848             dc_scan= luma_dc_zigzag_scan;
5849         }
5850
5851         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5852         if( dqp == INT_MIN ){
5853             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5854             return -1;
5855         }
5856         s->qscale += dqp;
5857         if(((unsigned)s->qscale) > 51){
5858             if(s->qscale<0) s->qscale+= 52;
5859             else            s->qscale-= 52;
5860         }
5861         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5862         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5863
5864         if( IS_INTRA16x16( mb_type ) ) {
5865             int i;
5866             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5867             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5868
5869             if( cbp&15 ) {
5870                 qmul = h->dequant4_coeff[0][s->qscale];
5871                 for( i = 0; i < 16; i++ ) {
5872                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5873                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5874                 }
5875             } else {
5876                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5877             }
5878         } else {
5879             int i8x8, i4x4;
5880             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5881                 if( cbp & (1<<i8x8) ) {
5882                     if( IS_8x8DCT(mb_type) ) {
5883                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5884                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5885                     } else {
5886                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5887                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5888                             const int index = 4*i8x8 + i4x4;
5889                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5890 //START_TIMER
5891                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5892 //STOP_TIMER("decode_residual")
5893                         }
5894                     }
5895                 } else {
5896                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5897                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5898                 }
5899             }
5900         }
5901
5902         if( cbp&0x30 ){
5903             int c;
5904             for( c = 0; c < 2; c++ ) {
5905                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5906                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5907             }
5908         }
5909
5910         if( cbp&0x20 ) {
5911             int c, i;
5912             for( c = 0; c < 2; c++ ) {
5913                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5914                 for( i = 0; i < 4; i++ ) {
5915                     const int index = 16 + 4 * c + i;
5916                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5917                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5918                 }
5919             }
5920         } else {
5921             uint8_t * const nnz= &h->non_zero_count_cache[0];
5922             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5923             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5924         }
5925     } else {
5926         uint8_t * const nnz= &h->non_zero_count_cache[0];
5927         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5928         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5929         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5930         h->last_qscale_diff = 0;
5931     }
5932
5933     s->current_picture.qscale_table[mb_xy]= s->qscale;
5934     write_back_non_zero_count(h);
5935
5936     if(MB_MBAFF){
5937         h->ref_count[0] >>= 1;
5938         h->ref_count[1] >>= 1;
5939     }
5940
5941     return 0;
5942 }
5943
5944
5945 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5946     const int index_a = qp + h->slice_alpha_c0_offset;
5947     const int alpha = (alpha_table+52)[index_a];
5948     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5949
5950     if( bS[0] < 4 ) {
5951         int8_t tc[4];
5952         tc[0] = (tc0_table+52)[index_a][bS[0]];
5953         tc[1] = (tc0_table+52)[index_a][bS[1]];
5954         tc[2] = (tc0_table+52)[index_a][bS[2]];
5955         tc[3] = (tc0_table+52)[index_a][bS[3]];
5956         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5957     } else {
5958         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5959     }
5960 }
5961 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5962     const int index_a = qp + h->slice_alpha_c0_offset;
5963     const int alpha = (alpha_table+52)[index_a];
5964     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5965
5966     if( bS[0] < 4 ) {
5967         int8_t tc[4];
5968         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5969         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5970         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5971         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5972         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5973     } else {
5974         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5975     }
5976 }
5977
5978 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5979     int i;
5980     for( i = 0; i < 16; i++, pix += stride) {
5981         int index_a;
5982         int alpha;
5983         int beta;
5984
5985         int qp_index;
5986         int bS_index = (i >> 1);
5987         if (!MB_FIELD) {
5988             bS_index &= ~1;
5989             bS_index |= (i & 1);
5990         }
5991
5992         if( bS[bS_index] == 0 ) {
5993             continue;
5994         }
5995
5996         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5997         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5998         alpha = (alpha_table+52)[index_a];
5999         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6000
6001         if( bS[bS_index] < 4 ) {
6002             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6003             const int p0 = pix[-1];
6004             const int p1 = pix[-2];
6005             const int p2 = pix[-3];
6006             const int q0 = pix[0];
6007             const int q1 = pix[1];
6008             const int q2 = pix[2];
6009
6010             if( FFABS( p0 - q0 ) < alpha &&
6011                 FFABS( p1 - p0 ) < beta &&
6012                 FFABS( q1 - q0 ) < beta ) {
6013                 int tc = tc0;
6014                 int i_delta;
6015
6016                 if( FFABS( p2 - p0 ) < beta ) {
6017                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6018                     tc++;
6019                 }
6020                 if( FFABS( q2 - q0 ) < beta ) {
6021                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6022                     tc++;
6023                 }
6024
6025                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6026                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6027                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6028                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6029             }
6030         }else{
6031             const int p0 = pix[-1];
6032             const int p1 = pix[-2];
6033             const int p2 = pix[-3];
6034
6035             const int q0 = pix[0];
6036             const int q1 = pix[1];
6037             const int q2 = pix[2];
6038
6039             if( FFABS( p0 - q0 ) < alpha &&
6040                 FFABS( p1 - p0 ) < beta &&
6041                 FFABS( q1 - q0 ) < beta ) {
6042
6043                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6044                     if( FFABS( p2 - p0 ) < beta)
6045                     {
6046                         const int p3 = pix[-4];
6047                         /* p0', p1', p2' */
6048                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6049                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6050                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6051                     } else {
6052                         /* p0' */
6053                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6054                     }
6055                     if( FFABS( q2 - q0 ) < beta)
6056                     {
6057                         const int q3 = pix[3];
6058                         /* q0', q1', q2' */
6059                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6060                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6061                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6062                     } else {
6063                         /* q0' */
6064                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6065                     }
6066                 }else{
6067                     /* p0', q0' */
6068                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6069                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6070                 }
6071                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6072             }
6073         }
6074     }
6075 }
6076 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6077     int i;
6078     for( i = 0; i < 8; i++, pix += stride) {
6079         int index_a;
6080         int alpha;
6081         int beta;
6082
6083         int qp_index;
6084         int bS_index = i;
6085
6086         if( bS[bS_index] == 0 ) {
6087             continue;
6088         }
6089
6090         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6091         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6092         alpha = (alpha_table+52)[index_a];
6093         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6094
6095         if( bS[bS_index] < 4 ) {
6096             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6097             const int p0 = pix[-1];
6098             const int p1 = pix[-2];
6099             const int q0 = pix[0];
6100             const int q1 = pix[1];
6101
6102             if( FFABS( p0 - q0 ) < alpha &&
6103                 FFABS( p1 - p0 ) < beta &&
6104                 FFABS( q1 - q0 ) < beta ) {
6105                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6106
6107                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6108                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6109                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6110             }
6111         }else{
6112             const int p0 = pix[-1];
6113             const int p1 = pix[-2];
6114             const int q0 = pix[0];
6115             const int q1 = pix[1];
6116
6117             if( FFABS( p0 - q0 ) < alpha &&
6118                 FFABS( p1 - p0 ) < beta &&
6119                 FFABS( q1 - q0 ) < beta ) {
6120
6121                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6122                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6123                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6124             }
6125         }
6126     }
6127 }
6128
6129 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6130     const int index_a = qp + h->slice_alpha_c0_offset;
6131     const int alpha = (alpha_table+52)[index_a];
6132     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6133
6134     if( bS[0] < 4 ) {
6135         int8_t tc[4];
6136         tc[0] = (tc0_table+52)[index_a][bS[0]];
6137         tc[1] = (tc0_table+52)[index_a][bS[1]];
6138         tc[2] = (tc0_table+52)[index_a][bS[2]];
6139         tc[3] = (tc0_table+52)[index_a][bS[3]];
6140         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6141     } else {
6142         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6143     }
6144 }
6145
6146 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6147     const int index_a = qp + h->slice_alpha_c0_offset;
6148     const int alpha = (alpha_table+52)[index_a];
6149     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6150
6151     if( bS[0] < 4 ) {
6152         int8_t tc[4];
6153         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6154         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6155         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6156         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6157         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6158     } else {
6159         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6160     }
6161 }
6162
6163 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6164     MpegEncContext * const s = &h->s;
6165     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6166     int mb_xy, mb_type;
6167     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6168
6169     mb_xy = h->mb_xy;
6170
6171     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6172         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6173        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6174                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6175         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6176         return;
6177     }
6178     assert(!FRAME_MBAFF);
6179
6180     mb_type = s->current_picture.mb_type[mb_xy];
6181     qp = s->current_picture.qscale_table[mb_xy];
6182     qp0 = s->current_picture.qscale_table[mb_xy-1];
6183     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6184     qpc = get_chroma_qp( h, 0, qp );
6185     qpc0 = get_chroma_qp( h, 0, qp0 );
6186     qpc1 = get_chroma_qp( h, 0, qp1 );
6187     qp0 = (qp + qp0 + 1) >> 1;
6188     qp1 = (qp + qp1 + 1) >> 1;
6189     qpc0 = (qpc + qpc0 + 1) >> 1;
6190     qpc1 = (qpc + qpc1 + 1) >> 1;
6191     qp_thresh = 15 - h->slice_alpha_c0_offset;
6192     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6193        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6194         return;
6195
6196     if( IS_INTRA(mb_type) ) {
6197         int16_t bS4[4] = {4,4,4,4};
6198         int16_t bS3[4] = {3,3,3,3};
6199         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6200         if( IS_8x8DCT(mb_type) ) {
6201             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6202             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6203             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6204             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6205         } else {
6206             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6207             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6208             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6209             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6210             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6211             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6212             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6213             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6214         }
6215         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6216         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6217         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6218         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6219         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6220         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6221         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6222         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6223         return;
6224     } else {
6225         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6226         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6227         int edges;
6228         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6229             edges = 4;
6230             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6231         } else {
6232             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6233                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6234             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6235                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6236                              ? 3 : 0;
6237             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6238             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6239             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6240                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6241         }
6242         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6243             bSv[0][0] = 0x0004000400040004ULL;
6244         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6245             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6246
6247 #define FILTER(hv,dir,edge)\
6248         if(bSv[dir][edge]) {\
6249             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6250             if(!(edge&1)) {\
6251                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6252                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6253             }\
6254         }
6255         if( edges == 1 ) {
6256             FILTER(v,0,0);
6257             FILTER(h,1,0);
6258         } else if( IS_8x8DCT(mb_type) ) {
6259             FILTER(v,0,0);
6260             FILTER(v,0,2);
6261             FILTER(h,1,0);
6262             FILTER(h,1,2);
6263         } else {
6264             FILTER(v,0,0);
6265             FILTER(v,0,1);
6266             FILTER(v,0,2);
6267             FILTER(v,0,3);
6268             FILTER(h,1,0);
6269             FILTER(h,1,1);
6270             FILTER(h,1,2);
6271             FILTER(h,1,3);
6272         }
6273 #undef FILTER
6274     }
6275 }
6276
6277
6278 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6279     MpegEncContext * const s = &h->s;
6280     int edge;
6281     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6282     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6283     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6284     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6285     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6286
6287     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6288                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6289     // how often to recheck mv-based bS when iterating between edges
6290     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6291                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6292     // how often to recheck mv-based bS when iterating along each edge
6293     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6294
6295     if (first_vertical_edge_done) {
6296         start = 1;
6297     }
6298
6299     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6300         start = 1;
6301
6302     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6303         && !IS_INTERLACED(mb_type)
6304         && IS_INTERLACED(mbm_type)
6305         ) {
6306         // This is a special case in the norm where the filtering must
6307         // be done twice (one each of the field) even if we are in a
6308         // frame macroblock.
6309         //
6310         static const int nnz_idx[4] = {4,5,6,3};
6311         unsigned int tmp_linesize   = 2 *   linesize;
6312         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6313         int mbn_xy = mb_xy - 2 * s->mb_stride;
6314         int qp;
6315         int i, j;
6316         int16_t bS[4];
6317
6318         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6319             if( IS_INTRA(mb_type) ||
6320                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6321                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6322             } else {
6323                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6324                 for( i = 0; i < 4; i++ ) {
6325                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6326                         mbn_nnz[nnz_idx[i]] != 0 )
6327                         bS[i] = 2;
6328                     else
6329                         bS[i] = 1;
6330                 }
6331             }
6332             // Do not use s->qscale as luma quantizer because it has not the same
6333             // value in IPCM macroblocks.
6334             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6335             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6336             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6337             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6338             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6339                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6340             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6341                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6342         }
6343
6344         start = 1;
6345     }
6346
6347     /* Calculate bS */
6348     for( edge = start; edge < edges; edge++ ) {
6349         /* mbn_xy: neighbor macroblock */
6350         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6351         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6352         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6353         int16_t bS[4];
6354         int qp;
6355
6356         if( (edge&1) && IS_8x8DCT(mb_type) )
6357             continue;
6358
6359         if( IS_INTRA(mb_type) ||
6360             IS_INTRA(mbn_type) ) {
6361             int value;
6362             if (edge == 0) {
6363                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6364                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6365                 ) {
6366                     value = 4;
6367                 } else {
6368                     value = 3;
6369                 }
6370             } else {
6371                 value = 3;
6372             }
6373             bS[0] = bS[1] = bS[2] = bS[3] = value;
6374         } else {
6375             int i, l;
6376             int mv_done;
6377
6378             if( edge & mask_edge ) {
6379                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6380                 mv_done = 1;
6381             }
6382             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6383                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6384                 mv_done = 1;
6385             }
6386             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6387                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6388                 int bn_idx= b_idx - (dir ? 8:1);
6389                 int v = 0;
6390
6391                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6392                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6393                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6394                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6395                 }
6396
6397                 if(h->slice_type_nos == FF_B_TYPE && v){
6398                     v=0;
6399                     for( l = 0; !v && l < 2; l++ ) {
6400                         int ln= 1-l;
6401                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6402                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6403                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6404                     }
6405                 }
6406
6407                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6408                 mv_done = 1;
6409             }
6410             else
6411                 mv_done = 0;
6412
6413             for( i = 0; i < 4; i++ ) {
6414                 int x = dir == 0 ? edge : i;
6415                 int y = dir == 0 ? i    : edge;
6416                 int b_idx= 8 + 4 + x + 8*y;
6417                 int bn_idx= b_idx - (dir ? 8:1);
6418
6419                 if( h->non_zero_count_cache[b_idx] |
6420                     h->non_zero_count_cache[bn_idx] ) {
6421                     bS[i] = 2;
6422                 }
6423                 else if(!mv_done)
6424                 {
6425                     bS[i] = 0;
6426                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6427                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6428                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6429                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6430                             bS[i] = 1;
6431                             break;
6432                         }
6433                     }
6434
6435                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6436                         bS[i] = 0;
6437                         for( l = 0; l < 2; l++ ) {
6438                             int ln= 1-l;
6439                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6440                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6441                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6442                                 bS[i] = 1;
6443                                 break;
6444                             }
6445                         }
6446                     }
6447                 }
6448             }
6449
6450             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6451                 continue;
6452         }
6453
6454         /* Filter edge */
6455         // Do not use s->qscale as luma quantizer because it has not the same
6456         // value in IPCM macroblocks.
6457         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6458         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6459         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6460         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6461         if( dir == 0 ) {
6462             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6463             if( (edge&1) == 0 ) {
6464                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6465                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6466                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6467                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6468             }
6469         } else {
6470             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6471             if( (edge&1) == 0 ) {
6472                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6473                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6474                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6475                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6476             }
6477         }
6478     }
6479 }
6480
/**
 * Run the deblocking filter on one macroblock.
 *
 * Steps, in order:
 *  - outside MBAFF, return early when the macroblock QP (and its average
 *    with the left/top neighbour QP) is at or below a conservative threshold;
 *  - for CAVLC streams with 8x8 transform enabled, overwrite the cached
 *    non-zero-count entries with bits taken from the coded block patterns;
 *  - in MBAFF, when the left macroblock pair has a different interlaced
 *    type, filter the first vertical edge here with 8 bS values and 2 QPs;
 *  - finally call filter_mb_dir() for dir 0 and then dir 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma plane pointer handed to the edge filters
 * @param img_cb     Cb plane pointer handed to the edge filters
 * @param img_cr     Cr plane pointer handed to the edge filters
 * @param linesize   luma stride handed to the edge filters
 * @param uvlinesize chroma stride handed to the edge filters
 */
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type = s->current_picture.mb_type[mb_xy];
    // vertical MV difference threshold forwarded to filter_mb_dir()
    const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
    // set to 1 below when the MBAFF path has already filtered the first vertical edge
    int first_vertical_edge_done = 0;
    av_unused int dir;

    //for sufficiently low qp, filtering wouldn't do anything
    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
    if(!FRAME_MBAFF){
        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
        int qp = s->current_picture.qscale_table[mb_xy];
        // the left/top checks are skipped on the first column/row
        if(qp <= qp_thresh
           && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
           && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
            return;
        }
    }

    // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
    if(!h->pps.cabac && h->pps.transform_8x8_mode){
        int top_type, left_type[2];
        top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
        left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
        left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];

        // top neighbour: cache row 0, two entries per cbp bit (bits 4 and 8)
        if(IS_8x8DCT(top_type)){
            h->non_zero_count_cache[4+8*0]=
            h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
            h->non_zero_count_cache[6+8*0]=
            h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
        }
        // left neighbours: cache column 3, rows 1-2 from left_mb_xy[0], rows 3-4 from left_mb_xy[1]
        if(IS_8x8DCT(left_type[0])){
            h->non_zero_count_cache[3+8*1]=
            h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
        }
        if(IS_8x8DCT(left_type[1])){
            h->non_zero_count_cache[3+8*3]=
            h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
        }

        // current macroblock: each group of four scan8 entries takes one bit of h->cbp
        if(IS_8x8DCT(mb_type)){
            h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;

            h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;

            h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;

            h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
        }
    }

    if (FRAME_MBAFF
            // left mb is in picture
            && h->slice_table[mb_xy-1] != 0xFFFF
            // and current and left pair do not have the same interlaced type
            && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
            // and left mb is in the same slice if deblocking_filter == 2
            && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
        /* First vertical edge is different in MBAFF frames
         * There are 8 different bS to compute and 2 different Qp
         */
        // pair_xy addresses the even row of the current macroblock pair
        const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
        // the two macroblocks of the pair to the left
        const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
        int16_t bS[8];
        // luma, Cb and Cr edge QPs, one entry per left macroblock
        int qp[2];
        int bqp[2];
        int rqp[2];
        int mb_qp, mbn0_qp, mbn1_qp;
        int i;
        first_vertical_edge_done = 1;

        if( IS_INTRA(mb_type) )
            bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
        else {
            for( i = 0; i < 8; i++ ) {
                // which left macroblock this bS entry faces depends on MB_FIELD
                int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];

                if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
                    bS[i] = 4;
                // bS 2 if either side has coefficients; for a CAVLC 8x8dct
                // neighbour the cbp bit is consulted instead of its stored NNZ
                else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
                         ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
                            (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
                                                                       :
                            h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
                    bS[i] = 2;
                else
                    bS[i] = 1;
            }
        }

        // edge QP is the rounded average of the current and the neighbouring
        // macroblock QP; chroma QPs are mapped through get_chroma_qp() first
        mb_qp = s->current_picture.qscale_table[mb_xy];
        mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
        mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
        qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
        bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
        rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
        qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
        bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
        rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;

        /* Filter edge */
        tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
        { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
    }

    // Both variants make the same two calls: dir 0 receives
    // first_vertical_edge_done, dir 1 always receives 0.
#if CONFIG_SMALL
    for( dir = 0; dir < 2; dir++ )
        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
#else
    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
#endif
}
6607
6608 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6609     H264Context *h = *(void**)arg;
6610     MpegEncContext * const s = &h->s;
6611     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6612
6613     s->mb_skip_run= -1;
6614
6615     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6616                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6617
6618     if( h->pps.cabac ) {
6619         int i;
6620
6621         /* realign */
6622         align_get_bits( &s->gb );
6623
6624         /* init cabac */
6625         ff_init_cabac_states( &h->cabac);
6626         ff_init_cabac_decoder( &h->cabac,
6627                                s->gb.buffer + get_bits_count(&s->gb)/8,
6628                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6629         /* calculate pre-state */
6630         for( i= 0; i < 460; i++ ) {
6631             int pre;
6632             if( h->slice_type_nos == FF_I_TYPE )
6633                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6634             else
6635                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6636
6637             if( pre <= 63 )
6638                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6639             else
6640                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6641         }
6642
6643         for(;;){
6644 //START_TIMER
6645             int ret = decode_mb_cabac(h);
6646             int eos;
6647 //STOP_TIMER("decode_mb_cabac")
6648
6649             if(ret>=0) hl_decode_mb(h);
6650
6651             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6652                 s->mb_y++;
6653
6654                 ret = decode_mb_cabac(h);
6655
6656                 if(ret>=0) hl_decode_mb(h);
6657                 s->mb_y--;
6658             }
6659             eos = get_cabac_terminate( &h->cabac );
6660
6661             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6662                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6663                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6664                 return -1;
6665             }
6666
6667             if( ++s->mb_x >= s->mb_width ) {
6668                 s->mb_x = 0;
6669                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6670                 ++s->mb_y;
6671                 if(FIELD_OR_MBAFF_PICTURE) {
6672                     ++s->mb_y;
6673                 }
6674             }
6675
6676             if( eos || s->mb_y >= s->mb_height ) {
6677                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6678                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6679                 return 0;
6680             }
6681         }
6682
6683     } else {
6684         for(;;){
6685             int ret = decode_mb_cavlc(h);
6686
6687             if(ret>=0) hl_decode_mb(h);
6688
6689             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6690                 s->mb_y++;
6691                 ret = decode_mb_cavlc(h);
6692
6693                 if(ret>=0) hl_decode_mb(h);
6694                 s->mb_y--;
6695             }
6696
6697             if(ret<0){
6698                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6699                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6700
6701                 return -1;
6702             }
6703
6704             if(++s->mb_x >= s->mb_width){
6705                 s->mb_x=0;
6706                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6707                 ++s->mb_y;
6708                 if(FIELD_OR_MBAFF_PICTURE) {
6709                     ++s->mb_y;
6710                 }
6711                 if(s->mb_y >= s->mb_height){
6712                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6713
6714                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6715                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6716
6717                         return 0;
6718                     }else{
6719                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6720
6721                         return -1;
6722                     }
6723                 }
6724             }
6725
6726             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6727                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6728                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6729                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6730
6731                     return 0;
6732                 }else{
6733                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6734
6735                     return -1;
6736                 }
6737             }
6738         }
6739     }
6740
6741 #if 0
6742     for(;s->mb_y < s->mb_height; s->mb_y++){
6743         for(;s->mb_x < s->mb_width; s->mb_x++){
6744             int ret= decode_mb(h);
6745
6746             hl_decode_mb(h);
6747
6748             if(ret<0){
6749                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6750                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6751
6752                 return -1;
6753             }
6754
6755             if(++s->mb_x >= s->mb_width){
6756                 s->mb_x=0;
6757                 if(++s->mb_y >= s->mb_height){
6758                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6759                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6760
6761                         return 0;
6762                     }else{
6763                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6764
6765                         return -1;
6766                     }
6767                 }
6768             }
6769
6770             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6771                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6772                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6773
6774                     return 0;
6775                 }else{
6776                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6777
6778                     return -1;
6779                 }
6780             }
6781         }
6782         s->mb_x=0;
6783         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6784     }
6785 #endif
6786     return -1; //not reached
6787 }
6788
6789 static int decode_picture_timing(H264Context *h){
6790     MpegEncContext * const s = &h->s;
6791     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6792         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6793         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6794     }
6795     if(h->sps.pic_struct_present_flag){
6796         unsigned int i, num_clock_ts;
6797         h->sei_pic_struct = get_bits(&s->gb, 4);
6798
6799         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6800             return -1;
6801
6802         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6803
6804         for (i = 0 ; i < num_clock_ts ; i++){
6805             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6806                 unsigned int full_timestamp_flag;
6807                 skip_bits(&s->gb, 2);                 /* ct_type */
6808                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6809                 skip_bits(&s->gb, 5);                 /* counting_type */
6810                 full_timestamp_flag = get_bits(&s->gb, 1);
6811                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6812                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6813                 skip_bits(&s->gb, 8);                 /* n_frames */
6814                 if(full_timestamp_flag){
6815                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6816                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6817                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6818                 }else{
6819                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6820                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6821                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6822                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6823                             if(get_bits(&s->gb, 1))   /* hours_flag */
6824                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6825                         }
6826                     }
6827                 }
6828                 if(h->sps.time_offset_length > 0)
6829                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6830             }
6831         }
6832     }
6833     return 0;
6834 }
6835
6836 static int decode_unregistered_user_data(H264Context *h, int size){
6837     MpegEncContext * const s = &h->s;
6838     uint8_t user_data[16+256];
6839     int e, build, i;
6840
6841     if(size<16)
6842         return -1;
6843
6844     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6845         user_data[i]= get_bits(&s->gb, 8);
6846     }
6847
6848     user_data[i]= 0;
6849     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6850     if(e==1 && build>=0)
6851         h->x264_build= build;
6852
6853     if(s->avctx->debug & FF_DEBUG_BUGS)
6854         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6855
6856     for(; i<size; i++)
6857         skip_bits(&s->gb, 8);
6858
6859     return 0;
6860 }
6861
6862 static int decode_recovery_point(H264Context *h){
6863     MpegEncContext * const s = &h->s;
6864
6865     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6866     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6867
6868     return 0;
6869 }
6870
6871 static int decode_buffering_period(H264Context *h){
6872     MpegEncContext * const s = &h->s;
6873     unsigned int sps_id;
6874     int sched_sel_idx;
6875     SPS *sps;
6876
6877     sps_id = get_ue_golomb_31(&s->gb);
6878     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6879         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6880         return -1;
6881     }
6882     sps = h->sps_buffers[sps_id];
6883
6884     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6885     if (sps->nal_hrd_parameters_present_flag) {
6886         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6887             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6888             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6889         }
6890     }
6891     if (sps->vcl_hrd_parameters_present_flag) {
6892         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6893             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6894             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6895         }
6896     }
6897
6898     h->sei_buffering_period_present = 1;
6899     return 0;
6900 }
6901
6902 int ff_h264_decode_sei(H264Context *h){
6903     MpegEncContext * const s = &h->s;
6904
6905     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6906         int size, type;
6907
6908         type=0;
6909         do{
6910             type+= show_bits(&s->gb, 8);
6911         }while(get_bits(&s->gb, 8) == 255);
6912
6913         size=0;
6914         do{
6915             size+= show_bits(&s->gb, 8);
6916         }while(get_bits(&s->gb, 8) == 255);
6917
6918         switch(type){
6919         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6920             if(decode_picture_timing(h) < 0)
6921                 return -1;
6922             break;
6923         case SEI_TYPE_USER_DATA_UNREGISTERED:
6924             if(decode_unregistered_user_data(h, size) < 0)
6925                 return -1;
6926             break;
6927         case SEI_TYPE_RECOVERY_POINT:
6928             if(decode_recovery_point(h) < 0)
6929                 return -1;
6930             break;
6931         case SEI_BUFFERING_PERIOD:
6932             if(decode_buffering_period(h) < 0)
6933                 return -1;
6934             break;
6935         default:
6936             skip_bits(&s->gb, 8*size);
6937         }
6938
6939         //FIXME check bits here
6940         align_get_bits(&s->gb);
6941     }
6942
6943     return 0;
6944 }
6945
6946 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6947     MpegEncContext * const s = &h->s;
6948     int cpb_count, i;
6949     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6950
6951     if(cpb_count > 32U){
6952         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6953         return -1;
6954     }
6955
6956     get_bits(&s->gb, 4); /* bit_rate_scale */
6957     get_bits(&s->gb, 4); /* cpb_size_scale */
6958     for(i=0; i<cpb_count; i++){
6959         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6960         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6961         get_bits1(&s->gb);     /* cbr_flag */
6962     }
6963     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6964     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6965     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6966     sps->time_offset_length = get_bits(&s->gb, 5);
6967     sps->cpb_cnt = cpb_count;
6968     return 0;
6969 }
6970
6971 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6972     MpegEncContext * const s = &h->s;
6973     int aspect_ratio_info_present_flag;
6974     unsigned int aspect_ratio_idc;
6975
6976     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6977
6978     if( aspect_ratio_info_present_flag ) {
6979         aspect_ratio_idc= get_bits(&s->gb, 8);
6980         if( aspect_ratio_idc == EXTENDED_SAR ) {
6981             sps->sar.num= get_bits(&s->gb, 16);
6982             sps->sar.den= get_bits(&s->gb, 16);
6983         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6984             sps->sar=  pixel_aspect[aspect_ratio_idc];
6985         }else{
6986             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6987             return -1;
6988         }
6989     }else{
6990         sps->sar.num=
6991         sps->sar.den= 0;
6992     }
6993 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6994
6995     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6996         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6997     }
6998
6999     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7000         get_bits(&s->gb, 3);    /* video_format */
7001         get_bits1(&s->gb);      /* video_full_range_flag */
7002         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7003             get_bits(&s->gb, 8); /* colour_primaries */
7004             get_bits(&s->gb, 8); /* transfer_characteristics */
7005             get_bits(&s->gb, 8); /* matrix_coefficients */
7006         }
7007     }
7008
7009     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7010         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7011         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7012     }
7013
7014     sps->timing_info_present_flag = get_bits1(&s->gb);
7015     if(sps->timing_info_present_flag){
7016         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7017         sps->time_scale = get_bits_long(&s->gb, 32);
7018         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7019     }
7020
7021     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7022     if(sps->nal_hrd_parameters_present_flag)
7023         if(decode_hrd_parameters(h, sps) < 0)
7024             return -1;
7025     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7026     if(sps->vcl_hrd_parameters_present_flag)
7027         if(decode_hrd_parameters(h, sps) < 0)
7028             return -1;
7029     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7030         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7031     sps->pic_struct_present_flag = get_bits1(&s->gb);
7032
7033     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7034     if(sps->bitstream_restriction_flag){
7035         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7036         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7037         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7038         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7039         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7040         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7041         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7042
7043         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7044             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7045             return -1;
7046         }
7047     }
7048
7049     return 0;
7050 }
7051
7052 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7053                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7054     MpegEncContext * const s = &h->s;
7055     int i, last = 8, next = 8;
7056     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7057     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7058         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7059     else
7060     for(i=0;i<size;i++){
7061         if(next)
7062             next = (last + get_se_golomb(&s->gb)) & 0xff;
7063         if(!i && !next){ /* matrix not written, we use the preset one */
7064             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7065             break;
7066         }
7067         last = factors[scan[i]] = next ? next : last;
7068     }
7069 }
7070
7071 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7072                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7073     MpegEncContext * const s = &h->s;
7074     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7075     const uint8_t *fallback[4] = {
7076         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7077         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7078         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7079         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7080     };
7081     if(get_bits1(&s->gb)){
7082         sps->scaling_matrix_present |= is_sps;
7083         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7084         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7085         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7086         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7087         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7088         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7089         if(is_sps || pps->transform_8x8_mode){
7090             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7091             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7092         }
7093     }
7094 }
7095
7096 int ff_h264_decode_seq_parameter_set(H264Context *h){
7097     MpegEncContext * const s = &h->s;
7098     int profile_idc, level_idc;
7099     unsigned int sps_id;
7100     int i;
7101     SPS *sps;
7102
7103     profile_idc= get_bits(&s->gb, 8);
7104     get_bits1(&s->gb);   //constraint_set0_flag
7105     get_bits1(&s->gb);   //constraint_set1_flag
7106     get_bits1(&s->gb);   //constraint_set2_flag
7107     get_bits1(&s->gb);   //constraint_set3_flag
7108     get_bits(&s->gb, 4); // reserved
7109     level_idc= get_bits(&s->gb, 8);
7110     sps_id= get_ue_golomb_31(&s->gb);
7111
7112     if(sps_id >= MAX_SPS_COUNT) {
7113         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7114         return -1;
7115     }
7116     sps= av_mallocz(sizeof(SPS));
7117     if(sps == NULL)
7118         return -1;
7119
7120     sps->profile_idc= profile_idc;
7121     sps->level_idc= level_idc;
7122
7123     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7124     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7125     sps->scaling_matrix_present = 0;
7126
7127     if(sps->profile_idc >= 100){ //high profile
7128         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7129         if(sps->chroma_format_idc == 3)
7130             sps->residual_color_transform_flag = get_bits1(&s->gb);
7131         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7132         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7133         sps->transform_bypass = get_bits1(&s->gb);
7134         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7135     }else{
7136         sps->chroma_format_idc= 1;
7137     }
7138
7139     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7140     sps->poc_type= get_ue_golomb_31(&s->gb);
7141
7142     if(sps->poc_type == 0){ //FIXME #define
7143         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7144     } else if(sps->poc_type == 1){//FIXME #define
7145         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7146         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7147         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7148         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7149
7150         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7151             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7152             goto fail;
7153         }
7154
7155         for(i=0; i<sps->poc_cycle_length; i++)
7156             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7157     }else if(sps->poc_type != 2){
7158         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7159         goto fail;
7160     }
7161
7162     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7163     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7164         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7165         goto fail;
7166     }
7167     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7168     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7169     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7170     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7171        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7172         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7173         goto fail;
7174     }
7175
7176     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7177     if(!sps->frame_mbs_only_flag)
7178         sps->mb_aff= get_bits1(&s->gb);
7179     else
7180         sps->mb_aff= 0;
7181
7182     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7183
7184 #ifndef ALLOW_INTERLACE
7185     if(sps->mb_aff)
7186         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7187 #endif
7188     sps->crop= get_bits1(&s->gb);
7189     if(sps->crop){
7190         sps->crop_left  = get_ue_golomb(&s->gb);
7191         sps->crop_right = get_ue_golomb(&s->gb);
7192         sps->crop_top   = get_ue_golomb(&s->gb);
7193         sps->crop_bottom= get_ue_golomb(&s->gb);
7194         if(sps->crop_left || sps->crop_top){
7195             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7196         }
7197         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7198             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7199         }
7200     }else{
7201         sps->crop_left  =
7202         sps->crop_right =
7203         sps->crop_top   =
7204         sps->crop_bottom= 0;
7205     }
7206
7207     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7208     if( sps->vui_parameters_present_flag )
7209         decode_vui_parameters(h, sps);
7210
7211     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7212         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7213                sps_id, sps->profile_idc, sps->level_idc,
7214                sps->poc_type,
7215                sps->ref_frame_count,
7216                sps->mb_width, sps->mb_height,
7217                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7218                sps->direct_8x8_inference_flag ? "8B8" : "",
7219                sps->crop_left, sps->crop_right,
7220                sps->crop_top, sps->crop_bottom,
7221                sps->vui_parameters_present_flag ? "VUI" : "",
7222                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7223                );
7224     }
7225
7226     av_free(h->sps_buffers[sps_id]);
7227     h->sps_buffers[sps_id]= sps;
7228     h->sps = *sps;
7229     return 0;
7230 fail:
7231     av_free(sps);
7232     return -1;
7233 }
7234
7235 static void
7236 build_qp_table(PPS *pps, int t, int index)
7237 {
7238     int i;
7239     for(i = 0; i < 52; i++)
7240         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7241 }
7242
7243 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7244     MpegEncContext * const s = &h->s;
7245     unsigned int pps_id= get_ue_golomb(&s->gb);
7246     PPS *pps;
7247
7248     if(pps_id >= MAX_PPS_COUNT) {
7249         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7250         return -1;
7251     }
7252
7253     pps= av_mallocz(sizeof(PPS));
7254     if(pps == NULL)
7255         return -1;
7256     pps->sps_id= get_ue_golomb_31(&s->gb);
7257     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7258         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7259         goto fail;
7260     }
7261
7262     pps->cabac= get_bits1(&s->gb);
7263     pps->pic_order_present= get_bits1(&s->gb);
7264     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7265     if(pps->slice_group_count > 1 ){
7266         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7267         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7268         switch(pps->mb_slice_group_map_type){
7269         case 0:
7270 #if 0
7271 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7272 |    run_length[ i ]                                |1  |ue(v)   |
7273 #endif
7274             break;
7275         case 2:
7276 #if 0
7277 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7278 |{                                                  |   |        |
7279 |    top_left_mb[ i ]                               |1  |ue(v)   |
7280 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7281 |   }                                               |   |        |
7282 #endif
7283             break;
7284         case 3:
7285         case 4:
7286         case 5:
7287 #if 0
7288 |   slice_group_change_direction_flag               |1  |u(1)    |
7289 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7290 #endif
7291             break;
7292         case 6:
7293 #if 0
7294 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7295 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7296 |)                                                  |   |        |
7297 |    slice_group_id[ i ]                            |1  |u(v)    |
7298 #endif
7299             break;
7300         }
7301     }
7302     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7303     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7304     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7305         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7306         goto fail;
7307     }
7308
7309     pps->weighted_pred= get_bits1(&s->gb);
7310     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7311     pps->init_qp= get_se_golomb(&s->gb) + 26;
7312     pps->init_qs= get_se_golomb(&s->gb) + 26;
7313     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7314     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7315     pps->constrained_intra_pred= get_bits1(&s->gb);
7316     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7317
7318     pps->transform_8x8_mode= 0;
7319     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7320     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7321     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7322
7323     if(get_bits_count(&s->gb) < bit_length){
7324         pps->transform_8x8_mode= get_bits1(&s->gb);
7325         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7326         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7327     } else {
7328         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7329     }
7330
7331     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7332     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7333     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7334         h->pps.chroma_qp_diff= 1;
7335
7336     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7337         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7338                pps_id, pps->sps_id,
7339                pps->cabac ? "CABAC" : "CAVLC",
7340                pps->slice_group_count,
7341                pps->ref_count[0], pps->ref_count[1],
7342                pps->weighted_pred ? "weighted" : "",
7343                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7344                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7345                pps->constrained_intra_pred ? "CONSTR" : "",
7346                pps->redundant_pic_cnt_present ? "REDU" : "",
7347                pps->transform_8x8_mode ? "8x8DCT" : ""
7348                );
7349     }
7350
7351     av_free(h->pps_buffers[pps_id]);
7352     h->pps_buffers[pps_id]= pps;
7353     return 0;
7354 fail:
7355     av_free(pps);
7356     return -1;
7357 }
7358
7359 /**
7360  * Call decode_slice() for each context.
7361  *
7362  * @param h h264 master context
7363  * @param context_count number of contexts to execute
7364  */
7365 static void execute_decode_slices(H264Context *h, int context_count){
7366     MpegEncContext * const s = &h->s;
7367     AVCodecContext * const avctx= s->avctx;
7368     H264Context *hx;
7369     int i;
7370
7371     if (s->avctx->hwaccel)
7372         return;
7373     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7374         return;
7375     if(context_count == 1) {
7376         decode_slice(avctx, &h);
7377     } else {
7378         for(i = 1; i < context_count; i++) {
7379             hx = h->thread_context[i];
7380             hx->s.error_recognition = avctx->error_recognition;
7381             hx->s.error_count = 0;
7382         }
7383
7384         avctx->execute(avctx, (void *)decode_slice,
7385                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7386
7387         /* pull back stuff from slices to master context */
7388         hx = h->thread_context[context_count - 1];
7389         s->mb_x = hx->s.mb_x;
7390         s->mb_y = hx->s.mb_y;
7391         s->dropable = hx->s.dropable;
7392         s->picture_structure = hx->s.picture_structure;
7393         for(i = 1; i < context_count; i++)
7394             h->s.error_count += h->thread_context[i]->s.error_count;
7395     }
7396 }
7397
7398
7399 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7400     MpegEncContext * const s = &h->s;
7401     AVCodecContext * const avctx= s->avctx;
7402     int buf_index=0;
7403     H264Context *hx; ///< thread context
7404     int context_count = 0;
7405
7406     h->max_contexts = avctx->thread_count;
7407 #if 0
7408     int i;
7409     for(i=0; i<50; i++){
7410         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7411     }
7412 #endif
7413     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7414         h->current_slice = 0;
7415         if (!s->first_field)
7416             s->current_picture_ptr= NULL;
7417         reset_sei(h);
7418     }
7419
7420     for(;;){
7421         int consumed;
7422         int dst_length;
7423         int bit_length;
7424         const uint8_t *ptr;
7425         int i, nalsize = 0;
7426         int err;
7427
7428         if(h->is_avc) {
7429             if(buf_index >= buf_size) break;
7430             nalsize = 0;
7431             for(i = 0; i < h->nal_length_size; i++)
7432                 nalsize = (nalsize << 8) | buf[buf_index++];
7433             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7434                 if(nalsize == 1){
7435                     buf_index++;
7436                     continue;
7437                 }else{
7438                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7439                     break;
7440                 }
7441             }
7442         } else {
7443             // start code prefix search
7444             for(; buf_index + 3 < buf_size; buf_index++){
7445                 // This should always succeed in the first iteration.
7446                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7447                     break;
7448             }
7449
7450             if(buf_index+3 >= buf_size) break;
7451
7452             buf_index+=3;
7453         }
7454
7455         hx = h->thread_context[context_count];
7456
7457         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7458         if (ptr==NULL || dst_length < 0){
7459             return -1;
7460         }
7461         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7462             dst_length--;
7463         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7464
7465         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7466             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7467         }
7468
7469         if (h->is_avc && (nalsize != consumed)){
7470             int i, debug_level = AV_LOG_DEBUG;
7471             for (i = consumed; i < nalsize; i++)
7472                 if (buf[buf_index+i])
7473                     debug_level = AV_LOG_ERROR;
7474             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7475             consumed= nalsize;
7476         }
7477
7478         buf_index += consumed;
7479
7480         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7481            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7482             continue;
7483
7484       again:
7485         err = 0;
7486         switch(hx->nal_unit_type){
7487         case NAL_IDR_SLICE:
7488             if (h->nal_unit_type != NAL_IDR_SLICE) {
7489                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7490                 return -1;
7491             }
7492             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7493         case NAL_SLICE:
7494             init_get_bits(&hx->s.gb, ptr, bit_length);
7495             hx->intra_gb_ptr=
7496             hx->inter_gb_ptr= &hx->s.gb;
7497             hx->s.data_partitioning = 0;
7498
7499             if((err = decode_slice_header(hx, h)))
7500                break;
7501
7502             if (s->avctx->hwaccel && h->current_slice == 1) {
7503                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7504                     return -1;
7505             }
7506
7507             s->current_picture_ptr->key_frame |=
7508                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7509                     (h->sei_recovery_frame_cnt >= 0);
7510             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7511                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7512                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7513                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7514                && avctx->skip_frame < AVDISCARD_ALL){
7515                 if(avctx->hwaccel) {
7516                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7517                         return -1;
7518                 }else
7519                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7520                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7521                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7522                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7523                 }else
7524                     context_count++;
7525             }
7526             break;
7527         case NAL_DPA:
7528             init_get_bits(&hx->s.gb, ptr, bit_length);
7529             hx->intra_gb_ptr=
7530             hx->inter_gb_ptr= NULL;
7531             hx->s.data_partitioning = 1;
7532
7533             err = decode_slice_header(hx, h);
7534             break;
7535         case NAL_DPB:
7536             init_get_bits(&hx->intra_gb, ptr, bit_length);
7537             hx->intra_gb_ptr= &hx->intra_gb;
7538             break;
7539         case NAL_DPC:
7540             init_get_bits(&hx->inter_gb, ptr, bit_length);
7541             hx->inter_gb_ptr= &hx->inter_gb;
7542
7543             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7544                && s->context_initialized
7545                && s->hurry_up < 5
7546                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7547                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7548                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7549                && avctx->skip_frame < AVDISCARD_ALL)
7550                 context_count++;
7551             break;
7552         case NAL_SEI:
7553             init_get_bits(&s->gb, ptr, bit_length);
7554             ff_h264_decode_sei(h);
7555             break;
7556         case NAL_SPS:
7557             init_get_bits(&s->gb, ptr, bit_length);
7558             ff_h264_decode_seq_parameter_set(h);
7559
7560             if(s->flags& CODEC_FLAG_LOW_DELAY)
7561                 s->low_delay=1;
7562
7563             if(avctx->has_b_frames < 2)
7564                 avctx->has_b_frames= !s->low_delay;
7565             break;
7566         case NAL_PPS:
7567             init_get_bits(&s->gb, ptr, bit_length);
7568
7569             ff_h264_decode_picture_parameter_set(h, bit_length);
7570
7571             break;
7572         case NAL_AUD:
7573         case NAL_END_SEQUENCE:
7574         case NAL_END_STREAM:
7575         case NAL_FILLER_DATA:
7576         case NAL_SPS_EXT:
7577         case NAL_AUXILIARY_SLICE:
7578             break;
7579         default:
7580             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7581         }
7582
7583         if(context_count == h->max_contexts) {
7584             execute_decode_slices(h, context_count);
7585             context_count = 0;
7586         }
7587
7588         if (err < 0)
7589             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7590         else if(err == 1) {
7591             /* Slice could not be decoded in parallel mode, copy down
7592              * NAL unit stuff to context 0 and restart. Note that
7593              * rbsp_buffer is not transferred, but since we no longer
7594              * run in parallel mode this should not be an issue. */
7595             h->nal_unit_type = hx->nal_unit_type;
7596             h->nal_ref_idc   = hx->nal_ref_idc;
7597             hx = h;
7598             goto again;
7599         }
7600     }
7601     if(context_count)
7602         execute_decode_slices(h, context_count);
7603     return buf_index;
7604 }
7605
7606 /**
7607  * returns the number of bytes consumed for building the current frame
7608  */
7609 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7610         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7611         if(pos+10>buf_size) pos=buf_size; // oops ;)
7612
7613         return pos;
7614 }
7615
7616 static int decode_frame(AVCodecContext *avctx,
7617                              void *data, int *data_size,
7618                              const uint8_t *buf, int buf_size)
7619 {
7620     H264Context *h = avctx->priv_data;
7621     MpegEncContext *s = &h->s;
7622     AVFrame *pict = data;
7623     int buf_index;
7624
7625     s->flags= avctx->flags;
7626     s->flags2= avctx->flags2;
7627
7628    /* end of stream, output what is still in the buffers */
7629     if (buf_size == 0) {
7630         Picture *out;
7631         int i, out_idx;
7632
7633 //FIXME factorize this with the output code below
7634         out = h->delayed_pic[0];
7635         out_idx = 0;
7636         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7637             if(h->delayed_pic[i]->poc < out->poc){
7638                 out = h->delayed_pic[i];
7639                 out_idx = i;
7640             }
7641
7642         for(i=out_idx; h->delayed_pic[i]; i++)
7643             h->delayed_pic[i] = h->delayed_pic[i+1];
7644
7645         if(out){
7646             *data_size = sizeof(AVFrame);
7647             *pict= *(AVFrame*)out;
7648         }
7649
7650         return 0;
7651     }
7652
7653     if(h->is_avc && !h->got_avcC) {
7654         int i, cnt, nalsize;
7655         unsigned char *p = avctx->extradata;
7656         if(avctx->extradata_size < 7) {
7657             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7658             return -1;
7659         }
7660         if(*p != 1) {
7661             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7662             return -1;
7663         }
7664         /* sps and pps in the avcC always have length coded with 2 bytes,
7665            so put a fake nal_length_size = 2 while parsing them */
7666         h->nal_length_size = 2;
7667         // Decode sps from avcC
7668         cnt = *(p+5) & 0x1f; // Number of sps
7669         p += 6;
7670         for (i = 0; i < cnt; i++) {
7671             nalsize = AV_RB16(p) + 2;
7672             if(decode_nal_units(h, p, nalsize) < 0) {
7673                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7674                 return -1;
7675             }
7676             p += nalsize;
7677         }
7678         // Decode pps from avcC
7679         cnt = *(p++); // Number of pps
7680         for (i = 0; i < cnt; i++) {
7681             nalsize = AV_RB16(p) + 2;
7682             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7683                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7684                 return -1;
7685             }
7686             p += nalsize;
7687         }
7688         // Now store right nal length size, that will be use to parse all other nals
7689         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7690         // Do not reparse avcC
7691         h->got_avcC = 1;
7692     }
7693
7694     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7695         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7696             return -1;
7697         h->got_avcC = 1;
7698     }
7699
7700     buf_index=decode_nal_units(h, buf, buf_size);
7701     if(buf_index < 0)
7702         return -1;
7703
7704     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7705         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7706         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7707         return -1;
7708     }
7709
7710     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7711         Picture *out = s->current_picture_ptr;
7712         Picture *cur = s->current_picture_ptr;
7713         int i, pics, cross_idr, out_of_order, out_idx;
7714
7715         s->mb_y= 0;
7716
7717         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7718         s->current_picture_ptr->pict_type= s->pict_type;
7719
7720         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7721             ff_vdpau_h264_set_reference_frames(s);
7722
7723         if(!s->dropable) {
7724             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7725             h->prev_poc_msb= h->poc_msb;
7726             h->prev_poc_lsb= h->poc_lsb;
7727         }
7728         h->prev_frame_num_offset= h->frame_num_offset;
7729         h->prev_frame_num= h->frame_num;
7730
7731         if (avctx->hwaccel) {
7732             if (avctx->hwaccel->end_frame(avctx) < 0)
7733                 av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
7734         }
7735
7736         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7737             ff_vdpau_h264_picture_complete(s);
7738
7739         /*
7740          * FIXME: Error handling code does not seem to support interlaced
7741          * when slices span multiple rows
7742          * The ff_er_add_slice calls don't work right for bottom
7743          * fields; they cause massive erroneous error concealing
7744          * Error marking covers both fields (top and bottom).
7745          * This causes a mismatched s->error_count
7746          * and a bad error table. Further, the error count goes to
7747          * INT_MAX when called for bottom field, because mb_y is
7748          * past end by one (callers fault) and resync_mb_y != 0
7749          * causes problems for the first MB line, too.
7750          */
7751         if (!FIELD_PICTURE)
7752             ff_er_frame_end(s);
7753
7754         MPV_frame_end(s);
7755
7756         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7757             /* Wait for second field. */
7758             *data_size = 0;
7759
7760         } else {
7761             cur->repeat_pict = 0;
7762
7763             /* Signal interlacing information externally. */
7764             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7765             if(h->sps.pic_struct_present_flag){
7766                 switch (h->sei_pic_struct)
7767                 {
7768                 case SEI_PIC_STRUCT_FRAME:
7769                     cur->interlaced_frame = 0;
7770                     break;
7771                 case SEI_PIC_STRUCT_TOP_FIELD:
7772                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7773                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7774                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7775                     cur->interlaced_frame = 1;
7776                     break;
7777                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7778                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7779                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7780                     // From these hints, let the applications decide if they apply deinterlacing.
7781                     cur->repeat_pict = 1;
7782                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7783                     break;
7784                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7785                     // Force progressive here, as doubling interlaced frame is a bad idea.
7786                     cur->interlaced_frame = 0;
7787                     cur->repeat_pict = 2;
7788                     break;
7789                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7790                     cur->interlaced_frame = 0;
7791                     cur->repeat_pict = 4;
7792                     break;
7793                 }
7794             }else{
7795                 /* Derive interlacing flag from used decoding process. */
7796                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7797             }
7798
7799             if (cur->field_poc[0] != cur->field_poc[1]){
7800                 /* Derive top_field_first from field pocs. */
7801                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7802             }else{
7803                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7804                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7805                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7806                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7807                         cur->top_field_first = 1;
7808                     else
7809                         cur->top_field_first = 0;
7810                 }else{
7811                     /* Most likely progressive */
7812                     cur->top_field_first = 0;
7813                 }
7814             }
7815
7816         //FIXME do something with unavailable reference frames
7817
7818             /* Sort B-frames into display order */
7819
7820             if(h->sps.bitstream_restriction_flag
7821                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7822                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7823                 s->low_delay = 0;
7824             }
7825
7826             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7827                && !h->sps.bitstream_restriction_flag){
7828                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7829                 s->low_delay= 0;
7830             }
7831
7832             pics = 0;
7833             while(h->delayed_pic[pics]) pics++;
7834
7835             assert(pics <= MAX_DELAYED_PIC_COUNT);
7836
7837             h->delayed_pic[pics++] = cur;
7838             if(cur->reference == 0)
7839                 cur->reference = DELAYED_PIC_REF;
7840
7841             out = h->delayed_pic[0];
7842             out_idx = 0;
7843             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7844                 if(h->delayed_pic[i]->poc < out->poc){
7845                     out = h->delayed_pic[i];
7846                     out_idx = i;
7847                 }
7848             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7849
7850             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7851
7852             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7853                 { }
7854             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7855                || (s->low_delay &&
7856                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7857                  || cur->pict_type == FF_B_TYPE)))
7858             {
7859                 s->low_delay = 0;
7860                 s->avctx->has_b_frames++;
7861             }
7862
7863             if(out_of_order || pics > s->avctx->has_b_frames){
7864                 out->reference &= ~DELAYED_PIC_REF;
7865                 for(i=out_idx; h->delayed_pic[i]; i++)
7866                     h->delayed_pic[i] = h->delayed_pic[i+1];
7867             }
7868             if(!out_of_order && pics > s->avctx->has_b_frames){
7869                 *data_size = sizeof(AVFrame);
7870
7871                 h->outputed_poc = out->poc;
7872                 *pict= *(AVFrame*)out;
7873             }else{
7874                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7875             }
7876         }
7877     }
7878
7879     assert(pict->data[0] || !*data_size);
7880     ff_print_debug_info(s, pict);
7881 //printf("out %d\n", (int)pict->data[0]);
7882 #if 0 //?
7883
7884     /* Return the Picture timestamp as the frame number */
7885     /* we subtract 1 because it is added on utils.c     */
7886     avctx->frame_number = s->picture_number - 1;
7887 #endif
7888     return get_consumed_bytes(s, buf_index, buf_size);
7889 }
7890 #if 0
7891 static inline void fill_mb_avail(H264Context *h){
7892     /* Record in h->mb_avail[] which neighbours of the current macroblock
7893      * carry the current slice number in h->slice_table. */
7894     MpegEncContext * const mpv = &h->s;
7895     const int cur   = mpv->mb_x + mpv->mb_y*mpv->mb_stride;
7896     const int above = cur - mpv->mb_stride;
7897     const int row   = mpv->mb_y != 0; /* zero in the first row: nothing above to look at */
7898
7899     /* [0]/[1]/[2]: entries one row up at x-1, x and x+1 */
7900     h->mb_avail[0]= row && mpv->mb_x                   && h->slice_table[above - 1] == h->slice_num;
7901     h->mb_avail[1]= row                                && h->slice_table[above    ] == h->slice_num;
7902     h->mb_avail[2]= row && mpv->mb_x+1 < mpv->mb_width && h->slice_table[above + 1] == h->slice_num;
7903     /* [3]: entry at x-1 in the same row */
7904     h->mb_avail[3]= mpv->mb_x && h->slice_table[cur - 1] == h->slice_num;
7905     h->mb_avail[4]= 1; //FIXME move out
7906     h->mb_avail[5]= 0; //FIXME move out
7907 }
7908 #endif
7909
7910 #ifdef TEST
7911 #undef printf
7912 #undef random
7913 #define COUNT 8000
7914 #define SIZE (COUNT*40)
7915 int main(void){
7916     int i;
7917     uint8_t temp[SIZE];
7918     PutBitContext pb;
7919     GetBitContext gb;
7920 //    int int_temp[10000];
7921     DSPContext dsp;
7922     AVCodecContext avctx;
7923
7924     dsputil_init(&dsp, &avctx);
7925
7926     init_put_bits(&pb, temp, SIZE);
7927     printf("testing unsigned exp golomb\n");
7928     for(i=0; i<COUNT; i++){
7929         START_TIMER
7930         set_ue_golomb(&pb, i);
7931         STOP_TIMER("set_ue_golomb");
7932     }
7933     flush_put_bits(&pb);
7934
7935     init_get_bits(&gb, temp, 8*SIZE);
7936     for(i=0; i<COUNT; i++){
7937         int j, s;
7938
7939         s= show_bits(&gb, 24);
7940
7941         START_TIMER
7942         j= get_ue_golomb(&gb);
7943         if(j != i){
7944             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7945 //            return -1;
7946         }
7947         STOP_TIMER("get_ue_golomb");
7948     }
7949
7950
7951     init_put_bits(&pb, temp, SIZE);
7952     printf("testing signed exp golomb\n");
7953     for(i=0; i<COUNT; i++){
7954         START_TIMER
7955         set_se_golomb(&pb, i - COUNT/2);
7956         STOP_TIMER("set_se_golomb");
7957     }
7958     flush_put_bits(&pb);
7959
7960     init_get_bits(&gb, temp, 8*SIZE);
7961     for(i=0; i<COUNT; i++){
7962         int j, s;
7963
7964         s= show_bits(&gb, 24);
7965
7966         START_TIMER
7967         j= get_se_golomb(&gb);
7968         if(j != i - COUNT/2){
7969             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7970 //            return -1;
7971         }
7972         STOP_TIMER("get_se_golomb");
7973     }
7974
7975 #if 0
7976     printf("testing 4x4 (I)DCT\n");
7977
7978     DCTELEM block[16];
7979     uint8_t src[16], ref[16];
7980     uint64_t error= 0, max_error=0;
7981
7982     for(i=0; i<COUNT; i++){
7983         int j;
7984 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7985         for(j=0; j<16; j++){
7986             ref[j]= random()%255;
7987             src[j]= random()%255;
7988         }
7989
7990         h264_diff_dct_c(block, src, ref, 4);
7991
7992         //normalize
7993         for(j=0; j<16; j++){
7994 //            printf("%d ", block[j]);
7995             block[j]= block[j]*4;
7996             if(j&1) block[j]= (block[j]*4 + 2)/5;
7997             if(j&4) block[j]= (block[j]*4 + 2)/5;
7998         }
7999 //        printf("\n");
8000
8001         s->dsp.h264_idct_add(ref, block, 4);
8002 /*        for(j=0; j<16; j++){
8003             printf("%d ", ref[j]);
8004         }
8005         printf("\n");*/
8006
8007         for(j=0; j<16; j++){
8008             int diff= FFABS(src[j] - ref[j]);
8009
8010             error+= diff*diff;
8011             max_error= FFMAX(max_error, diff);
8012         }
8013     }
8014     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8015     printf("testing quantizer\n");
8016     for(qp=0; qp<52; qp++){
8017         for(i=0; i<16; i++)
8018             src1_block[i]= src2_block[i]= random()%255;
8019
8020     }
8021     printf("Testing NAL layer\n");
8022
8023     uint8_t bitstream[COUNT];
8024     uint8_t nal[COUNT*2];
8025     H264Context h;
8026     memset(&h, 0, sizeof(H264Context));
8027
8028     for(i=0; i<COUNT; i++){
8029         int zeros= i;
8030         int nal_length;
8031         int consumed;
8032         int out_length;
8033         uint8_t *out;
8034         int j;
8035
8036         for(j=0; j<COUNT; j++){
8037             bitstream[j]= (random() % 255) + 1;
8038         }
8039
8040         for(j=0; j<zeros; j++){
8041             int pos= random() % COUNT;
8042             while(bitstream[pos] == 0){
8043                 pos++;
8044                 pos %= COUNT;
8045             }
8046             bitstream[pos]=0;
8047         }
8048
8049         START_TIMER
8050
8051         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8052         if(nal_length<0){
8053             printf("encoding failed\n");
8054             return -1;
8055         }
8056
8057         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8058
8059         STOP_TIMER("NAL")
8060
8061         if(out_length != COUNT){
8062             printf("incorrect length %d %d\n", out_length, COUNT);
8063             return -1;
8064         }
8065
8066         if(consumed != nal_length){
8067             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8068             return -1;
8069         }
8070
8071         if(memcmp(bitstream, out, COUNT)){
8072             printf("mismatch\n");
8073             return -1;
8074         }
8075     }
8076 #endif
8077
8078     printf("Testing RBSP\n");
8079
8080
8081     return 0;
8082 }
8083 #endif /* TEST */
8084
8085
8086 static av_cold int decode_end(AVCodecContext *avctx)
8087 {
8088     /* Tear-down routine registered in the AVCodec tables below: frees the two
8089      * RBSP buffers, whatever free_tables() releases, every stored SPS/PPS
8090      * buffer and finally the embedded MpegEncContext.  Always returns 0. */
8091     H264Context *const ctx = avctx->priv_data;
8092     int n;
8093
8094     for(n = 0; n < 2; n++)
8095         av_freep(&ctx->rbsp_buffer[n]);
8096     free_tables(ctx); //FIXME cleanup init stuff perhaps
8097
8098     for(n = 0; n < MAX_SPS_COUNT; n++)
8099         av_freep(&ctx->sps_buffers[n]);
8100     for(n = 0; n < MAX_PPS_COUNT; n++)
8101         av_freep(&ctx->pps_buffers[n]);
8102
8103     /* the H264Context itself is not cleared here */
8104     MPV_common_end(&ctx->s);
8105
8106     return 0;
8107 }
8108
8109
8110 AVCodec h264_decoder = { /* table entry for the "h264" decoder, every field named explicitly */
8111     .name           = "h264",
8112     .type           = CODEC_TYPE_VIDEO,
8113     .id             = CODEC_ID_H264,
8114     .priv_data_size = sizeof(H264Context),
8115     .init           = decode_init,
8116     .encode         = NULL,
8117     .close          = decode_end,
8118     .decode         = decode_frame,
8119     .capabilities   = /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8120     .flush          = flush_dpb,
8121     .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8122     .pix_fmts       = ff_hwaccel_pixfmt_list_420,
8123 };
8124
8125 #if CONFIG_H264_VDPAU_DECODER
8126 AVCodec h264_vdpau_decoder = { /* same callbacks as h264_decoder; adds CODEC_CAP_HWACCEL_VDPAU and sets no pix_fmts */
8127     .name           = "h264_vdpau",
8128     .type           = CODEC_TYPE_VIDEO,
8129     .id             = CODEC_ID_H264,
8130     .priv_data_size = sizeof(H264Context),
8131     .init           = decode_init,
8132     .encode         = NULL,
8133     .close          = decode_end,
8134     .decode         = decode_frame,
8135     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8136     .flush          = flush_dpb,
8137     .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8138 };
8139 #endif
8140
8141 #if CONFIG_SVQ3_DECODER
8142 #include "svq3.c"
8143 #endif