git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36 #include "vdpau_internal.h"
  37
  38 #include "cabac.h"
  39 #ifdef ARCH_X86
  40 #include "x86/h264_i386.h"
  41 #endif
  42
  43 //#undef NDEBUG
  44 #include <assert.h>
  45
  46 /**
  47  * Value of Picture.reference when Picture is not a reference picture, but
  48  * is held for delayed output.
  49  */
  50 #define DELAYED_PIC_REF 4
  51
  52 static VLC coeff_token_vlc[4];
  53 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  54 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  55
  56 static VLC chroma_dc_coeff_token_vlc;
  57 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  58 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  59
  60 static VLC total_zeros_vlc[15];
  61 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  62 static const int total_zeros_vlc_tables_size = 512;
  63
  64 static VLC chroma_dc_total_zeros_vlc[3];
  65 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  66 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  67
  68 static VLC run_vlc[6];
  69 static VLC_TYPE run_vlc_tables[6][8][2];
  70 static const int run_vlc_tables_size = 8;
  71
  72 static VLC run7_vlc;
  73 static VLC_TYPE run7_vlc_table[96][2];
  74 static const int run7_vlc_table_size = 96;
  75
  76 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  77 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  78 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  80 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  81
  82 static av_always_inline uint32_t pack16to32(int a, int b){
  83 #ifdef WORDS_BIGENDIAN
  84    return (b&0xFFFF) + (a<<16);
  85 #else
  86    return (a&0xFFFF) + (b<<16);
  87 #endif
  88 }
  89
  90 static const uint8_t rem6[52]={
  91 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  92 };
  93
  94 static const uint8_t div6[52]={
  95 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  96 };
  97
  98 static const int left_block_options[4][8]={
  99     {0,1,2,3,7,10,8,11},
 100     {2,2,3,3,8,11,8,11},
 101     {0,0,1,1,7,10,7,10},
 102     {0,2,0,2,7,10,7,10}
 103 };
 104
 105 #define LEVEL_TAB_BITS 8
 106 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 107
 108 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 109     MpegEncContext * const s = &h->s;
 110     const int mb_xy= h->mb_xy;
 111     int topleft_xy, top_xy, topright_xy, left_xy[2];
 112     int topleft_type, top_type, topright_type, left_type[2];
 113     const int * left_block;
 114     int topleft_partition= -1;
 115     int i;
 116
 117     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 118
 119     //FIXME deblocking could skip the intra and nnz parts.
 120     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 121         return;
 122
 123     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 124      * stuff, I can't imagine that these complex rules are worth it. */
 125
 126     topleft_xy = top_xy - 1;
 127     topright_xy= top_xy + 1;
 128     left_xy[1] = left_xy[0] = mb_xy-1;
 129     left_block = left_block_options[0];
 130     if(FRAME_MBAFF){
 131         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 132         const int top_pair_xy      = pair_xy     - s->mb_stride;
 133         const int topleft_pair_xy  = top_pair_xy - 1;
 134         const int topright_pair_xy = top_pair_xy + 1;
 135         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 136         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 137         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 138         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 139         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 140         const int bottom = (s->mb_y & 1);
 141         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 142
 143         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 144             top_xy -= s->mb_stride;
 145         }
 146         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 147             topleft_xy -= s->mb_stride;
 148         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 149             topleft_xy += s->mb_stride;
 150             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 151             topleft_partition = 0;
 152         }
 153         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 154             topright_xy -= s->mb_stride;
 155         }
 156         if (left_mb_field_flag != curr_mb_field_flag) {
 157             left_xy[1] = left_xy[0] = pair_xy - 1;
 158             if (curr_mb_field_flag) {
 159                 left_xy[1] += s->mb_stride;
 160                 left_block = left_block_options[3];
 161             } else {
 162                 left_block= left_block_options[2 - bottom];
 163             }
 164         }
 165     }
 166
 167     h->top_mb_xy = top_xy;
 168     h->left_mb_xy[0] = left_xy[0];
 169     h->left_mb_xy[1] = left_xy[1];
 170     if(for_deblock){
 171         topleft_type = 0;
 172         topright_type = 0;
 173         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 174         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 175         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 176
 177         if(MB_MBAFF && !IS_INTRA(mb_type)){
 178             int list;
 179             for(list=0; list<h->list_count; list++){
 180                 //These values where changed for ease of performing MC, we need to change them back
 181                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 182                 //the MC code from changing ref_cache and rather use a temporary array.
 183                 if(USES_LIST(mb_type,list)){
 184                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 185                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 186                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 187                     ref += h->b8_stride;
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 189                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 190                 }
 191             }
 192         }
 193     }else{
 194         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 195         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 196         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 197         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 198         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 199
 200     if(IS_INTRA(mb_type)){
 201         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 202         h->topleft_samples_available=
 203         h->top_samples_available=
 204         h->left_samples_available= 0xFFFF;
 205         h->topright_samples_available= 0xEEEA;
 206
 207         if(!(top_type & type_mask)){
 208             h->topleft_samples_available= 0xB3FF;
 209             h->top_samples_available= 0x33FF;
 210             h->topright_samples_available= 0x26EA;
 211         }
 212         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 213             if(IS_INTERLACED(mb_type)){
 214                 if(!(left_type[0] & type_mask)){
 215                     h->topleft_samples_available&= 0xDFFF;
 216                     h->left_samples_available&= 0x5FFF;
 217                 }
 218                 if(!(left_type[1] & type_mask)){
 219                     h->topleft_samples_available&= 0xFF5F;
 220                     h->left_samples_available&= 0xFF5F;
 221                 }
 222             }else{
 223                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 224                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 225                 assert(left_xy[0] == left_xy[1]);
 226                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 227                     h->topleft_samples_available&= 0xDF5F;
 228                     h->left_samples_available&= 0x5F5F;
 229                 }
 230             }
 231         }else{
 232             if(!(left_type[0] & type_mask)){
 233                 h->topleft_samples_available&= 0xDF5F;
 234                 h->left_samples_available&= 0x5F5F;
 235             }
 236         }
 237
 238         if(!(topleft_type & type_mask))
 239             h->topleft_samples_available&= 0x7FFF;
 240
 241         if(!(topright_type & type_mask))
 242             h->topright_samples_available&= 0xFBFF;
 243
 244         if(IS_INTRA4x4(mb_type)){
 245             if(IS_INTRA4x4(top_type)){
 246                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 247                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 248                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 249                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 250             }else{
 251                 int pred;
 252                 if(!(top_type & type_mask))
 253                     pred= -1;
 254                 else{
 255                     pred= 2;
 256                 }
 257                 h->intra4x4_pred_mode_cache[4+8*0]=
 258                 h->intra4x4_pred_mode_cache[5+8*0]=
 259                 h->intra4x4_pred_mode_cache[6+8*0]=
 260                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 261             }
 262             for(i=0; i<2; i++){
 263                 if(IS_INTRA4x4(left_type[i])){
 264                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 265                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 266                 }else{
 267                     int pred;
 268                     if(!(left_type[i] & type_mask))
 269                         pred= -1;
 270                     else{
 271                         pred= 2;
 272                     }
 273                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 274                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 275                 }
 276             }
 277         }
 278     }
 279     }
 280
 281
 282 /*
 283 0 . T T. T T T T
 284 1 L . .L . . . .
 285 2 L . .L . . . .
 286 3 . T TL . . . .
 287 4 L . .L . . . .
 288 5 L . .. . . . .
 289 */
 290 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 291     if(top_type){
 292         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 293         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 294         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 295         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 296
 297         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 298         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 299
 300         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 301         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 302
 303     }else{
 304         h->non_zero_count_cache[4+8*0]=
 305         h->non_zero_count_cache[5+8*0]=
 306         h->non_zero_count_cache[6+8*0]=
 307         h->non_zero_count_cache[7+8*0]=
 308
 309         h->non_zero_count_cache[1+8*0]=
 310         h->non_zero_count_cache[2+8*0]=
 311
 312         h->non_zero_count_cache[1+8*3]=
 313         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 314
 315     }
 316
 317     for (i=0; i<2; i++) {
 318         if(left_type[i]){
 319             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 320             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 321             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 322             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 323         }else{
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 326             h->non_zero_count_cache[0+8*1 +   8*i]=
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 328         }
 329     }
 330
 331     if( h->pps.cabac ) {
 332         // top_cbp
 333         if(top_type) {
 334             h->top_cbp = h->cbp_table[top_xy];
 335         } else if(IS_INTRA(mb_type)) {
 336             h->top_cbp = 0x1C0;
 337         } else {
 338             h->top_cbp = 0;
 339         }
 340         // left_cbp
 341         if (left_type[0]) {
 342             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 343         } else if(IS_INTRA(mb_type)) {
 344             h->left_cbp = 0x1C0;
 345         } else {
 346             h->left_cbp = 0;
 347         }
 348         if (left_type[0]) {
 349             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 350         }
 351         if (left_type[1]) {
 352             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 353         }
 354     }
 355
 356 #if 1
 357     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 358         int list;
 359         for(list=0; list<h->list_count; list++){
 360             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 361                 /*if(!h->mv_cache_clean[list]){
 362                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 363                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 364                     h->mv_cache_clean[list]= 1;
 365                 }*/
 366                 continue;
 367             }
 368             h->mv_cache_clean[list]= 0;
 369
 370             if(USES_LIST(top_type, list)){
 371                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 372                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 373                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 377                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 378                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 379                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 381             }else{
 382                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 386                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 387             }
 388
 389             for(i=0; i<2; i++){
 390                 int cache_idx = scan8[0] - 1 + i*2*8;
 391                 if(USES_LIST(left_type[i], list)){
 392                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 393                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 394                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 395                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 396                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 397                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 398                 }else{
 399                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 400                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 401                     h->ref_cache[list][cache_idx  ]=
 402                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 403                 }
 404             }
 405
 406             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 407                 continue;
 408
 409             if(USES_LIST(topleft_type, list)){
 410                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 411                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 412                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 413                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 414             }else{
 415                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 416                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 417             }
 418
 419             if(USES_LIST(topright_type, list)){
 420                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 421                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 422                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 423                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 424             }else{
 425                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 426                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 427             }
 428
 429             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 430                 continue;
 431
 432             h->ref_cache[list][scan8[5 ]+1] =
 433             h->ref_cache[list][scan8[7 ]+1] =
 434             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 435             h->ref_cache[list][scan8[4 ]] =
 436             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 437             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 438             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 439             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 440             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 441             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 442
 443             if( h->pps.cabac ) {
 444                 /* XXX beurk, Load mvd */
 445                 if(USES_LIST(top_type, list)){
 446                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 447                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 451                 }else{
 452                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 456                 }
 457                 if(USES_LIST(left_type[0], list)){
 458                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 459                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 460                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 461                 }else{
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[1], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 472                 }
 473                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 474                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 475                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 476                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 478
 479                 if(h->slice_type_nos == FF_B_TYPE){
 480                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 481
 482                     if(IS_DIRECT(top_type)){
 483                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 484                     }else if(IS_8X8(top_type)){
 485                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 486                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 487                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 488                     }else{
 489                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 490                     }
 491
 492                     if(IS_DIRECT(left_type[0]))
 493                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 494                     else if(IS_8X8(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 496                     else
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 498
 499                     if(IS_DIRECT(left_type[1]))
 500                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 501                     else if(IS_8X8(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 503                     else
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 505                 }
 506             }
 507
 508             if(FRAME_MBAFF){
 509 #define MAP_MVS\
 510                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 511                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 512                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 516                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 517                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 518                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 519                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 520                 if(MB_FIELD){
 521 #define MAP_F2F(idx, mb_type)\
 522                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 523                         h->ref_cache[list][idx] <<= 1;\
 524                         h->mv_cache[list][idx][1] /= 2;\
 525                         h->mvd_cache[list][idx][1] /= 2;\
 526                     }
 527                     MAP_MVS
 528 #undef MAP_F2F
 529                 }else{
 530 #define MAP_F2F(idx, mb_type)\
 531                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 532                         h->ref_cache[list][idx] >>= 1;\
 533                         h->mv_cache[list][idx][1] <<= 1;\
 534                         h->mvd_cache[list][idx][1] <<= 1;\
 535                     }
 536                     MAP_MVS
 537 #undef MAP_F2F
 538                 }
 539             }
 540         }
 541     }
 542 #endif
 543
 544     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 545 }
 546
 547 static inline void write_back_intra_pred_mode(H264Context *h){
 548     const int mb_xy= h->mb_xy;
 549
 550     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 551     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 552     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 553     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 554     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 555     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 556     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 557 }
 558
 559 /**
 560  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 561  */
 562 static inline int check_intra4x4_pred_mode(H264Context *h){
 563     MpegEncContext * const s = &h->s;
 564     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 565     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 566     int i;
 567
 568     if(!(h->top_samples_available&0x8000)){
 569         for(i=0; i<4; i++){
 570             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 571             if(status<0){
 572                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 573                 return -1;
 574             } else if(status){
 575                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 576             }
 577         }
 578     }
 579
 580     if((h->left_samples_available&0x8888)!=0x8888){
 581         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 582         for(i=0; i<4; i++){
 583             if(!(h->left_samples_available&mask[i])){
 584                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 585                 if(status<0){
 586                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 587                     return -1;
 588                 } else if(status){
 589                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 590                 }
 591             }
 592         }
 593     }
 594
 595     return 0;
 596 } //FIXME cleanup like next
 597
 598 /**
 599  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 600  */
 601 static inline int check_intra_pred_mode(H264Context *h, int mode){
 602     MpegEncContext * const s = &h->s;
 603     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 604     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 605
 606     if(mode > 6U) {
 607         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 608         return -1;
 609     }
 610
 611     if(!(h->top_samples_available&0x8000)){
 612         mode= top[ mode ];
 613         if(mode<0){
 614             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 615             return -1;
 616         }
 617     }
 618
 619     if((h->left_samples_available&0x8080) != 0x8080){
 620         mode= left[ mode ];
 621         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 622             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 623         }
 624         if(mode<0){
 625             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 626             return -1;
 627         }
 628     }
 629
 630     return mode;
 631 }
 632
 633 /**
 634  * gets the predicted intra4x4 prediction mode.
 635  */
 636 static inline int pred_intra_mode(H264Context *h, int n){
 637     const int index8= scan8[n];
 638     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 639     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 640     const int min= FFMIN(left, top);
 641
 642     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 643
 644     if(min<0) return DC_PRED;
 645     else      return min;
 646 }
 647
 648 static inline void write_back_non_zero_count(H264Context *h){
 649     const int mb_xy= h->mb_xy;
 650
 651     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 652     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 653     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 654     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 655     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 656     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 657     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 658
 659     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 660     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 661     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 662
 663     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 664     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 665     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 666 }
 667
 668 /**
 669  * gets the predicted number of non-zero coefficients.
 670  * @param n block index
 671  */
 672 static inline int pred_non_zero_count(H264Context *h, int n){
 673     const int index8= scan8[n];
 674     const int left= h->non_zero_count_cache[index8 - 1];
 675     const int top = h->non_zero_count_cache[index8 - 8];
 676     int i= left + top;
 677
 678     if(i<64) i= (i+1)>>1;
 679
 680     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 681
 682     return i&31;
 683 }
 684
 685 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 686     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 687     MpegEncContext *s = &h->s;
 688
 689     /* there is no consistent mapping of mvs to neighboring locations that will
 690      * make mbaff happy, so we can't move all this logic to fill_caches */
 691     if(FRAME_MBAFF){
 692         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 693         const int16_t *mv;
 694         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 695         *C = h->mv_cache[list][scan8[0]-2];
 696
 697         if(!MB_FIELD
 698            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 699             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 700             if(IS_INTERLACED(mb_types[topright_xy])){
 701 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 702                 const int x4 = X4, y4 = Y4;\
 703                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 704                 if(!USES_LIST(mb_type,list))\
 705                     return LIST_NOT_USED;\
 706                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 707                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 708                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 709                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 710
 711                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 712             }
 713         }
 714         if(topright_ref == PART_NOT_AVAILABLE
 715            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 716            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 717             if(!MB_FIELD
 718                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 720             }
 721             if(MB_FIELD
 722                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 723                && i >= scan8[0]+8){
 724                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 725                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 726             }
 727         }
 728 #undef SET_DIAG_MV
 729     }
 730
 731     if(topright_ref != PART_NOT_AVAILABLE){
 732         *C= h->mv_cache[list][ i - 8 + part_width ];
 733         return topright_ref;
 734     }else{
 735         tprintf(s->avctx, "topright MV not available\n");
 736
 737         *C= h->mv_cache[list][ i - 8 - 1 ];
 738         return h->ref_cache[list][ i - 8 - 1 ];
 739     }
 740 }
 741
 742 /**
 743  * gets the predicted MV.
 744  * @param n the block index
 745  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 746  * @param mx the x component of the predicted motion vector
 747  * @param my the y component of the predicted motion vector
 748  */
 749 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 750     const int index8= scan8[n];
 751     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 752     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 753     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 754     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 755     const int16_t * C;
 756     int diagonal_ref, match_count;
 757
 758     assert(part_width==1 || part_width==2 || part_width==4);
 759
 760 /* mv_cache
 761   B . . A T T T T
 762   U . . L . . , .
 763   U . . L . . . .
 764   U . . L . . , .
 765   . . . L . . . .
 766 */
 767
 768     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 769     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 770     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 771     if(match_count > 1){ //most common
 772         *mx= mid_pred(A[0], B[0], C[0]);
 773         *my= mid_pred(A[1], B[1], C[1]);
 774     }else if(match_count==1){
 775         if(left_ref==ref){
 776             *mx= A[0];
 777             *my= A[1];
 778         }else if(top_ref==ref){
 779             *mx= B[0];
 780             *my= B[1];
 781         }else{
 782             *mx= C[0];
 783             *my= C[1];
 784         }
 785     }else{
 786         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 787             *mx= A[0];
 788             *my= A[1];
 789         }else{
 790             *mx= mid_pred(A[0], B[0], C[0]);
 791             *my= mid_pred(A[1], B[1], C[1]);
 792         }
 793     }
 794
 795     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 796 }
 797
 798 /**
 799  * gets the directionally predicted 16x8 MV.
 800  * @param n the block index
 801  * @param mx the x component of the predicted motion vector
 802  * @param my the y component of the predicted motion vector
 803  */
 804 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 805     if(n==0){
 806         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 807         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 808
 809         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 810
 811         if(top_ref == ref){
 812             *mx= B[0];
 813             *my= B[1];
 814             return;
 815         }
 816     }else{
 817         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 818         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 819
 820         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 821
 822         if(left_ref == ref){
 823             *mx= A[0];
 824             *my= A[1];
 825             return;
 826         }
 827     }
 828
 829     //RARE
 830     pred_motion(h, n, 4, list, ref, mx, my);
 831 }
 832
 833 /**
 834  * gets the directionally predicted 8x16 MV.
 835  * @param n the block index
 836  * @param mx the x component of the predicted motion vector
 837  * @param my the y component of the predicted motion vector
 838  */
 839 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 840     if(n==0){
 841         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 842         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 843
 844         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 845
 846         if(left_ref == ref){
 847             *mx= A[0];
 848             *my= A[1];
 849             return;
 850         }
 851     }else{
 852         const int16_t * C;
 853         int diagonal_ref;
 854
 855         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 856
 857         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 858
 859         if(diagonal_ref == ref){
 860             *mx= C[0];
 861             *my= C[1];
 862             return;
 863         }
 864     }
 865
 866     //RARE
 867     pred_motion(h, n, 2, list, ref, mx, my);
 868 }
 869
 870 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 871     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 872     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 873
 874     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 875
 876     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 877        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 878        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 879
 880         *mx = *my = 0;
 881         return;
 882     }
 883
 884     pred_motion(h, 0, 4, 0, 0, mx, my);
 885
 886     return;
 887 }
 888
 889 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 890     int poc0 = h->ref_list[0][i].poc;
 891     int td = av_clip(poc1 - poc0, -128, 127);
 892     if(td == 0 || h->ref_list[0][i].long_ref){
 893         return 256;
 894     }else{
 895         int tb = av_clip(poc - poc0, -128, 127);
 896         int tx = (16384 + (FFABS(td) >> 1)) / td;
 897         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 898     }
 899 }
 900
 901 static inline void direct_dist_scale_factor(H264Context * const h){
 902     MpegEncContext * const s = &h->s;
 903     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 904     const int poc1 = h->ref_list[1][0].poc;
 905     int i, field;
 906     for(field=0; field<2; field++){
 907         const int poc  = h->s.current_picture_ptr->field_poc[field];
 908         const int poc1 = h->ref_list[1][0].field_poc[field];
 909         for(i=0; i < 2*h->ref_count[0]; i++)
 910             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 911     }
 912
 913     for(i=0; i<h->ref_count[0]; i++){
 914         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 915     }
 916 }
 917
 918 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     int j, old_ref, rfield;
 922     int start= mbafi ? 16                      : 0;
 923     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 924     int interl= mbafi || s->picture_structure != PICT_FRAME;
 925
 926     /* bogus; fills in for missing frames */
 927     memset(map[list], 0, sizeof(map[list]));
 928
 929     for(rfield=0; rfield<2; rfield++){
 930         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 931             int poc = ref1->ref_poc[colfield][list][old_ref];
 932
 933             if     (!interl)
 934                 poc |= 3;
 935             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 936                 poc= (poc&~3) + rfield + 1;
 937
 938             for(j=start; j<end; j++){
 939                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 940                     int cur_ref= mbafi ? (j-16)^field : j;
 941                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 942                     if(rfield == field)
 943                         map[list][old_ref] = cur_ref;
 944                     break;
 945                 }
 946             }
 947         }
 948     }
 949 }
 950
 951 static inline void direct_ref_list_init(H264Context * const h){
 952     MpegEncContext * const s = &h->s;
 953     Picture * const ref1 = &h->ref_list[1][0];
 954     Picture * const cur = s->current_picture_ptr;
 955     int list, j, field;
 956     int sidx= (s->picture_structure&1)^1;
 957     int ref1sidx= (ref1->reference&1)^1;
 958
 959     for(list=0; list<2; list++){
 960         cur->ref_count[sidx][list] = h->ref_count[list];
 961         for(j=0; j<h->ref_count[list]; j++)
 962             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 963     }
 964
 965     if(s->picture_structure == PICT_FRAME){
 966         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 967         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 968     }
 969
 970     cur->mbaff= FRAME_MBAFF;
 971
 972     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 973         return;
 974
 975     for(list=0; list<2; list++){
 976         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 977         for(field=0; field<2; field++)
 978             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 979     }
 980 }
 981
 982 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 983     MpegEncContext * const s = &h->s;
 984     int b8_stride = h->b8_stride;
 985     int b4_stride = h->b_stride;
 986     int mb_xy = h->mb_xy;
 987     int mb_type_col[2];
 988     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 989     const int8_t *l1ref0, *l1ref1;
 990     const int is_b8x8 = IS_8X8(*mb_type);
 991     unsigned int sub_mb_type;
 992     int i8, i4;
 993
 994 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 995
 996     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 997         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
 998             int cur_poc = s->current_picture_ptr->poc;
 999             int *col_poc = h->ref_list[1]->field_poc;
1000             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1001             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1002             b8_stride = 0;
1003         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1004             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1005             mb_xy += s->mb_stride*fieldoff;
1006         }
1007         goto single_col;
1008     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1009         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1010             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1011             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1012             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1013             b8_stride *= 3;
1014             b4_stride *= 6;
1015             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1016             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1017                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1018                 && !is_b8x8){
1019                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1020                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1021             }else{
1022                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1023                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1024             }
1025         }else{                                           //     AFR/FR    -> AFR/FR
1026 single_col:
1027             mb_type_col[0] =
1028             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1029             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1030                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1031                 * so we know exactly what block size to use */
1032                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1033                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1034             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1035                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1036                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1037             }else{
1038                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1039                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1040             }
1041         }
1042     }
1043
1044     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1045     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1046     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1047     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1048     if(!b8_stride){
1049         if(s->mb_y&1){
1050             l1ref0 += h->b8_stride;
1051             l1ref1 += h->b8_stride;
1052             l1mv0  +=  2*b4_stride;
1053             l1mv1  +=  2*b4_stride;
1054         }
1055     }
1056
1057     if(h->direct_spatial_mv_pred){
1058         int ref[2];
1059         int mv[2][2];
1060         int list;
1061
1062         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1063
1064         /* ref = min(neighbors) */
1065         for(list=0; list<2; list++){
1066             int refa = h->ref_cache[list][scan8[0] - 1];
1067             int refb = h->ref_cache[list][scan8[0] - 8];
1068             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1069             if(refc == PART_NOT_AVAILABLE)
1070                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1071             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1072             if(ref[list] < 0)
1073                 ref[list] = -1;
1074         }
1075
1076         if(ref[0] < 0 && ref[1] < 0){
1077             ref[0] = ref[1] = 0;
1078             mv[0][0] = mv[0][1] =
1079             mv[1][0] = mv[1][1] = 0;
1080         }else{
1081             for(list=0; list<2; list++){
1082                 if(ref[list] >= 0)
1083                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1084                 else
1085                     mv[list][0] = mv[list][1] = 0;
1086             }
1087         }
1088
1089         if(ref[1] < 0){
1090             if(!is_b8x8)
1091                 *mb_type &= ~MB_TYPE_L1;
1092             sub_mb_type &= ~MB_TYPE_L1;
1093         }else if(ref[0] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L0;
1096             sub_mb_type &= ~MB_TYPE_L0;
1097         }
1098
1099         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1100             for(i8=0; i8<4; i8++){
1101                 int x8 = i8&1;
1102                 int y8 = i8>>1;
1103                 int xy8 = x8+y8*b8_stride;
1104                 int xy4 = 3*x8+y8*b4_stride;
1105                 int a=0, b=0;
1106
1107                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1108                     continue;
1109                 h->sub_mb_type[i8] = sub_mb_type;
1110
1111                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1112                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1113                 if(!IS_INTRA(mb_type_col[y8])
1114                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1115                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1116                     if(ref[0] > 0)
1117                         a= pack16to32(mv[0][0],mv[0][1]);
1118                     if(ref[1] > 0)
1119                         b= pack16to32(mv[1][0],mv[1][1]);
1120                 }else{
1121                     a= pack16to32(mv[0][0],mv[0][1]);
1122                     b= pack16to32(mv[1][0],mv[1][1]);
1123                 }
1124                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1125                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1126             }
1127         }else if(IS_16X16(*mb_type)){
1128             int a=0, b=0;
1129
1130             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1131             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1132             if(!IS_INTRA(mb_type_col[0])
1133                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1134                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1135                        && (h->x264_build>33 || !h->x264_build)))){
1136                 if(ref[0] > 0)
1137                     a= pack16to32(mv[0][0],mv[0][1]);
1138                 if(ref[1] > 0)
1139                     b= pack16to32(mv[1][0],mv[1][1]);
1140             }else{
1141                 a= pack16to32(mv[0][0],mv[0][1]);
1142                 b= pack16to32(mv[1][0],mv[1][1]);
1143             }
1144             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1145             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1146         }else{
1147             for(i8=0; i8<4; i8++){
1148                 const int x8 = i8&1;
1149                 const int y8 = i8>>1;
1150
1151                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1152                     continue;
1153                 h->sub_mb_type[i8] = sub_mb_type;
1154
1155                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1156                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1157                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1158                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1159
1160                 /* col_zero_flag */
1161                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1162                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1163                                                   && (h->x264_build>33 || !h->x264_build)))){
1164                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1165                     if(IS_SUB_8X8(sub_mb_type)){
1166                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1167                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1168                             if(ref[0] == 0)
1169                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1170                             if(ref[1] == 0)
1171                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1172                         }
1173                     }else
1174                     for(i4=0; i4<4; i4++){
1175                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1176                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1177                             if(ref[0] == 0)
1178                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1179                             if(ref[1] == 0)
1180                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1181                         }
1182                     }
1183                 }
1184             }
1185         }
1186     }else{ /* direct temporal mv pred */
1187         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1188         const int *dist_scale_factor = h->dist_scale_factor;
1189         int ref_offset= 0;
1190
1191         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1192             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1193             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1194             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1195         }
1196         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1197             ref_offset += 16;
1198
1199         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1200             /* FIXME assumes direct_8x8_inference == 1 */
1201             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1202
1203             for(i8=0; i8<4; i8++){
1204                 const int x8 = i8&1;
1205                 const int y8 = i8>>1;
1206                 int ref0, scale;
1207                 const int16_t (*l1mv)[2]= l1mv0;
1208
1209                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1210                     continue;
1211                 h->sub_mb_type[i8] = sub_mb_type;
1212
1213                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1214                 if(IS_INTRA(mb_type_col[y8])){
1215                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1216                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1217                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1218                     continue;
1219                 }
1220
1221                 ref0 = l1ref0[x8 + y8*b8_stride];
1222                 if(ref0 >= 0)
1223                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1224                 else{
1225                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1226                     l1mv= l1mv1;
1227                 }
1228                 scale = dist_scale_factor[ref0];
1229                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1230
1231                 {
1232                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1233                     int my_col = (mv_col[1]<<y_shift)/2;
1234                     int mx = (scale * mv_col[0] + 128) >> 8;
1235                     int my = (scale * my_col + 128) >> 8;
1236                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1237                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1238                 }
1239             }
1240             return;
1241         }
1242
1243         /* one-to-one mv scaling */
1244
1245         if(IS_16X16(*mb_type)){
1246             int ref, mv0, mv1;
1247
1248             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1249             if(IS_INTRA(mb_type_col[0])){
1250                 ref=mv0=mv1=0;
1251             }else{
1252                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1253                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1254                 const int scale = dist_scale_factor[ref0];
1255                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1256                 int mv_l0[2];
1257                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1258                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1259                 ref= ref0;
1260                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1261                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1262             }
1263             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1264             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1265             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1266         }else{
1267             for(i8=0; i8<4; i8++){
1268                 const int x8 = i8&1;
1269                 const int y8 = i8>>1;
1270                 int ref0, scale;
1271                 const int16_t (*l1mv)[2]= l1mv0;
1272
1273                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1274                     continue;
1275                 h->sub_mb_type[i8] = sub_mb_type;
1276                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1277                 if(IS_INTRA(mb_type_col[0])){
1278                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1279                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1280                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1281                     continue;
1282                 }
1283
1284                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1285                 if(ref0 >= 0)
1286                     ref0 = map_col_to_list0[0][ref0];
1287                 else{
1288                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1289                     l1mv= l1mv1;
1290                 }
1291                 scale = dist_scale_factor[ref0];
1292
1293                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1294                 if(IS_SUB_8X8(sub_mb_type)){
1295                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1296                     int mx = (scale * mv_col[0] + 128) >> 8;
1297                     int my = (scale * mv_col[1] + 128) >> 8;
1298                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1299                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1300                 }else
1301                 for(i4=0; i4<4; i4++){
1302                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1303                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1304                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1305                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1306                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1307                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1308                 }
1309             }
1310         }
1311     }
1312 }
1313
1314 static inline void write_back_motion(H264Context *h, int mb_type){
1315     MpegEncContext * const s = &h->s;
1316     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1317     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1318     int list;
1319
1320     if(!USES_LIST(mb_type, 0))
1321         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1322
1323     for(list=0; list<h->list_count; list++){
1324         int y;
1325         if(!USES_LIST(mb_type, list))
1326             continue;
1327
1328         for(y=0; y<4; y++){
1329             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1330             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1331         }
1332         if( h->pps.cabac ) {
1333             if(IS_SKIP(mb_type))
1334                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1335             else
1336             for(y=0; y<4; y++){
1337                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1338                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1339             }
1340         }
1341
1342         {
1343             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1344             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1345             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1346             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1347             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1348         }
1349     }
1350
1351     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1352         if(IS_8X8(mb_type)){
1353             uint8_t *direct_table = &h->direct_table[b8_xy];
1354             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1355             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1356             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1357         }
1358     }
1359 }
1360
1361 /**
1362  * Decodes a network abstraction layer unit.
1363  * @param consumed is the number of bytes used as input
1364  * @param length is the length of the array
1365  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1366  * @returns decoded bytes, might be src+1 if no escapes
1367  */
1368 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1369     int i, si, di;
1370     uint8_t *dst;
1371     int bufidx;
1372
1373 //    src[0]&0x80;                //forbidden bit
1374     h->nal_ref_idc= src[0]>>5;
1375     h->nal_unit_type= src[0]&0x1F;
1376
1377     src++; length--;
1378 #if 0
1379     for(i=0; i<length; i++)
1380         printf("%2X ", src[i]);
1381 #endif
1382
1383 #ifdef HAVE_FAST_UNALIGNED
1384 # ifdef HAVE_FAST_64BIT
1385 #   define RS 7
1386     for(i=0; i+1<length; i+=9){
1387         if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1388 # else
1389 #   define RS 3
1390     for(i=0; i+1<length; i+=5){
1391         if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1392 # endif
1393             continue;
1394         if(i>0 && !src[i]) i--;
1395         while(src[i]) i++;
1396 #else
1397 #   define RS 0
1398     for(i=0; i+1<length; i+=2){
1399         if(src[i]) continue;
1400         if(i>0 && src[i-1]==0) i--;
1401 #endif
1402         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1403             if(src[i+2]!=3){
1404                 /* startcode, so we must be past the end */
1405                 length=i;
1406             }
1407             break;
1408         }
1409         i-= RS;
1410     }
1411
1412     if(i>=length-1){ //no escaped 0
1413         *dst_length= length;
1414         *consumed= length+1; //+1 for the header
1415         return src;
1416     }
1417
1418     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1419     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1420     dst= h->rbsp_buffer[bufidx];
1421
1422     if (dst == NULL){
1423         return NULL;
1424     }
1425
1426 //printf("decoding esc\n");
1427     memcpy(dst, src, i);
1428     si=di=i;
1429     while(si+2<length){
1430         //remove escapes (very rare 1:2^22)
1431         if(src[si+2]>3){
1432             dst[di++]= src[si++];
1433             dst[di++]= src[si++];
1434         }else if(src[si]==0 && src[si+1]==0){
1435             if(src[si+2]==3){ //escape
1436                 dst[di++]= 0;
1437                 dst[di++]= 0;
1438                 si+=3;
1439                 continue;
1440             }else //next start code
1441                 goto nsc;
1442         }
1443
1444         dst[di++]= src[si++];
1445     }
1446     while(si<length)
1447         dst[di++]= src[si++];
1448 nsc:
1449
1450     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1451
1452     *dst_length= di;
1453     *consumed= si + 1;//+1 for the header
1454 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1455     return dst;
1456 }
1457
1458 /**
1459  * identifies the exact end of the bitstream
1460  * @return the length of the trailing, or 0 if damaged
1461  */
1462 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1463     int v= *src;
1464     int r;
1465
1466     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1467
1468     for(r=1; r<9; r++){
1469         if(v&1) return r;
1470         v>>=1;
1471     }
1472     return 0;
1473 }
1474
1475 /**
1476  * IDCT transforms the 16 dc values and dequantizes them.
1477  * @param qp quantization parameter
1478  */
1479 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1480 #define stride 16
1481     int i;
1482     int temp[16]; //FIXME check if this is a good idea
1483     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1484     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1485
1486 //memset(block, 64, 2*256);
1487 //return;
1488     for(i=0; i<4; i++){
1489         const int offset= y_offset[i];
1490         const int z0= block[offset+stride*0] + block[offset+stride*4];
1491         const int z1= block[offset+stride*0] - block[offset+stride*4];
1492         const int z2= block[offset+stride*1] - block[offset+stride*5];
1493         const int z3= block[offset+stride*1] + block[offset+stride*5];
1494
1495         temp[4*i+0]= z0+z3;
1496         temp[4*i+1]= z1+z2;
1497         temp[4*i+2]= z1-z2;
1498         temp[4*i+3]= z0-z3;
1499     }
1500
1501     for(i=0; i<4; i++){
1502         const int offset= x_offset[i];
1503         const int z0= temp[4*0+i] + temp[4*2+i];
1504         const int z1= temp[4*0+i] - temp[4*2+i];
1505         const int z2= temp[4*1+i] - temp[4*3+i];
1506         const int z3= temp[4*1+i] + temp[4*3+i];
1507
1508         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1509         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1510         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1511         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1512     }
1513 }
1514
1515 #if 0
1516 /**
1517  * DCT transforms the 16 dc values.
1518  * @param qp quantization parameter ??? FIXME
1519  */
1520 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1521 //    const int qmul= dequant_coeff[qp][0];
1522     int i;
1523     int temp[16]; //FIXME check if this is a good idea
1524     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1525     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1526
1527     for(i=0; i<4; i++){
1528         const int offset= y_offset[i];
1529         const int z0= block[offset+stride*0] + block[offset+stride*4];
1530         const int z1= block[offset+stride*0] - block[offset+stride*4];
1531         const int z2= block[offset+stride*1] - block[offset+stride*5];
1532         const int z3= block[offset+stride*1] + block[offset+stride*5];
1533
1534         temp[4*i+0]= z0+z3;
1535         temp[4*i+1]= z1+z2;
1536         temp[4*i+2]= z1-z2;
1537         temp[4*i+3]= z0-z3;
1538     }
1539
1540     for(i=0; i<4; i++){
1541         const int offset= x_offset[i];
1542         const int z0= temp[4*0+i] + temp[4*2+i];
1543         const int z1= temp[4*0+i] - temp[4*2+i];
1544         const int z2= temp[4*1+i] - temp[4*3+i];
1545         const int z3= temp[4*1+i] + temp[4*3+i];
1546
1547         block[stride*0 +offset]= (z0 + z3)>>1;
1548         block[stride*2 +offset]= (z1 + z2)>>1;
1549         block[stride*8 +offset]= (z1 - z2)>>1;
1550         block[stride*10+offset]= (z0 - z3)>>1;
1551     }
1552 }
1553 #endif
1554
1555 #undef xStride
1556 #undef stride
1557
1558 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1559     const int stride= 16*2;
1560     const int xStride= 16;
1561     int a,b,c,d,e;
1562
1563     a= block[stride*0 + xStride*0];
1564     b= block[stride*0 + xStride*1];
1565     c= block[stride*1 + xStride*0];
1566     d= block[stride*1 + xStride*1];
1567
1568     e= a-b;
1569     a= a+b;
1570     b= c-d;
1571     c= c+d;
1572
1573     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1574     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1575     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1576     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1577 }
1578
1579 #if 0
1580 static void chroma_dc_dct_c(DCTELEM *block){
1581     const int stride= 16*2;
1582     const int xStride= 16;
1583     int a,b,c,d,e;
1584
1585     a= block[stride*0 + xStride*0];
1586     b= block[stride*0 + xStride*1];
1587     c= block[stride*1 + xStride*0];
1588     d= block[stride*1 + xStride*1];
1589
1590     e= a-b;
1591     a= a+b;
1592     b= c-d;
1593     c= c+d;
1594
1595     block[stride*0 + xStride*0]= (a+c);
1596     block[stride*0 + xStride*1]= (e+b);
1597     block[stride*1 + xStride*0]= (a-c);
1598     block[stride*1 + xStride*1]= (e-b);
1599 }
1600 #endif
1601
1602 /**
1603  * gets the chroma qp.
1604  */
1605 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1606     return h->pps.chroma_qp_table[t][qscale];
1607 }
1608
1609 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1610                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1611                            int src_x_offset, int src_y_offset,
1612                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1613     MpegEncContext * const s = &h->s;
1614     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1615     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1616     const int luma_xy= (mx&3) + ((my&3)<<2);
1617     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1618     uint8_t * src_cb, * src_cr;
1619     int extra_width= h->emu_edge_width;
1620     int extra_height= h->emu_edge_height;
1621     int emu=0;
1622     const int full_mx= mx>>2;
1623     const int full_my= my>>2;
1624     const int pic_width  = 16*s->mb_width;
1625     const int pic_height = 16*s->mb_height >> MB_FIELD;
1626
1627     if(mx&7) extra_width -= 3;
1628     if(my&7) extra_height -= 3;
1629
1630     if(   full_mx < 0-extra_width
1631        || full_my < 0-extra_height
1632        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1633        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1634         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1635             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1636         emu=1;
1637     }
1638
1639     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1640     if(!square){
1641         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1642     }
1643
1644     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1645
1646     if(MB_FIELD){
1647         // chroma offset when predicting from a field of opposite parity
1648         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1649         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1650     }
1651     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1652     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1653
1654     if(emu){
1655         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1656             src_cb= s->edge_emu_buffer;
1657     }
1658     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1659
1660     if(emu){
1661         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1662             src_cr= s->edge_emu_buffer;
1663     }
1664     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1665 }
1666
1667 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1668                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1669                            int x_offset, int y_offset,
1670                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1671                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1672                            int list0, int list1){
1673     MpegEncContext * const s = &h->s;
1674     qpel_mc_func *qpix_op=  qpix_put;
1675     h264_chroma_mc_func chroma_op= chroma_put;
1676
1677     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1678     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1679     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1680     x_offset += 8*s->mb_x;
1681     y_offset += 8*(s->mb_y >> MB_FIELD);
1682
1683     if(list0){
1684         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1685         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1686                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1687                            qpix_op, chroma_op);
1688
1689         qpix_op=  qpix_avg;
1690         chroma_op= chroma_avg;
1691     }
1692
1693     if(list1){
1694         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1695         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1696                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1697                            qpix_op, chroma_op);
1698     }
1699 }
1700
1701 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1702                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1703                            int x_offset, int y_offset,
1704                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1705                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1706                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1707                            int list0, int list1){
1708     MpegEncContext * const s = &h->s;
1709
1710     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1711     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1712     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1713     x_offset += 8*s->mb_x;
1714     y_offset += 8*(s->mb_y >> MB_FIELD);
1715
1716     if(list0 && list1){
1717         /* don't optimize for luma-only case, since B-frames usually
1718          * use implicit weights => chroma too. */
1719         uint8_t *tmp_cb = s->obmc_scratchpad;
1720         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1721         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1722         int refn0 = h->ref_cache[0][ scan8[n] ];
1723         int refn1 = h->ref_cache[1][ scan8[n] ];
1724
1725         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1726                     dest_y, dest_cb, dest_cr,
1727                     x_offset, y_offset, qpix_put, chroma_put);
1728         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1729                     tmp_y, tmp_cb, tmp_cr,
1730                     x_offset, y_offset, qpix_put, chroma_put);
1731
1732         if(h->use_weight == 2){
1733             int weight0 = h->implicit_weight[refn0][refn1];
1734             int weight1 = 64 - weight0;
1735             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1736             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1737             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1738         }else{
1739             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1740                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1741                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1742             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1743                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1744                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1745             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1746                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1747                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1748         }
1749     }else{
1750         int list = list1 ? 1 : 0;
1751         int refn = h->ref_cache[list][ scan8[n] ];
1752         Picture *ref= &h->ref_list[list][refn];
1753         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1754                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1755                     qpix_put, chroma_put);
1756
1757         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1758                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1759         if(h->use_weight_chroma){
1760             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1761                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1762             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1763                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1764         }
1765     }
1766 }
1767
1768 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1769                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1770                            int x_offset, int y_offset,
1771                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1772                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1773                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1774                            int list0, int list1){
1775     if((h->use_weight==2 && list0 && list1
1776         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1777        || h->use_weight==1)
1778         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1779                          x_offset, y_offset, qpix_put, chroma_put,
1780                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1781     else
1782         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1783                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1784 }
1785
1786 static inline void prefetch_motion(H264Context *h, int list){
1787     /* fetch pixels for estimated mv 4 macroblocks ahead
1788      * optimized for 64byte cache lines */
1789     MpegEncContext * const s = &h->s;
1790     const int refn = h->ref_cache[list][scan8[0]];
1791     if(refn >= 0){
1792         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1793         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1794         uint8_t **src= h->ref_list[list][refn].data;
1795         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1796         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1797         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1798         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1799     }
1800 }
1801
1802 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1803                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1804                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1805                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1806     MpegEncContext * const s = &h->s;
1807     const int mb_xy= h->mb_xy;
1808     const int mb_type= s->current_picture.mb_type[mb_xy];
1809
1810     assert(IS_INTER(mb_type));
1811
1812     prefetch_motion(h, 0);
1813
1814     if(IS_16X16(mb_type)){
1815         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1816                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1817                 &weight_op[0], &weight_avg[0],
1818                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1819     }else if(IS_16X8(mb_type)){
1820         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1821                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1822                 &weight_op[1], &weight_avg[1],
1823                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1824         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1825                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1826                 &weight_op[1], &weight_avg[1],
1827                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1828     }else if(IS_8X16(mb_type)){
1829         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1830                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1831                 &weight_op[2], &weight_avg[2],
1832                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1833         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1834                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1835                 &weight_op[2], &weight_avg[2],
1836                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1837     }else{
1838         int i;
1839
1840         assert(IS_8X8(mb_type));
1841
1842         for(i=0; i<4; i++){
1843             const int sub_mb_type= h->sub_mb_type[i];
1844             const int n= 4*i;
1845             int x_offset= (i&1)<<2;
1846             int y_offset= (i&2)<<1;
1847
1848             if(IS_SUB_8X8(sub_mb_type)){
1849                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1850                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1851                     &weight_op[3], &weight_avg[3],
1852                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1853             }else if(IS_SUB_8X4(sub_mb_type)){
1854                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1855                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1856                     &weight_op[4], &weight_avg[4],
1857                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1858                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1859                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1860                     &weight_op[4], &weight_avg[4],
1861                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1862             }else if(IS_SUB_4X8(sub_mb_type)){
1863                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1864                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1865                     &weight_op[5], &weight_avg[5],
1866                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1867                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1868                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1869                     &weight_op[5], &weight_avg[5],
1870                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1871             }else{
1872                 int j;
1873                 assert(IS_SUB_4X4(sub_mb_type));
1874                 for(j=0; j<4; j++){
1875                     int sub_x_offset= x_offset + 2*(j&1);
1876                     int sub_y_offset= y_offset +   (j&2);
1877                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1878                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1879                         &weight_op[6], &weight_avg[6],
1880                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1881                 }
1882             }
1883         }
1884     }
1885
1886     prefetch_motion(h, 1);
1887 }
1888
1889 static av_cold void init_cavlc_level_tab(void){
1890     int suffix_length, mask;
1891     unsigned int i;
1892
1893     for(suffix_length=0; suffix_length<7; suffix_length++){
1894         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1895             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1896             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1897
1898             mask= -(level_code&1);
1899             level_code= (((2+level_code)>>1) ^ mask) - mask;
1900             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1901                 cavlc_level_tab[suffix_length][i][0]= level_code;
1902                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1903             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1904                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1905                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1906             }else{
1907                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1908                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1909             }
1910         }
1911     }
1912 }
1913
1914 static av_cold void decode_init_vlc(void){
1915     static int done = 0;
1916
1917     if (!done) {
1918         int i;
1919         int offset;
1920         done = 1;
1921
1922         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1923         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1924         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1925                  &chroma_dc_coeff_token_len [0], 1, 1,
1926                  &chroma_dc_coeff_token_bits[0], 1, 1,
1927                  INIT_VLC_USE_NEW_STATIC);
1928
1929         offset = 0;
1930         for(i=0; i<4; i++){
1931             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1932             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1933             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1934                      &coeff_token_len [i][0], 1, 1,
1935                      &coeff_token_bits[i][0], 1, 1,
1936                      INIT_VLC_USE_NEW_STATIC);
1937             offset += coeff_token_vlc_tables_size[i];
1938         }
1939         /*
1940          * This is a one time safety check to make sure that
1941          * the packed static coeff_token_vlc table sizes
1942          * were initialized correctly.
1943          */
1944         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1945
1946         for(i=0; i<3; i++){
1947             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1948             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1949             init_vlc(&chroma_dc_total_zeros_vlc[i],
1950                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1951                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1952                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1953                      INIT_VLC_USE_NEW_STATIC);
1954         }
1955         for(i=0; i<15; i++){
1956             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1957             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1958             init_vlc(&total_zeros_vlc[i],
1959                      TOTAL_ZEROS_VLC_BITS, 16,
1960                      &total_zeros_len [i][0], 1, 1,
1961                      &total_zeros_bits[i][0], 1, 1,
1962                      INIT_VLC_USE_NEW_STATIC);
1963         }
1964
1965         for(i=0; i<6; i++){
1966             run_vlc[i].table = run_vlc_tables[i];
1967             run_vlc[i].table_allocated = run_vlc_tables_size;
1968             init_vlc(&run_vlc[i],
1969                      RUN_VLC_BITS, 7,
1970                      &run_len [i][0], 1, 1,
1971                      &run_bits[i][0], 1, 1,
1972                      INIT_VLC_USE_NEW_STATIC);
1973         }
1974         run7_vlc.table = run7_vlc_table,
1975         run7_vlc.table_allocated = run7_vlc_table_size;
1976         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1977                  &run_len [6][0], 1, 1,
1978                  &run_bits[6][0], 1, 1,
1979                  INIT_VLC_USE_NEW_STATIC);
1980
1981         init_cavlc_level_tab();
1982     }
1983 }
1984
1985 static void free_tables(H264Context *h){
1986     int i;
1987     H264Context *hx;
1988     av_freep(&h->intra4x4_pred_mode);
1989     av_freep(&h->chroma_pred_mode_table);
1990     av_freep(&h->cbp_table);
1991     av_freep(&h->mvd_table[0]);
1992     av_freep(&h->mvd_table[1]);
1993     av_freep(&h->direct_table);
1994     av_freep(&h->non_zero_count);
1995     av_freep(&h->slice_table_base);
1996     h->slice_table= NULL;
1997
1998     av_freep(&h->mb2b_xy);
1999     av_freep(&h->mb2b8_xy);
2000
2001     for(i = 0; i < h->s.avctx->thread_count; i++) {
2002         hx = h->thread_context[i];
2003         if(!hx) continue;
2004         av_freep(&hx->top_borders[1]);
2005         av_freep(&hx->top_borders[0]);
2006         av_freep(&hx->s.obmc_scratchpad);
2007     }
2008 }
2009
2010 static void init_dequant8_coeff_table(H264Context *h){
2011     int i,q,x;
2012     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2013     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2014     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2015
2016     for(i=0; i<2; i++ ){
2017         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2018             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2019             break;
2020         }
2021
2022         for(q=0; q<52; q++){
2023             int shift = div6[q];
2024             int idx = rem6[q];
2025             for(x=0; x<64; x++)
2026                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2027                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2028                     h->pps.scaling_matrix8[i][x]) << shift;
2029         }
2030     }
2031 }
2032
2033 static void init_dequant4_coeff_table(H264Context *h){
2034     int i,j,q,x;
2035     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2036     for(i=0; i<6; i++ ){
2037         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2038         for(j=0; j<i; j++){
2039             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2040                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2041                 break;
2042             }
2043         }
2044         if(j<i)
2045             continue;
2046
2047         for(q=0; q<52; q++){
2048             int shift = div6[q] + 2;
2049             int idx = rem6[q];
2050             for(x=0; x<16; x++)
2051                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2052                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2053                     h->pps.scaling_matrix4[i][x]) << shift;
2054         }
2055     }
2056 }
2057
2058 static void init_dequant_tables(H264Context *h){
2059     int i,x;
2060     init_dequant4_coeff_table(h);
2061     if(h->pps.transform_8x8_mode)
2062         init_dequant8_coeff_table(h);
2063     if(h->sps.transform_bypass){
2064         for(i=0; i<6; i++)
2065             for(x=0; x<16; x++)
2066                 h->dequant4_coeff[i][0][x] = 1<<6;
2067         if(h->pps.transform_8x8_mode)
2068             for(i=0; i<2; i++)
2069                 for(x=0; x<64; x++)
2070                     h->dequant8_coeff[i][0][x] = 1<<6;
2071     }
2072 }
2073
2074
2075 /**
2076  * allocates tables.
2077  * needs width/height
2078  */
2079 static int alloc_tables(H264Context *h){
2080     MpegEncContext * const s = &h->s;
2081     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2082     int x,y;
2083
2084     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2085
2086     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2087     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2088     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2089
2090     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2091     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2092     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2093     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2094
2095     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2096     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2097
2098     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2099     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2100     for(y=0; y<s->mb_height; y++){
2101         for(x=0; x<s->mb_width; x++){
2102             const int mb_xy= x + y*s->mb_stride;
2103             const int b_xy = 4*x + 4*y*h->b_stride;
2104             const int b8_xy= 2*x + 2*y*h->b8_stride;
2105
2106             h->mb2b_xy [mb_xy]= b_xy;
2107             h->mb2b8_xy[mb_xy]= b8_xy;
2108         }
2109     }
2110
2111     s->obmc_scratchpad = NULL;
2112
2113     if(!h->dequant4_coeff[0])
2114         init_dequant_tables(h);
2115
2116     return 0;
2117 fail:
2118     free_tables(h);
2119     return -1;
2120 }
2121
2122 /**
2123  * Mimic alloc_tables(), but for every context thread.
2124  */
2125 static void clone_tables(H264Context *dst, H264Context *src){
2126     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2127     dst->non_zero_count           = src->non_zero_count;
2128     dst->slice_table              = src->slice_table;
2129     dst->cbp_table                = src->cbp_table;
2130     dst->mb2b_xy                  = src->mb2b_xy;
2131     dst->mb2b8_xy                 = src->mb2b8_xy;
2132     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2133     dst->mvd_table[0]             = src->mvd_table[0];
2134     dst->mvd_table[1]             = src->mvd_table[1];
2135     dst->direct_table             = src->direct_table;
2136
2137     dst->s.obmc_scratchpad = NULL;
2138     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2139 }
2140
2141 /**
2142  * Init context
2143  * Allocate buffers which are not shared amongst multiple threads.
2144  */
2145 static int context_init(H264Context *h){
2146     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2147     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2148
2149     return 0;
2150 fail:
2151     return -1; // free_tables will clean up for us
2152 }
2153
2154 static av_cold void common_init(H264Context *h){
2155     MpegEncContext * const s = &h->s;
2156
2157     s->width = s->avctx->width;
2158     s->height = s->avctx->height;
2159     s->codec_id= s->avctx->codec->id;
2160
2161     ff_h264_pred_init(&h->hpc, s->codec_id);
2162
2163     h->dequant_coeff_pps= -1;
2164     s->unrestricted_mv=1;
2165     s->decode=1; //FIXME
2166
2167     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2168
2169     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2170     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2171 }
2172
2173 static av_cold int decode_init(AVCodecContext *avctx){
2174     H264Context *h= avctx->priv_data;
2175     MpegEncContext * const s = &h->s;
2176
2177     MPV_decode_defaults(s);
2178
2179     s->avctx = avctx;
2180     common_init(h);
2181
2182     s->out_format = FMT_H264;
2183     s->workaround_bugs= avctx->workaround_bugs;
2184
2185     // set defaults
2186 //    s->decode_mb= ff_h263_decode_mb;
2187     s->quarter_sample = 1;
2188     s->low_delay= 1;
2189
2190     if(avctx->codec_id == CODEC_ID_SVQ3)
2191         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2192     else if(avctx->codec_id == CODEC_ID_H264_VDPAU)
2193         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2194     else
2195         avctx->pix_fmt= PIX_FMT_YUV420P;
2196
2197     decode_init_vlc();
2198
2199     if(avctx->extradata_size > 0 && avctx->extradata &&
2200        *(char *)avctx->extradata == 1){
2201         h->is_avc = 1;
2202         h->got_avcC = 0;
2203     } else {
2204         h->is_avc = 0;
2205     }
2206
2207     h->thread_context[0] = h;
2208     h->outputed_poc = INT_MIN;
2209     h->prev_poc_msb= 1<<16;
2210     return 0;
2211 }
2212
2213 static int frame_start(H264Context *h){
2214     MpegEncContext * const s = &h->s;
2215     int i;
2216
2217     if(MPV_frame_start(s, s->avctx) < 0)
2218         return -1;
2219     ff_er_frame_start(s);
2220     /*
2221      * MPV_frame_start uses pict_type to derive key_frame.
2222      * This is incorrect for H.264; IDR markings must be used.
2223      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2224      * See decode_nal_units().
2225      */
2226     s->current_picture_ptr->key_frame= 0;
2227
2228     assert(s->linesize && s->uvlinesize);
2229
2230     for(i=0; i<16; i++){
2231         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2232         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2233     }
2234     for(i=0; i<4; i++){
2235         h->block_offset[16+i]=
2236         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2237         h->block_offset[24+16+i]=
2238         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2239     }
2240
2241     /* can't be in alloc_tables because linesize isn't known there.
2242      * FIXME: redo bipred weight to not require extra buffer? */
2243     for(i = 0; i < s->avctx->thread_count; i++)
2244         if(!h->thread_context[i]->s.obmc_scratchpad)
2245             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2246
2247     /* some macroblocks will be accessed before they're available */
2248     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2249         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2250
2251 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2252
2253     // We mark the current picture as non-reference after allocating it, so
2254     // that if we break out due to an error it can be released automatically
2255     // in the next MPV_frame_start().
2256     // SVQ3 as well as most other codecs have only last/next/current and thus
2257     // get released even with set reference, besides SVQ3 and others do not
2258     // mark frames as reference later "naturally".
2259     if(s->codec_id != CODEC_ID_SVQ3)
2260         s->current_picture_ptr->reference= 0;
2261
2262     s->current_picture_ptr->field_poc[0]=
2263     s->current_picture_ptr->field_poc[1]= INT_MAX;
2264     assert(s->current_picture_ptr->long_ref==0);
2265
2266     return 0;
2267 }
2268
2269 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2270     MpegEncContext * const s = &h->s;
2271     int i;
2272     int step    = 1;
2273     int offset  = 1;
2274     int uvoffset= 1;
2275     int top_idx = 1;
2276     int skiplast= 0;
2277
2278     src_y  -=   linesize;
2279     src_cb -= uvlinesize;
2280     src_cr -= uvlinesize;
2281
2282     if(!simple && FRAME_MBAFF){
2283         if(s->mb_y&1){
2284             offset  = MB_MBAFF ? 1 : 17;
2285             uvoffset= MB_MBAFF ? 1 : 9;
2286             if(!MB_MBAFF){
2287                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2288                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2289                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2290                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2291                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2292                 }
2293             }
2294         }else{
2295             if(!MB_MBAFF){
2296                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2297                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2298                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2299                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2300                 }
2301                 skiplast= 1;
2302             }
2303             offset  =
2304             uvoffset=
2305             top_idx = MB_MBAFF ? 0 : 1;
2306         }
2307         step= MB_MBAFF ? 2 : 1;
2308     }
2309
2310     // There are two lines saved, the line above the the top macroblock of a pair,
2311     // and the line above the bottom macroblock
2312     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2313     for(i=1; i<17 - skiplast; i++){
2314         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2315     }
2316
2317     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2318     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2319
2320     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2321         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2322         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2323         for(i=1; i<9 - skiplast; i++){
2324             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2325             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2326         }
2327         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2328         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2329     }
2330 }
2331
2332 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2333     MpegEncContext * const s = &h->s;
2334     int temp8, i;
2335     uint64_t temp64;
2336     int deblock_left;
2337     int deblock_top;
2338     int mb_xy;
2339     int step    = 1;
2340     int offset  = 1;
2341     int uvoffset= 1;
2342     int top_idx = 1;
2343
2344     if(!simple && FRAME_MBAFF){
2345         if(s->mb_y&1){
2346             offset  = MB_MBAFF ? 1 : 17;
2347             uvoffset= MB_MBAFF ? 1 : 9;
2348         }else{
2349             offset  =
2350             uvoffset=
2351             top_idx = MB_MBAFF ? 0 : 1;
2352         }
2353         step= MB_MBAFF ? 2 : 1;
2354     }
2355
2356     if(h->deblocking_filter == 2) {
2357         mb_xy = h->mb_xy;
2358         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2359         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2360     } else {
2361         deblock_left = (s->mb_x > 0);
2362         deblock_top =  (s->mb_y > !!MB_FIELD);
2363     }
2364
2365     src_y  -=   linesize + 1;
2366     src_cb -= uvlinesize + 1;
2367     src_cr -= uvlinesize + 1;
2368
2369 #define XCHG(a,b,t,xchg)\
2370 t= a;\
2371 if(xchg)\
2372     a= b;\
2373 b= t;
2374
2375     if(deblock_left){
2376         for(i = !deblock_top; i<16; i++){
2377             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2378         }
2379         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2380     }
2381
2382     if(deblock_top){
2383         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2384         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2385         if(s->mb_x+1 < s->mb_width){
2386             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2387         }
2388     }
2389
2390     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2391         if(deblock_left){
2392             for(i = !deblock_top; i<8; i++){
2393                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2394                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2395             }
2396             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2397             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2398         }
2399         if(deblock_top){
2400             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2401             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2402         }
2403     }
2404 }
2405
2406 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2407     MpegEncContext * const s = &h->s;
2408     const int mb_x= s->mb_x;
2409     const int mb_y= s->mb_y;
2410     const int mb_xy= h->mb_xy;
2411     const int mb_type= s->current_picture.mb_type[mb_xy];
2412     uint8_t  *dest_y, *dest_cb, *dest_cr;
2413     int linesize, uvlinesize /*dct_offset*/;
2414     int i;
2415     int *block_offset = &h->block_offset[0];
2416     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2417     /* is_h264 should always be true if SVQ3 is disabled. */
2418     const int is_h264 = !ENABLE_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2419     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2420     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2421
2422     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2423     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2424     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2425
2426     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2427     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2428
2429     if (!simple && MB_FIELD) {
2430         linesize   = h->mb_linesize   = s->linesize * 2;
2431         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2432         block_offset = &h->block_offset[24];
2433         if(mb_y&1){ //FIXME move out of this function?
2434             dest_y -= s->linesize*15;
2435             dest_cb-= s->uvlinesize*7;
2436             dest_cr-= s->uvlinesize*7;
2437         }
2438         if(FRAME_MBAFF) {
2439             int list;
2440             for(list=0; list<h->list_count; list++){
2441                 if(!USES_LIST(mb_type, list))
2442                     continue;
2443                 if(IS_16X16(mb_type)){
2444                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2445                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2446                 }else{
2447                     for(i=0; i<16; i+=4){
2448                         int ref = h->ref_cache[list][scan8[i]];
2449                         if(ref >= 0)
2450                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2451                     }
2452                 }
2453             }
2454         }
2455     } else {
2456         linesize   = h->mb_linesize   = s->linesize;
2457         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2458 //        dct_offset = s->linesize * 16;
2459     }
2460
2461     if (!simple && IS_INTRA_PCM(mb_type)) {
2462         for (i=0; i<16; i++) {
2463             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2464         }
2465         for (i=0; i<8; i++) {
2466             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2467             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2468         }
2469     } else {
2470         if(IS_INTRA(mb_type)){
2471             if(h->deblocking_filter)
2472                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2473
2474             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2475                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2476                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2477             }
2478
2479             if(IS_INTRA4x4(mb_type)){
2480                 if(simple || !s->encoding){
2481                     if(IS_8x8DCT(mb_type)){
2482                         if(transform_bypass){
2483                             idct_dc_add =
2484                             idct_add    = s->dsp.add_pixels8;
2485                         }else{
2486                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2487                             idct_add    = s->dsp.h264_idct8_add;
2488                         }
2489                         for(i=0; i<16; i+=4){
2490                             uint8_t * const ptr= dest_y + block_offset[i];
2491                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2492                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2493                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2494                             }else{
2495                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2496                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2497                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2498                                 if(nnz){
2499                                     if(nnz == 1 && h->mb[i*16])
2500                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2501                                     else
2502                                         idct_add   (ptr, h->mb + i*16, linesize);
2503                                 }
2504                             }
2505                         }
2506                     }else{
2507                         if(transform_bypass){
2508                             idct_dc_add =
2509                             idct_add    = s->dsp.add_pixels4;
2510                         }else{
2511                             idct_dc_add = s->dsp.h264_idct_dc_add;
2512                             idct_add    = s->dsp.h264_idct_add;
2513                         }
2514                         for(i=0; i<16; i++){
2515                             uint8_t * const ptr= dest_y + block_offset[i];
2516                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2517
2518                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2519                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2520                             }else{
2521                                 uint8_t *topright;
2522                                 int nnz, tr;
2523                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2524                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2525                                     assert(mb_y || linesize <= block_offset[i]);
2526                                     if(!topright_avail){
2527                                         tr= ptr[3 - linesize]*0x01010101;
2528                                         topright= (uint8_t*) &tr;
2529                                     }else
2530                                         topright= ptr + 4 - linesize;
2531                                 }else
2532                                     topright= NULL;
2533
2534                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2535                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2536                                 if(nnz){
2537                                     if(is_h264){
2538                                         if(nnz == 1 && h->mb[i*16])
2539                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2540                                         else
2541                                             idct_add   (ptr, h->mb + i*16, linesize);
2542                                     }else
2543                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2544                                 }
2545                             }
2546                         }
2547                     }
2548                 }
2549             }else{
2550                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2551                 if(is_h264){
2552                     if(!transform_bypass)
2553                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2554                 }else
2555                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2556             }
2557             if(h->deblocking_filter)
2558                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2559         }else if(is_h264){
2560             hl_motion(h, dest_y, dest_cb, dest_cr,
2561                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2562                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2563                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2564         }
2565
2566
2567         if(!IS_INTRA4x4(mb_type)){
2568             if(is_h264){
2569                 if(IS_INTRA16x16(mb_type)){
2570                     if(transform_bypass){
2571                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2572                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2573                         }else{
2574                             for(i=0; i<16; i++){
2575                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2576                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2577                             }
2578                         }
2579                     }else{
2580                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2581                     }
2582                 }else if(h->cbp&15){
2583                     if(transform_bypass){
2584                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2585                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2586                         for(i=0; i<16; i+=di){
2587                             if(h->non_zero_count_cache[ scan8[i] ]){
2588                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2589                             }
2590                         }
2591                     }else{
2592                         if(IS_8x8DCT(mb_type)){
2593                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2594                         }else{
2595                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2596                         }
2597                     }
2598                 }
2599             }else{
2600                 for(i=0; i<16; i++){
2601                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2602                         uint8_t * const ptr= dest_y + block_offset[i];
2603                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2604                     }
2605                 }
2606             }
2607         }
2608
2609         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2610             uint8_t *dest[2] = {dest_cb, dest_cr};
2611             if(transform_bypass){
2612                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2613                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2614                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2615                 }else{
2616                     idct_add = s->dsp.add_pixels4;
2617                     for(i=16; i<16+8; i++){
2618                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2619                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2620                     }
2621                 }
2622             }else{
2623                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2624                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2625                 if(is_h264){
2626                     idct_add = s->dsp.h264_idct_add;
2627                     idct_dc_add = s->dsp.h264_idct_dc_add;
2628                     for(i=16; i<16+8; i++){
2629                         if(h->non_zero_count_cache[ scan8[i] ])
2630                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2631                         else if(h->mb[i*16])
2632                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2633                     }
2634                 }else{
2635                     for(i=16; i<16+8; i++){
2636                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2637                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2638                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2639                         }
2640                     }
2641                 }
2642             }
2643         }
2644     }
2645     if(h->cbp || IS_INTRA(mb_type))
2646         s->dsp.clear_blocks(h->mb);
2647
2648     if(h->deblocking_filter) {
2649         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2650         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2651         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2652         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2653         if (!simple && FRAME_MBAFF) {
2654             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2655         } else {
2656             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2657         }
2658     }
2659 }
2660
2661 /**
2662  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2663  */
2664 static void hl_decode_mb_simple(H264Context *h){
2665     hl_decode_mb_internal(h, 1);
2666 }
2667
2668 /**
2669  * Process a macroblock; this handles edge cases, such as interlacing.
2670  */
2671 static void av_noinline hl_decode_mb_complex(H264Context *h){
2672     hl_decode_mb_internal(h, 0);
2673 }
2674
2675 static void hl_decode_mb(H264Context *h){
2676     MpegEncContext * const s = &h->s;
2677     const int mb_xy= h->mb_xy;
2678     const int mb_type= s->current_picture.mb_type[mb_xy];
2679     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2680
2681     if(ENABLE_H264_ENCODER && !s->decode)
2682         return;
2683
2684     if (is_complex)
2685         hl_decode_mb_complex(h);
2686     else hl_decode_mb_simple(h);
2687 }
2688
2689 static void pic_as_field(Picture *pic, const int parity){
2690     int i;
2691     for (i = 0; i < 4; ++i) {
2692         if (parity == PICT_BOTTOM_FIELD)
2693             pic->data[i] += pic->linesize[i];
2694         pic->reference = parity;
2695         pic->linesize[i] *= 2;
2696     }
2697     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2698 }
2699
2700 static int split_field_copy(Picture *dest, Picture *src,
2701                             int parity, int id_add){
2702     int match = !!(src->reference & parity);
2703
2704     if (match) {
2705         *dest = *src;
2706         if(parity != PICT_FRAME){
2707             pic_as_field(dest, parity);
2708             dest->pic_id *= 2;
2709             dest->pic_id += id_add;
2710         }
2711     }
2712
2713     return match;
2714 }
2715
2716 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2717     int i[2]={0};
2718     int index=0;
2719
2720     while(i[0]<len || i[1]<len){
2721         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2722             i[0]++;
2723         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2724             i[1]++;
2725         if(i[0] < len){
2726             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2727             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2728         }
2729         if(i[1] < len){
2730             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2731             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2732         }
2733     }
2734
2735     return index;
2736 }
2737
2738 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2739     int i, best_poc;
2740     int out_i= 0;
2741
2742     for(;;){
2743         best_poc= dir ? INT_MIN : INT_MAX;
2744
2745         for(i=0; i<len; i++){
2746             const int poc= src[i]->poc;
2747             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2748                 best_poc= poc;
2749                 sorted[out_i]= src[i];
2750             }
2751         }
2752         if(best_poc == (dir ? INT_MIN : INT_MAX))
2753             break;
2754         limit= sorted[out_i++]->poc - dir;
2755     }
2756     return out_i;
2757 }
2758
2759 /**
2760  * fills the default_ref_list.
2761  */
2762 static int fill_default_ref_list(H264Context *h){
2763     MpegEncContext * const s = &h->s;
2764     int i, len;
2765
2766     if(h->slice_type_nos==FF_B_TYPE){
2767         Picture *sorted[32];
2768         int cur_poc, list;
2769         int lens[2];
2770
2771         if(FIELD_PICTURE)
2772             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2773         else
2774             cur_poc= s->current_picture_ptr->poc;
2775
2776         for(list= 0; list<2; list++){
2777             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2778             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2779             assert(len<=32);
2780             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2781             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2782             assert(len<=32);
2783
2784             if(len < h->ref_count[list])
2785                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2786             lens[list]= len;
2787         }
2788
2789         if(lens[0] == lens[1] && lens[1] > 1){
2790             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2791             if(i == lens[0])
2792                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2793         }
2794     }else{
2795         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2796         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2797         assert(len <= 32);
2798         if(len < h->ref_count[0])
2799             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2800     }
2801 #ifdef TRACE
2802     for (i=0; i<h->ref_count[0]; i++) {
2803         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2804     }
2805     if(h->slice_type_nos==FF_B_TYPE){
2806         for (i=0; i<h->ref_count[1]; i++) {
2807             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2808         }
2809     }
2810 #endif
2811     return 0;
2812 }
2813
2814 static void print_short_term(H264Context *h);
2815 static void print_long_term(H264Context *h);
2816
2817 /**
2818  * Extract structure information about the picture described by pic_num in
2819  * the current decoding context (frame or field). Note that pic_num is
2820  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2821  * @param pic_num picture number for which to extract structure information
2822  * @param structure one of PICT_XXX describing structure of picture
2823  *                      with pic_num
2824  * @return frame number (short term) or long term index of picture
2825  *         described by pic_num
2826  */
2827 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2828     MpegEncContext * const s = &h->s;
2829
2830     *structure = s->picture_structure;
2831     if(FIELD_PICTURE){
2832         if (!(pic_num & 1))
2833             /* opposite field */
2834             *structure ^= PICT_FRAME;
2835         pic_num >>= 1;
2836     }
2837
2838     return pic_num;
2839 }
2840
2841 static int decode_ref_pic_list_reordering(H264Context *h){
2842     MpegEncContext * const s = &h->s;
2843     int list, index, pic_structure;
2844
2845     print_short_term(h);
2846     print_long_term(h);
2847
2848     for(list=0; list<h->list_count; list++){
2849         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2850
2851         if(get_bits1(&s->gb)){
2852             int pred= h->curr_pic_num;
2853
2854             for(index=0; ; index++){
2855                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2856                 unsigned int pic_id;
2857                 int i;
2858                 Picture *ref = NULL;
2859
2860                 if(reordering_of_pic_nums_idc==3)
2861                     break;
2862
2863                 if(index >= h->ref_count[list]){
2864                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2865                     return -1;
2866                 }
2867
2868                 if(reordering_of_pic_nums_idc<3){
2869                     if(reordering_of_pic_nums_idc<2){
2870                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2871                         int frame_num;
2872
2873                         if(abs_diff_pic_num > h->max_pic_num){
2874                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2875                             return -1;
2876                         }
2877
2878                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2879                         else                                pred+= abs_diff_pic_num;
2880                         pred &= h->max_pic_num - 1;
2881
2882                         frame_num = pic_num_extract(h, pred, &pic_structure);
2883
2884                         for(i= h->short_ref_count-1; i>=0; i--){
2885                             ref = h->short_ref[i];
2886                             assert(ref->reference);
2887                             assert(!ref->long_ref);
2888                             if(
2889                                    ref->frame_num == frame_num &&
2890                                    (ref->reference & pic_structure)
2891                               )
2892                                 break;
2893                         }
2894                         if(i>=0)
2895                             ref->pic_id= pred;
2896                     }else{
2897                         int long_idx;
2898                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2899
2900                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2901
2902                         if(long_idx>31){
2903                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2904                             return -1;
2905                         }
2906                         ref = h->long_ref[long_idx];
2907                         assert(!(ref && !ref->reference));
2908                         if(ref && (ref->reference & pic_structure)){
2909                             ref->pic_id= pic_id;
2910                             assert(ref->long_ref);
2911                             i=0;
2912                         }else{
2913                             i=-1;
2914                         }
2915                     }
2916
2917                     if (i < 0) {
2918                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2919                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2920                     } else {
2921                         for(i=index; i+1<h->ref_count[list]; i++){
2922                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2923                                 break;
2924                         }
2925                         for(; i > index; i--){
2926                             h->ref_list[list][i]= h->ref_list[list][i-1];
2927                         }
2928                         h->ref_list[list][index]= *ref;
2929                         if (FIELD_PICTURE){
2930                             pic_as_field(&h->ref_list[list][index], pic_structure);
2931                         }
2932                     }
2933                 }else{
2934                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2935                     return -1;
2936                 }
2937             }
2938         }
2939     }
2940     for(list=0; list<h->list_count; list++){
2941         for(index= 0; index < h->ref_count[list]; index++){
2942             if(!h->ref_list[list][index].data[0]){
2943                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2944                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2945             }
2946         }
2947     }
2948
2949     return 0;
2950 }
2951
2952 static void fill_mbaff_ref_list(H264Context *h){
2953     int list, i, j;
2954     for(list=0; list<2; list++){ //FIXME try list_count
2955         for(i=0; i<h->ref_count[list]; i++){
2956             Picture *frame = &h->ref_list[list][i];
2957             Picture *field = &h->ref_list[list][16+2*i];
2958             field[0] = *frame;
2959             for(j=0; j<3; j++)
2960                 field[0].linesize[j] <<= 1;
2961             field[0].reference = PICT_TOP_FIELD;
2962             field[0].poc= field[0].field_poc[0];
2963             field[1] = field[0];
2964             for(j=0; j<3; j++)
2965                 field[1].data[j] += frame->linesize[j];
2966             field[1].reference = PICT_BOTTOM_FIELD;
2967             field[1].poc= field[1].field_poc[1];
2968
2969             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2970             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2971             for(j=0; j<2; j++){
2972                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2973                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2974             }
2975         }
2976     }
2977     for(j=0; j<h->ref_count[1]; j++){
2978         for(i=0; i<h->ref_count[0]; i++)
2979             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2980         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2981         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2982     }
2983 }
2984
2985 static int pred_weight_table(H264Context *h){
2986     MpegEncContext * const s = &h->s;
2987     int list, i;
2988     int luma_def, chroma_def;
2989
2990     h->use_weight= 0;
2991     h->use_weight_chroma= 0;
2992     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2993     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2994     luma_def = 1<<h->luma_log2_weight_denom;
2995     chroma_def = 1<<h->chroma_log2_weight_denom;
2996
2997     for(list=0; list<2; list++){
2998         for(i=0; i<h->ref_count[list]; i++){
2999             int luma_weight_flag, chroma_weight_flag;
3000
3001             luma_weight_flag= get_bits1(&s->gb);
3002             if(luma_weight_flag){
3003                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3004                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3005                 if(   h->luma_weight[list][i] != luma_def
3006                    || h->luma_offset[list][i] != 0)
3007                     h->use_weight= 1;
3008             }else{
3009                 h->luma_weight[list][i]= luma_def;
3010                 h->luma_offset[list][i]= 0;
3011             }
3012
3013             if(CHROMA){
3014                 chroma_weight_flag= get_bits1(&s->gb);
3015                 if(chroma_weight_flag){
3016                     int j;
3017                     for(j=0; j<2; j++){
3018                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3019                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3020                         if(   h->chroma_weight[list][i][j] != chroma_def
3021                         || h->chroma_offset[list][i][j] != 0)
3022                             h->use_weight_chroma= 1;
3023                     }
3024                 }else{
3025                     int j;
3026                     for(j=0; j<2; j++){
3027                         h->chroma_weight[list][i][j]= chroma_def;
3028                         h->chroma_offset[list][i][j]= 0;
3029                     }
3030                 }
3031             }
3032         }
3033         if(h->slice_type_nos != FF_B_TYPE) break;
3034     }
3035     h->use_weight= h->use_weight || h->use_weight_chroma;
3036     return 0;
3037 }
3038
3039 static void implicit_weight_table(H264Context *h){
3040     MpegEncContext * const s = &h->s;
3041     int ref0, ref1;
3042     int cur_poc = s->current_picture_ptr->poc;
3043
3044     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3045        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3046         h->use_weight= 0;
3047         h->use_weight_chroma= 0;
3048         return;
3049     }
3050
3051     h->use_weight= 2;
3052     h->use_weight_chroma= 2;
3053     h->luma_log2_weight_denom= 5;
3054     h->chroma_log2_weight_denom= 5;
3055
3056     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3057         int poc0 = h->ref_list[0][ref0].poc;
3058         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3059             int poc1 = h->ref_list[1][ref1].poc;
3060             int td = av_clip(poc1 - poc0, -128, 127);
3061             if(td){
3062                 int tb = av_clip(cur_poc - poc0, -128, 127);
3063                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3064                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3065                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3066                     h->implicit_weight[ref0][ref1] = 32;
3067                 else
3068                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3069             }else
3070                 h->implicit_weight[ref0][ref1] = 32;
3071         }
3072     }
3073 }
3074
3075 /**
3076  * Mark a picture as no longer needed for reference. The refmask
3077  * argument allows unreferencing of individual fields or the whole frame.
3078  * If the picture becomes entirely unreferenced, but is being held for
3079  * display purposes, it is marked as such.
3080  * @param refmask mask of fields to unreference; the mask is bitwise
3081  *                anded with the reference marking of pic
3082  * @return non-zero if pic becomes entirely unreferenced (except possibly
3083  *         for display purposes) zero if one of the fields remains in
3084  *         reference
3085  */
3086 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3087     int i;
3088     if (pic->reference &= refmask) {
3089         return 0;
3090     } else {
3091         for(i = 0; h->delayed_pic[i]; i++)
3092             if(pic == h->delayed_pic[i]){
3093                 pic->reference=DELAYED_PIC_REF;
3094                 break;
3095             }
3096         return 1;
3097     }
3098 }
3099
3100 /**
3101  * instantaneous decoder refresh.
3102  */
3103 static void idr(H264Context *h){
3104     int i;
3105
3106     for(i=0; i<16; i++){
3107         remove_long(h, i, 0);
3108     }
3109     assert(h->long_ref_count==0);
3110
3111     for(i=0; i<h->short_ref_count; i++){
3112         unreference_pic(h, h->short_ref[i], 0);
3113         h->short_ref[i]= NULL;
3114     }
3115     h->short_ref_count=0;
3116     h->prev_frame_num= 0;
3117     h->prev_frame_num_offset= 0;
3118     h->prev_poc_msb=
3119     h->prev_poc_lsb= 0;
3120 }
3121
3122 /* forget old pics after a seek */
3123 static void flush_dpb(AVCodecContext *avctx){
3124     H264Context *h= avctx->priv_data;
3125     int i;
3126     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3127         if(h->delayed_pic[i])
3128             h->delayed_pic[i]->reference= 0;
3129         h->delayed_pic[i]= NULL;
3130     }
3131     h->outputed_poc= INT_MIN;
3132     idr(h);
3133     if(h->s.current_picture_ptr)
3134         h->s.current_picture_ptr->reference= 0;
3135     h->s.first_field= 0;
3136     ff_mpeg_flush(avctx);
3137 }
3138
3139 /**
3140  * Find a Picture in the short term reference list by frame number.
3141  * @param frame_num frame number to search for
3142  * @param idx the index into h->short_ref where returned picture is found
3143  *            undefined if no picture found.
3144  * @return pointer to the found picture, or NULL if no pic with the provided
3145  *                 frame number is found
3146  */
3147 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3148     MpegEncContext * const s = &h->s;
3149     int i;
3150
3151     for(i=0; i<h->short_ref_count; i++){
3152         Picture *pic= h->short_ref[i];
3153         if(s->avctx->debug&FF_DEBUG_MMCO)
3154             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3155         if(pic->frame_num == frame_num) {
3156             *idx = i;
3157             return pic;
3158         }
3159     }
3160     return NULL;
3161 }
3162
3163 /**
3164  * Remove a picture from the short term reference list by its index in
3165  * that list.  This does no checking on the provided index; it is assumed
3166  * to be valid. Other list entries are shifted down.
3167  * @param i index into h->short_ref of picture to remove.
3168  */
3169 static void remove_short_at_index(H264Context *h, int i){
3170     assert(i >= 0 && i < h->short_ref_count);
3171     h->short_ref[i]= NULL;
3172     if (--h->short_ref_count)
3173         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3174 }
3175
3176 /**
3177  *
3178  * @return the removed picture or NULL if an error occurs
3179  */
3180 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3181     MpegEncContext * const s = &h->s;
3182     Picture *pic;
3183     int i;
3184
3185     if(s->avctx->debug&FF_DEBUG_MMCO)
3186         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3187
3188     pic = find_short(h, frame_num, &i);
3189     if (pic){
3190         if(unreference_pic(h, pic, ref_mask))
3191         remove_short_at_index(h, i);
3192     }
3193
3194     return pic;
3195 }
3196
3197 /**
3198  * Remove a picture from the long term reference list by its index in
3199  * that list.
3200  * @return the removed picture or NULL if an error occurs
3201  */
3202 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3203     Picture *pic;
3204
3205     pic= h->long_ref[i];
3206     if (pic){
3207         if(unreference_pic(h, pic, ref_mask)){
3208             assert(h->long_ref[i]->long_ref == 1);
3209             h->long_ref[i]->long_ref= 0;
3210             h->long_ref[i]= NULL;
3211             h->long_ref_count--;
3212         }
3213     }
3214
3215     return pic;
3216 }
3217
3218 /**
3219  * print short term list
3220  */
3221 static void print_short_term(H264Context *h) {
3222     uint32_t i;
3223     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3224         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3225         for(i=0; i<h->short_ref_count; i++){
3226             Picture *pic= h->short_ref[i];
3227             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3228         }
3229     }
3230 }
3231
3232 /**
3233  * print long term list
3234  */
3235 static void print_long_term(H264Context *h) {
3236     uint32_t i;
3237     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3238         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3239         for(i = 0; i < 16; i++){
3240             Picture *pic= h->long_ref[i];
3241             if (pic) {
3242                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3243             }
3244         }
3245     }
3246 }
3247
3248 /**
3249  * Executes the reference picture marking (memory management control operations).
3250  */
3251 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3252     MpegEncContext * const s = &h->s;
3253     int i, j;
3254     int current_ref_assigned=0;
3255     Picture *pic;
3256
3257     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3258         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3259
3260     for(i=0; i<mmco_count; i++){
3261         int structure, frame_num;
3262         if(s->avctx->debug&FF_DEBUG_MMCO)
3263             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3264
3265         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3266            || mmco[i].opcode == MMCO_SHORT2LONG){
3267             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3268             pic = find_short(h, frame_num, &j);
3269             if(!pic){
3270                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3271                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3272                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3273                 continue;
3274             }
3275         }
3276
3277         switch(mmco[i].opcode){
3278         case MMCO_SHORT2UNUSED:
3279             if(s->avctx->debug&FF_DEBUG_MMCO)
3280                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3281             remove_short(h, frame_num, structure ^ PICT_FRAME);
3282             break;
3283         case MMCO_SHORT2LONG:
3284                 if (h->long_ref[mmco[i].long_arg] != pic)
3285                     remove_long(h, mmco[i].long_arg, 0);
3286
3287                 remove_short_at_index(h, j);
3288                 h->long_ref[ mmco[i].long_arg ]= pic;
3289                 if (h->long_ref[ mmco[i].long_arg ]){
3290                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3291                     h->long_ref_count++;
3292                 }
3293             break;
3294         case MMCO_LONG2UNUSED:
3295             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3296             pic = h->long_ref[j];
3297             if (pic) {
3298                 remove_long(h, j, structure ^ PICT_FRAME);
3299             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3300                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3301             break;
3302         case MMCO_LONG:
3303                     // Comment below left from previous code as it is an interresting note.
3304                     /* First field in pair is in short term list or
3305                      * at a different long term index.
3306                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3307                      * Report the problem and keep the pair where it is,
3308                      * and mark this field valid.
3309                      */
3310
3311             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3312                 remove_long(h, mmco[i].long_arg, 0);
3313
3314                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3315                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3316                 h->long_ref_count++;
3317             }
3318
3319             s->current_picture_ptr->reference |= s->picture_structure;
3320             current_ref_assigned=1;
3321             break;
3322         case MMCO_SET_MAX_LONG:
3323             assert(mmco[i].long_arg <= 16);
3324             // just remove the long term which index is greater than new max
3325             for(j = mmco[i].long_arg; j<16; j++){
3326                 remove_long(h, j, 0);
3327             }
3328             break;
3329         case MMCO_RESET:
3330             while(h->short_ref_count){
3331                 remove_short(h, h->short_ref[0]->frame_num, 0);
3332             }
3333             for(j = 0; j < 16; j++) {
3334                 remove_long(h, j, 0);
3335             }
3336             s->current_picture_ptr->poc=
3337             s->current_picture_ptr->field_poc[0]=
3338             s->current_picture_ptr->field_poc[1]=
3339             h->poc_lsb=
3340             h->poc_msb=
3341             h->frame_num=
3342             s->current_picture_ptr->frame_num= 0;
3343             break;
3344         default: assert(0);
3345         }
3346     }
3347
3348     if (!current_ref_assigned) {
3349         /* Second field of complementary field pair; the first field of
3350          * which is already referenced. If short referenced, it
3351          * should be first entry in short_ref. If not, it must exist
3352          * in long_ref; trying to put it on the short list here is an
3353          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3354          */
3355         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3356             /* Just mark the second field valid */
3357             s->current_picture_ptr->reference = PICT_FRAME;
3358         } else if (s->current_picture_ptr->long_ref) {
3359             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3360                                              "assignment for second field "
3361                                              "in complementary field pair "
3362                                              "(first field is long term)\n");
3363         } else {
3364             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3365             if(pic){
3366                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3367             }
3368
3369             if(h->short_ref_count)
3370                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3371
3372             h->short_ref[0]= s->current_picture_ptr;
3373             h->short_ref_count++;
3374             s->current_picture_ptr->reference |= s->picture_structure;
3375         }
3376     }
3377
3378     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3379
3380         /* We have too many reference frames, probably due to corrupted
3381          * stream. Need to discard one frame. Prevents overrun of the
3382          * short_ref and long_ref buffers.
3383          */
3384         av_log(h->s.avctx, AV_LOG_ERROR,
3385                "number of reference frames exceeds max (probably "
3386                "corrupt input), discarding one\n");
3387
3388         if (h->long_ref_count && !h->short_ref_count) {
3389             for (i = 0; i < 16; ++i)
3390                 if (h->long_ref[i])
3391                     break;
3392
3393             assert(i < 16);
3394             remove_long(h, i, 0);
3395         } else {
3396             pic = h->short_ref[h->short_ref_count - 1];
3397             remove_short(h, pic->frame_num, 0);
3398         }
3399     }
3400
3401     print_short_term(h);
3402     print_long_term(h);
3403     return 0;
3404 }
3405
3406 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3407     MpegEncContext * const s = &h->s;
3408     int i;
3409
3410     h->mmco_index= 0;
3411     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3412         s->broken_link= get_bits1(gb) -1;
3413         if(get_bits1(gb)){
3414             h->mmco[0].opcode= MMCO_LONG;
3415             h->mmco[0].long_arg= 0;
3416             h->mmco_index= 1;
3417         }
3418     }else{
3419         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3420             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3421                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3422
3423                 h->mmco[i].opcode= opcode;
3424                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3425                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3426 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3427                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3428                         return -1;
3429                     }*/
3430                 }
3431                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3432                     unsigned int long_arg= get_ue_golomb_31(gb);
3433                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3434                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3435                         return -1;
3436                     }
3437                     h->mmco[i].long_arg= long_arg;
3438                 }
3439
3440                 if(opcode > (unsigned)MMCO_LONG){
3441                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3442                     return -1;
3443                 }
3444                 if(opcode == MMCO_END)
3445                     break;
3446             }
3447             h->mmco_index= i;
3448         }else{
3449             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3450
3451             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3452                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3453                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3454                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3455                 h->mmco_index= 1;
3456                 if (FIELD_PICTURE) {
3457                     h->mmco[0].short_pic_num *= 2;
3458                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3459                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3460                     h->mmco_index= 2;
3461                 }
3462             }
3463         }
3464     }
3465
3466     return 0;
3467 }
3468
3469 static int init_poc(H264Context *h){
3470     MpegEncContext * const s = &h->s;
3471     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3472     int field_poc[2];
3473     Picture *cur = s->current_picture_ptr;
3474
3475     h->frame_num_offset= h->prev_frame_num_offset;
3476     if(h->frame_num < h->prev_frame_num)
3477         h->frame_num_offset += max_frame_num;
3478
3479     if(h->sps.poc_type==0){
3480         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3481
3482         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3483             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3484         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3485             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3486         else
3487             h->poc_msb = h->prev_poc_msb;
3488 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3489         field_poc[0] =
3490         field_poc[1] = h->poc_msb + h->poc_lsb;
3491         if(s->picture_structure == PICT_FRAME)
3492             field_poc[1] += h->delta_poc_bottom;
3493     }else if(h->sps.poc_type==1){
3494         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3495         int i;
3496
3497         if(h->sps.poc_cycle_length != 0)
3498             abs_frame_num = h->frame_num_offset + h->frame_num;
3499         else
3500             abs_frame_num = 0;
3501
3502         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3503             abs_frame_num--;
3504
3505         expected_delta_per_poc_cycle = 0;
3506         for(i=0; i < h->sps.poc_cycle_length; i++)
3507             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3508
3509         if(abs_frame_num > 0){
3510             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3511             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3512
3513             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3514             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3515                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3516         } else
3517             expectedpoc = 0;
3518
3519         if(h->nal_ref_idc == 0)
3520             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3521
3522         field_poc[0] = expectedpoc + h->delta_poc[0];
3523         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3524
3525         if(s->picture_structure == PICT_FRAME)
3526             field_poc[1] += h->delta_poc[1];
3527     }else{
3528         int poc= 2*(h->frame_num_offset + h->frame_num);
3529
3530         if(!h->nal_ref_idc)
3531             poc--;
3532
3533         field_poc[0]= poc;
3534         field_poc[1]= poc;
3535     }
3536
3537     if(s->picture_structure != PICT_BOTTOM_FIELD)
3538         s->current_picture_ptr->field_poc[0]= field_poc[0];
3539     if(s->picture_structure != PICT_TOP_FIELD)
3540         s->current_picture_ptr->field_poc[1]= field_poc[1];
3541     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3542
3543     return 0;
3544 }
3545
3546
3547 /**
3548  * initialize scan tables
3549  */
3550 static void init_scan_tables(H264Context *h){
3551     MpegEncContext * const s = &h->s;
3552     int i;
3553     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3554         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3555         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3556     }else{
3557         for(i=0; i<16; i++){
3558 #define T(x) (x>>2) | ((x<<2) & 0xF)
3559             h->zigzag_scan[i] = T(zigzag_scan[i]);
3560             h-> field_scan[i] = T( field_scan[i]);
3561 #undef T
3562         }
3563     }
3564     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3565         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3566         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3567         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3568         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3569     }else{
3570         for(i=0; i<64; i++){
3571 #define T(x) (x>>3) | ((x&7)<<3)
3572             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3573             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3574             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3575             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3576 #undef T
3577         }
3578     }
3579     if(h->sps.transform_bypass){ //FIXME same ugly
3580         h->zigzag_scan_q0          = zigzag_scan;
3581         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3582         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3583         h->field_scan_q0           = field_scan;
3584         h->field_scan8x8_q0        = field_scan8x8;
3585         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3586     }else{
3587         h->zigzag_scan_q0          = h->zigzag_scan;
3588         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3589         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3590         h->field_scan_q0           = h->field_scan;
3591         h->field_scan8x8_q0        = h->field_scan8x8;
3592         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3593     }
3594 }
3595
3596 /**
3597  * Replicates H264 "master" context to thread contexts.
3598  */
3599 static void clone_slice(H264Context *dst, H264Context *src)
3600 {
3601     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3602     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3603     dst->s.current_picture      = src->s.current_picture;
3604     dst->s.linesize             = src->s.linesize;
3605     dst->s.uvlinesize           = src->s.uvlinesize;
3606     dst->s.first_field          = src->s.first_field;
3607
3608     dst->prev_poc_msb           = src->prev_poc_msb;
3609     dst->prev_poc_lsb           = src->prev_poc_lsb;
3610     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3611     dst->prev_frame_num         = src->prev_frame_num;
3612     dst->short_ref_count        = src->short_ref_count;
3613
3614     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3615     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3616     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3617     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3618
3619     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3620     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3621 }
3622
3623 /**
3624  * decodes a slice header.
3625  * This will also call MPV_common_init() and frame_start() as needed.
3626  *
3627  * @param h h264context
3628  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3629  *
3630  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3631  */
3632 static int decode_slice_header(H264Context *h, H264Context *h0){
3633     MpegEncContext * const s = &h->s;
3634     MpegEncContext * const s0 = &h0->s;
3635     unsigned int first_mb_in_slice;
3636     unsigned int pps_id;
3637     int num_ref_idx_active_override_flag;
3638     unsigned int slice_type, tmp, i, j;
3639     int default_ref_list_done = 0;
3640     int last_pic_structure;
3641
3642     s->dropable= h->nal_ref_idc == 0;
3643
3644     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3645         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3646         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3647     }else{
3648         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3649         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3650     }
3651
3652     first_mb_in_slice= get_ue_golomb(&s->gb);
3653
3654     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3655         h0->current_slice = 0;
3656         if (!s0->first_field)
3657             s->current_picture_ptr= NULL;
3658     }
3659
3660     slice_type= get_ue_golomb_31(&s->gb);
3661     if(slice_type > 9){
3662         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3663         return -1;
3664     }
3665     if(slice_type > 4){
3666         slice_type -= 5;
3667         h->slice_type_fixed=1;
3668     }else
3669         h->slice_type_fixed=0;
3670
3671     slice_type= golomb_to_pict_type[ slice_type ];
3672     if (slice_type == FF_I_TYPE
3673         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3674         default_ref_list_done = 1;
3675     }
3676     h->slice_type= slice_type;
3677     h->slice_type_nos= slice_type & 3;
3678
3679     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3680     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3681         av_log(h->s.avctx, AV_LOG_ERROR,
3682                "B picture before any references, skipping\n");
3683         return -1;
3684     }
3685
3686     pps_id= get_ue_golomb(&s->gb);
3687     if(pps_id>=MAX_PPS_COUNT){
3688         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3689         return -1;
3690     }
3691     if(!h0->pps_buffers[pps_id]) {
3692         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3693         return -1;
3694     }
3695     h->pps= *h0->pps_buffers[pps_id];
3696
3697     if(!h0->sps_buffers[h->pps.sps_id]) {
3698         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3699         return -1;
3700     }
3701     h->sps = *h0->sps_buffers[h->pps.sps_id];
3702
3703     if(h == h0 && h->dequant_coeff_pps != pps_id){
3704         h->dequant_coeff_pps = pps_id;
3705         init_dequant_tables(h);
3706     }
3707
3708     s->mb_width= h->sps.mb_width;
3709     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3710
3711     h->b_stride=  s->mb_width*4;
3712     h->b8_stride= s->mb_width*2;
3713
3714     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3715     if(h->sps.frame_mbs_only_flag)
3716         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3717     else
3718         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3719
3720     if (s->context_initialized
3721         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3722         if(h != h0)
3723             return -1;   // width / height changed during parallelized decoding
3724         free_tables(h);
3725         flush_dpb(s->avctx);
3726         MPV_common_end(s);
3727     }
3728     if (!s->context_initialized) {
3729         if(h != h0)
3730             return -1;  // we cant (re-)initialize context during parallel decoding
3731         if (MPV_common_init(s) < 0)
3732             return -1;
3733         s->first_field = 0;
3734
3735         init_scan_tables(h);
3736         alloc_tables(h);
3737
3738         for(i = 1; i < s->avctx->thread_count; i++) {
3739             H264Context *c;
3740             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3741             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3742             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3743             c->sps = h->sps;
3744             c->pps = h->pps;
3745             init_scan_tables(c);
3746             clone_tables(c, h);
3747         }
3748
3749         for(i = 0; i < s->avctx->thread_count; i++)
3750             if(context_init(h->thread_context[i]) < 0)
3751                 return -1;
3752
3753         s->avctx->width = s->width;
3754         s->avctx->height = s->height;
3755         s->avctx->sample_aspect_ratio= h->sps.sar;
3756         if(!s->avctx->sample_aspect_ratio.den)
3757             s->avctx->sample_aspect_ratio.den = 1;
3758
3759         if(h->sps.timing_info_present_flag){
3760             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3761             if(h->x264_build > 0 && h->x264_build < 44)
3762                 s->avctx->time_base.den *= 2;
3763             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3764                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3765         }
3766     }
3767
3768     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3769
3770     h->mb_mbaff = 0;
3771     h->mb_aff_frame = 0;
3772     last_pic_structure = s0->picture_structure;
3773     if(h->sps.frame_mbs_only_flag){
3774         s->picture_structure= PICT_FRAME;
3775     }else{
3776         if(get_bits1(&s->gb)) { //field_pic_flag
3777             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3778         } else {
3779             s->picture_structure= PICT_FRAME;
3780             h->mb_aff_frame = h->sps.mb_aff;
3781         }
3782     }
3783     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3784
3785     if(h0->current_slice == 0){
3786         while(h->frame_num !=  h->prev_frame_num &&
3787               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3788             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3789             frame_start(h);
3790             h->prev_frame_num++;
3791             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3792             s->current_picture_ptr->frame_num= h->prev_frame_num;
3793             execute_ref_pic_marking(h, NULL, 0);
3794         }
3795
3796         /* See if we have a decoded first field looking for a pair... */
3797         if (s0->first_field) {
3798             assert(s0->current_picture_ptr);
3799             assert(s0->current_picture_ptr->data[0]);
3800             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3801
3802             /* figure out if we have a complementary field pair */
3803             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3804                 /*
3805                  * Previous field is unmatched. Don't display it, but let it
3806                  * remain for reference if marked as such.
3807                  */
3808                 s0->current_picture_ptr = NULL;
3809                 s0->first_field = FIELD_PICTURE;
3810
3811             } else {
3812                 if (h->nal_ref_idc &&
3813                         s0->current_picture_ptr->reference &&
3814                         s0->current_picture_ptr->frame_num != h->frame_num) {
3815                     /*
3816                      * This and previous field were reference, but had
3817                      * different frame_nums. Consider this field first in
3818                      * pair. Throw away previous field except for reference
3819                      * purposes.
3820                      */
3821                     s0->first_field = 1;
3822                     s0->current_picture_ptr = NULL;
3823
3824                 } else {
3825                     /* Second field in complementary pair */
3826                     s0->first_field = 0;
3827                 }
3828             }
3829
3830         } else {
3831             /* Frame or first field in a potentially complementary pair */
3832             assert(!s0->current_picture_ptr);
3833             s0->first_field = FIELD_PICTURE;
3834         }
3835
3836         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3837             s0->first_field = 0;
3838             return -1;
3839         }
3840     }
3841     if(h != h0)
3842         clone_slice(h, h0);
3843
3844     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3845
3846     assert(s->mb_num == s->mb_width * s->mb_height);
3847     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3848        first_mb_in_slice                    >= s->mb_num){
3849         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3850         return -1;
3851     }
3852     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3853     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3854     if (s->picture_structure == PICT_BOTTOM_FIELD)
3855         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3856     assert(s->mb_y < s->mb_height);
3857
3858     if(s->picture_structure==PICT_FRAME){
3859         h->curr_pic_num=   h->frame_num;
3860         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3861     }else{
3862         h->curr_pic_num= 2*h->frame_num + 1;
3863         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3864     }
3865
3866     if(h->nal_unit_type == NAL_IDR_SLICE){
3867         get_ue_golomb(&s->gb); /* idr_pic_id */
3868     }
3869
3870     if(h->sps.poc_type==0){
3871         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3872
3873         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3874             h->delta_poc_bottom= get_se_golomb(&s->gb);
3875         }
3876     }
3877
3878     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3879         h->delta_poc[0]= get_se_golomb(&s->gb);
3880
3881         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3882             h->delta_poc[1]= get_se_golomb(&s->gb);
3883     }
3884
3885     init_poc(h);
3886
3887     if(h->pps.redundant_pic_cnt_present){
3888         h->redundant_pic_count= get_ue_golomb(&s->gb);
3889     }
3890
3891     //set defaults, might be overridden a few lines later
3892     h->ref_count[0]= h->pps.ref_count[0];
3893     h->ref_count[1]= h->pps.ref_count[1];
3894
3895     if(h->slice_type_nos != FF_I_TYPE){
3896         if(h->slice_type_nos == FF_B_TYPE){
3897             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3898         }
3899         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3900
3901         if(num_ref_idx_active_override_flag){
3902             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3903             if(h->slice_type_nos==FF_B_TYPE)
3904                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3905
3906             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3907                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3908                 h->ref_count[0]= h->ref_count[1]= 1;
3909                 return -1;
3910             }
3911         }
3912         if(h->slice_type_nos == FF_B_TYPE)
3913             h->list_count= 2;
3914         else
3915             h->list_count= 1;
3916     }else
3917         h->list_count= 0;
3918
3919     if(!default_ref_list_done){
3920         fill_default_ref_list(h);
3921     }
3922
3923     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3924         return -1;
3925
3926     if(h->slice_type_nos!=FF_I_TYPE){
3927         s->last_picture_ptr= &h->ref_list[0][0];
3928         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3929     }
3930     if(h->slice_type_nos==FF_B_TYPE){
3931         s->next_picture_ptr= &h->ref_list[1][0];
3932         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3933     }
3934
3935     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3936        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3937         pred_weight_table(h);
3938     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3939         implicit_weight_table(h);
3940     else
3941         h->use_weight = 0;
3942
3943     if(h->nal_ref_idc)
3944         decode_ref_pic_marking(h0, &s->gb);
3945
3946     if(FRAME_MBAFF)
3947         fill_mbaff_ref_list(h);
3948
3949     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3950         direct_dist_scale_factor(h);
3951     direct_ref_list_init(h);
3952
3953     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3954         tmp = get_ue_golomb_31(&s->gb);
3955         if(tmp > 2){
3956             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3957             return -1;
3958         }
3959         h->cabac_init_idc= tmp;
3960     }
3961
3962     h->last_qscale_diff = 0;
3963     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3964     if(tmp>51){
3965         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3966         return -1;
3967     }
3968     s->qscale= tmp;
3969     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3970     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3971     //FIXME qscale / qp ... stuff
3972     if(h->slice_type == FF_SP_TYPE){
3973         get_bits1(&s->gb); /* sp_for_switch_flag */
3974     }
3975     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3976         get_se_golomb(&s->gb); /* slice_qs_delta */
3977     }
3978
3979     h->deblocking_filter = 1;
3980     h->slice_alpha_c0_offset = 0;
3981     h->slice_beta_offset = 0;
3982     if( h->pps.deblocking_filter_parameters_present ) {
3983         tmp= get_ue_golomb_31(&s->gb);
3984         if(tmp > 2){
3985             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3986             return -1;
3987         }
3988         h->deblocking_filter= tmp;
3989         if(h->deblocking_filter < 2)
3990             h->deblocking_filter^= 1; // 1<->0
3991
3992         if( h->deblocking_filter ) {
3993             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3994             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3995         }
3996     }
3997
3998     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3999        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4000        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4001        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4002         h->deblocking_filter= 0;
4003
4004     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4005         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4006             /* Cheat slightly for speed:
4007                Do not bother to deblock across slices. */
4008             h->deblocking_filter = 2;
4009         } else {
4010             h0->max_contexts = 1;
4011             if(!h0->single_decode_warning) {
4012                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4013                 h0->single_decode_warning = 1;
4014             }
4015             if(h != h0)
4016                 return 1; // deblocking switched inside frame
4017         }
4018     }
4019
4020 #if 0 //FMO
4021     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4022         slice_group_change_cycle= get_bits(&s->gb, ?);
4023 #endif
4024
4025     h0->last_slice_type = slice_type;
4026     h->slice_num = ++h0->current_slice;
4027     if(h->slice_num >= MAX_SLICES){
4028         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4029     }
4030
4031     for(j=0; j<2; j++){
4032         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4033         ref2frm[0]=
4034         ref2frm[1]= -1;
4035         for(i=0; i<16; i++)
4036             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4037                           +(h->ref_list[j][i].reference&3);
4038         ref2frm[18+0]=
4039         ref2frm[18+1]= -1;
4040         for(i=16; i<48; i++)
4041             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4042                           +(h->ref_list[j][i].reference&3);
4043     }
4044
4045     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4046     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4047
4048     s->avctx->refs= h->sps.ref_frame_count;
4049
4050     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4051         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4052                h->slice_num,
4053                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4054                first_mb_in_slice,
4055                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4056                pps_id, h->frame_num,
4057                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4058                h->ref_count[0], h->ref_count[1],
4059                s->qscale,
4060                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4061                h->use_weight,
4062                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4063                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4064                );
4065     }
4066
4067     return 0;
4068 }
4069
4070 /**
4071  *
4072  */
4073 static inline int get_level_prefix(GetBitContext *gb){
4074     unsigned int buf;
4075     int log;
4076
4077     OPEN_READER(re, gb);
4078     UPDATE_CACHE(re, gb);
4079     buf=GET_CACHE(re, gb);
4080
4081     log= 32 - av_log2(buf);
4082 #ifdef TRACE
4083     print_bin(buf>>(32-log), log);
4084     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4085 #endif
4086
4087     LAST_SKIP_BITS(re, gb, log);
4088     CLOSE_READER(re, gb);
4089
4090     return log-1;
4091 }
4092
4093 static inline int get_dct8x8_allowed(H264Context *h){
4094     if(h->sps.direct_8x8_inference_flag)
4095         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4096     else
4097         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4098 }
4099
4100 /**
4101  * decodes a residual block.
4102  * @param n block index
4103  * @param scantable scantable
4104  * @param max_coeff number of coefficients in the block
4105  * @return <0 if an error occurred
4106  */
4107 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4108     MpegEncContext * const s = &h->s;
4109     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4110     int level[16];
4111     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4112
4113     //FIXME put trailing_onex into the context
4114
4115     if(n == CHROMA_DC_BLOCK_INDEX){
4116         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4117         total_coeff= coeff_token>>2;
4118     }else{
4119         if(n == LUMA_DC_BLOCK_INDEX){
4120             total_coeff= pred_non_zero_count(h, 0);
4121             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4122             total_coeff= coeff_token>>2;
4123         }else{
4124             total_coeff= pred_non_zero_count(h, n);
4125             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4126             total_coeff= coeff_token>>2;
4127             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4128         }
4129     }
4130
4131     //FIXME set last_non_zero?
4132
4133     if(total_coeff==0)
4134         return 0;
4135     if(total_coeff > (unsigned)max_coeff) {
4136         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4137         return -1;
4138     }
4139
4140     trailing_ones= coeff_token&3;
4141     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4142     assert(total_coeff<=16);
4143
4144     i = show_bits(gb, 3);
4145     skip_bits(gb, trailing_ones);
4146     level[0] = 1-((i&4)>>1);
4147     level[1] = 1-((i&2)   );
4148     level[2] = 1-((i&1)<<1);
4149
4150     if(trailing_ones<total_coeff) {
4151         int mask, prefix;
4152         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4153         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4154         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4155
4156         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4157         if(level_code >= 100){
4158             prefix= level_code - 100;
4159             if(prefix == LEVEL_TAB_BITS)
4160                 prefix += get_level_prefix(gb);
4161
4162             //first coefficient has suffix_length equal to 0 or 1
4163             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4164                 if(suffix_length)
4165                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4166                 else
4167                     level_code= (prefix<<suffix_length); //part
4168             }else if(prefix==14){
4169                 if(suffix_length)
4170                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4171                 else
4172                     level_code= prefix + get_bits(gb, 4); //part
4173             }else{
4174                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4175                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4176                 if(prefix>=16)
4177                     level_code += (1<<(prefix-3))-4096;
4178             }
4179
4180             if(trailing_ones < 3) level_code += 2;
4181
4182             suffix_length = 2;
4183             mask= -(level_code&1);
4184             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4185         }else{
4186             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4187
4188             suffix_length = 1;
4189             if(level_code + 3U > 6U)
4190                 suffix_length++;
4191             level[trailing_ones]= level_code;
4192         }
4193
4194         //remaining coefficients have suffix_length > 0
4195         for(i=trailing_ones+1;i<total_coeff;i++) {
4196             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4197             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4198             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4199
4200             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4201             if(level_code >= 100){
4202                 prefix= level_code - 100;
4203                 if(prefix == LEVEL_TAB_BITS){
4204                     prefix += get_level_prefix(gb);
4205                 }
4206                 if(prefix<15){
4207                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4208                 }else{
4209                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4210                     if(prefix>=16)
4211                         level_code += (1<<(prefix-3))-4096;
4212                 }
4213                 mask= -(level_code&1);
4214                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4215             }
4216             level[i]= level_code;
4217
4218             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4219                 suffix_length++;
4220         }
4221     }
4222
4223     if(total_coeff == max_coeff)
4224         zeros_left=0;
4225     else{
4226         if(n == CHROMA_DC_BLOCK_INDEX)
4227             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4228         else
4229             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4230     }
4231
4232     coeff_num = zeros_left + total_coeff - 1;
4233     j = scantable[coeff_num];
4234     if(n > 24){
4235         block[j] = level[0];
4236         for(i=1;i<total_coeff;i++) {
4237             if(zeros_left <= 0)
4238                 run_before = 0;
4239             else if(zeros_left < 7){
4240                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4241             }else{
4242                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4243             }
4244             zeros_left -= run_before;
4245             coeff_num -= 1 + run_before;
4246             j= scantable[ coeff_num ];
4247
4248             block[j]= level[i];
4249         }
4250     }else{
4251         block[j] = (level[0] * qmul[j] + 32)>>6;
4252         for(i=1;i<total_coeff;i++) {
4253             if(zeros_left <= 0)
4254                 run_before = 0;
4255             else if(zeros_left < 7){
4256                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4257             }else{
4258                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4259             }
4260             zeros_left -= run_before;
4261             coeff_num -= 1 + run_before;
4262             j= scantable[ coeff_num ];
4263
4264             block[j]= (level[i] * qmul[j] + 32)>>6;
4265         }
4266     }
4267
4268     if(zeros_left<0){
4269         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4270         return -1;
4271     }
4272
4273     return 0;
4274 }
4275
4276 static void predict_field_decoding_flag(H264Context *h){
4277     MpegEncContext * const s = &h->s;
4278     const int mb_xy= h->mb_xy;
4279     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4280                 ? s->current_picture.mb_type[mb_xy-1]
4281                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4282                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4283                 : 0;
4284     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4285 }
4286
4287 /**
4288  * decodes a P_SKIP or B_SKIP macroblock
4289  */
4290 static void decode_mb_skip(H264Context *h){
4291     MpegEncContext * const s = &h->s;
4292     const int mb_xy= h->mb_xy;
4293     int mb_type=0;
4294
4295     memset(h->non_zero_count[mb_xy], 0, 16);
4296     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4297
4298     if(MB_FIELD)
4299         mb_type|= MB_TYPE_INTERLACED;
4300
4301     if( h->slice_type_nos == FF_B_TYPE )
4302     {
4303         // just for fill_caches. pred_direct_motion will set the real mb_type
4304         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4305
4306         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4307         pred_direct_motion(h, &mb_type);
4308         mb_type|= MB_TYPE_SKIP;
4309     }
4310     else
4311     {
4312         int mx, my;
4313         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4314
4315         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4316         pred_pskip_motion(h, &mx, &my);
4317         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4318         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4319     }
4320
4321     write_back_motion(h, mb_type);
4322     s->current_picture.mb_type[mb_xy]= mb_type;
4323     s->current_picture.qscale_table[mb_xy]= s->qscale;
4324     h->slice_table[ mb_xy ]= h->slice_num;
4325     h->prev_mb_skipped= 1;
4326 }
4327
4328 /**
4329  * decodes a macroblock
4330  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4331  */
4332 static int decode_mb_cavlc(H264Context *h){
4333     MpegEncContext * const s = &h->s;
4334     int mb_xy;
4335     int partition_count;
4336     unsigned int mb_type, cbp;
4337     int dct8x8_allowed= h->pps.transform_8x8_mode;
4338
4339     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4340
4341     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4342     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4343                 down the code */
4344     if(h->slice_type_nos != FF_I_TYPE){
4345         if(s->mb_skip_run==-1)
4346             s->mb_skip_run= get_ue_golomb(&s->gb);
4347
4348         if (s->mb_skip_run--) {
4349             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4350                 if(s->mb_skip_run==0)
4351                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4352                 else
4353                     predict_field_decoding_flag(h);
4354             }
4355             decode_mb_skip(h);
4356             return 0;
4357         }
4358     }
4359     if(FRAME_MBAFF){
4360         if( (s->mb_y&1) == 0 )
4361             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4362     }
4363
4364     h->prev_mb_skipped= 0;
4365
4366     mb_type= get_ue_golomb(&s->gb);
4367     if(h->slice_type_nos == FF_B_TYPE){
4368         if(mb_type < 23){
4369             partition_count= b_mb_type_info[mb_type].partition_count;
4370             mb_type=         b_mb_type_info[mb_type].type;
4371         }else{
4372             mb_type -= 23;
4373             goto decode_intra_mb;
4374         }
4375     }else if(h->slice_type_nos == FF_P_TYPE){
4376         if(mb_type < 5){
4377             partition_count= p_mb_type_info[mb_type].partition_count;
4378             mb_type=         p_mb_type_info[mb_type].type;
4379         }else{
4380             mb_type -= 5;
4381             goto decode_intra_mb;
4382         }
4383     }else{
4384        assert(h->slice_type_nos == FF_I_TYPE);
4385         if(h->slice_type == FF_SI_TYPE && mb_type)
4386             mb_type--;
4387 decode_intra_mb:
4388         if(mb_type > 25){
4389             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4390             return -1;
4391         }
4392         partition_count=0;
4393         cbp= i_mb_type_info[mb_type].cbp;
4394         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4395         mb_type= i_mb_type_info[mb_type].type;
4396     }
4397
4398     if(MB_FIELD)
4399         mb_type |= MB_TYPE_INTERLACED;
4400
4401     h->slice_table[ mb_xy ]= h->slice_num;
4402
4403     if(IS_INTRA_PCM(mb_type)){
4404         unsigned int x;
4405
4406         // We assume these blocks are very rare so we do not optimize it.
4407         align_get_bits(&s->gb);
4408
4409         // The pixels are stored in the same order as levels in h->mb array.
4410         for(x=0; x < (CHROMA ? 384 : 256); x++){
4411             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4412         }
4413
4414         // In deblocking, the quantizer is 0
4415         s->current_picture.qscale_table[mb_xy]= 0;
4416         // All coeffs are present
4417         memset(h->non_zero_count[mb_xy], 16, 16);
4418
4419         s->current_picture.mb_type[mb_xy]= mb_type;
4420         return 0;
4421     }
4422
4423     if(MB_MBAFF){
4424         h->ref_count[0] <<= 1;
4425         h->ref_count[1] <<= 1;
4426     }
4427
4428     fill_caches(h, mb_type, 0);
4429
4430     //mb_pred
4431     if(IS_INTRA(mb_type)){
4432         int pred_mode;
4433 //            init_top_left_availability(h);
4434         if(IS_INTRA4x4(mb_type)){
4435             int i;
4436             int di = 1;
4437             if(dct8x8_allowed && get_bits1(&s->gb)){
4438                 mb_type |= MB_TYPE_8x8DCT;
4439                 di = 4;
4440             }
4441
4442 //                fill_intra4x4_pred_table(h);
4443             for(i=0; i<16; i+=di){
4444                 int mode= pred_intra_mode(h, i);
4445
4446                 if(!get_bits1(&s->gb)){
4447                     const int rem_mode= get_bits(&s->gb, 3);
4448                     mode = rem_mode + (rem_mode >= mode);
4449                 }
4450
4451                 if(di==4)
4452                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4453                 else
4454                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4455             }
4456             write_back_intra_pred_mode(h);
4457             if( check_intra4x4_pred_mode(h) < 0)
4458                 return -1;
4459         }else{
4460             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4461             if(h->intra16x16_pred_mode < 0)
4462                 return -1;
4463         }
4464         if(CHROMA){
4465             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4466             if(pred_mode < 0)
4467                 return -1;
4468             h->chroma_pred_mode= pred_mode;
4469         }
4470     }else if(partition_count==4){
4471         int i, j, sub_partition_count[4], list, ref[2][4];
4472
4473         if(h->slice_type_nos == FF_B_TYPE){
4474             for(i=0; i<4; i++){
4475                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4476                 if(h->sub_mb_type[i] >=13){
4477                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4478                     return -1;
4479                 }
4480                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4481                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4482             }
4483             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4484                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4485                 pred_direct_motion(h, &mb_type);
4486                 h->ref_cache[0][scan8[4]] =
4487                 h->ref_cache[1][scan8[4]] =
4488                 h->ref_cache[0][scan8[12]] =
4489                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4490             }
4491         }else{
4492             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4493             for(i=0; i<4; i++){
4494                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4495                 if(h->sub_mb_type[i] >=4){
4496                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4497                     return -1;
4498                 }
4499                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4500                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4501             }
4502         }
4503
4504         for(list=0; list<h->list_count; list++){
4505             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4506             for(i=0; i<4; i++){
4507                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4508                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4509                     unsigned int tmp;
4510                     if(ref_count == 1){
4511                         tmp= 0;
4512                     }else if(ref_count == 2){
4513                         tmp= get_bits1(&s->gb)^1;
4514                     }else{
4515                         tmp= get_ue_golomb_31(&s->gb);
4516                         if(tmp>=ref_count){
4517                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4518                             return -1;
4519                         }
4520                     }
4521                     ref[list][i]= tmp;
4522                 }else{
4523                  //FIXME
4524                     ref[list][i] = -1;
4525                 }
4526             }
4527         }
4528
4529         if(dct8x8_allowed)
4530             dct8x8_allowed = get_dct8x8_allowed(h);
4531
4532         for(list=0; list<h->list_count; list++){
4533             for(i=0; i<4; i++){
4534                 if(IS_DIRECT(h->sub_mb_type[i])) {
4535                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4536                     continue;
4537                 }
4538                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4539                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4540
4541                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4542                     const int sub_mb_type= h->sub_mb_type[i];
4543                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4544                     for(j=0; j<sub_partition_count[i]; j++){
4545                         int mx, my;
4546                         const int index= 4*i + block_width*j;
4547                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4548                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4549                         mx += get_se_golomb(&s->gb);
4550                         my += get_se_golomb(&s->gb);
4551                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4552
4553                         if(IS_SUB_8X8(sub_mb_type)){
4554                             mv_cache[ 1 ][0]=
4555                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4556                             mv_cache[ 1 ][1]=
4557                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4558                         }else if(IS_SUB_8X4(sub_mb_type)){
4559                             mv_cache[ 1 ][0]= mx;
4560                             mv_cache[ 1 ][1]= my;
4561                         }else if(IS_SUB_4X8(sub_mb_type)){
4562                             mv_cache[ 8 ][0]= mx;
4563                             mv_cache[ 8 ][1]= my;
4564                         }
4565                         mv_cache[ 0 ][0]= mx;
4566                         mv_cache[ 0 ][1]= my;
4567                     }
4568                 }else{
4569                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4570                     p[0] = p[1]=
4571                     p[8] = p[9]= 0;
4572                 }
4573             }
4574         }
4575     }else if(IS_DIRECT(mb_type)){
4576         pred_direct_motion(h, &mb_type);
4577         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4578     }else{
4579         int list, mx, my, i;
4580          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4581         if(IS_16X16(mb_type)){
4582             for(list=0; list<h->list_count; list++){
4583                     unsigned int val;
4584                     if(IS_DIR(mb_type, 0, list)){
4585                         if(h->ref_count[list]==1){
4586                             val= 0;
4587                         }else if(h->ref_count[list]==2){
4588                             val= get_bits1(&s->gb)^1;
4589                         }else{
4590                             val= get_ue_golomb_31(&s->gb);
4591                             if(val >= h->ref_count[list]){
4592                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4593                                 return -1;
4594                             }
4595                         }
4596                     }else
4597                         val= LIST_NOT_USED&0xFF;
4598                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4599             }
4600             for(list=0; list<h->list_count; list++){
4601                 unsigned int val;
4602                 if(IS_DIR(mb_type, 0, list)){
4603                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4604                     mx += get_se_golomb(&s->gb);
4605                     my += get_se_golomb(&s->gb);
4606                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4607
4608                     val= pack16to32(mx,my);
4609                 }else
4610                     val=0;
4611                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4612             }
4613         }
4614         else if(IS_16X8(mb_type)){
4615             for(list=0; list<h->list_count; list++){
4616                     for(i=0; i<2; i++){
4617                         unsigned int val;
4618                         if(IS_DIR(mb_type, i, list)){
4619                             if(h->ref_count[list] == 1){
4620                                 val= 0;
4621                             }else if(h->ref_count[list] == 2){
4622                                 val= get_bits1(&s->gb)^1;
4623                             }else{
4624                                 val= get_ue_golomb_31(&s->gb);
4625                                 if(val >= h->ref_count[list]){
4626                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4627                                     return -1;
4628                                 }
4629                             }
4630                         }else
4631                             val= LIST_NOT_USED&0xFF;
4632                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4633                     }
4634             }
4635             for(list=0; list<h->list_count; list++){
4636                 for(i=0; i<2; i++){
4637                     unsigned int val;
4638                     if(IS_DIR(mb_type, i, list)){
4639                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4640                         mx += get_se_golomb(&s->gb);
4641                         my += get_se_golomb(&s->gb);
4642                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4643
4644                         val= pack16to32(mx,my);
4645                     }else
4646                         val=0;
4647                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4648                 }
4649             }
4650         }else{
4651             assert(IS_8X16(mb_type));
4652             for(list=0; list<h->list_count; list++){
4653                     for(i=0; i<2; i++){
4654                         unsigned int val;
4655                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4656                             if(h->ref_count[list]==1){
4657                                 val= 0;
4658                             }else if(h->ref_count[list]==2){
4659                                 val= get_bits1(&s->gb)^1;
4660                             }else{
4661                                 val= get_ue_golomb_31(&s->gb);
4662                                 if(val >= h->ref_count[list]){
4663                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4664                                     return -1;
4665                                 }
4666                             }
4667                         }else
4668                             val= LIST_NOT_USED&0xFF;
4669                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4670                     }
4671             }
4672             for(list=0; list<h->list_count; list++){
4673                 for(i=0; i<2; i++){
4674                     unsigned int val;
4675                     if(IS_DIR(mb_type, i, list)){
4676                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4677                         mx += get_se_golomb(&s->gb);
4678                         my += get_se_golomb(&s->gb);
4679                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4680
4681                         val= pack16to32(mx,my);
4682                     }else
4683                         val=0;
4684                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4685                 }
4686             }
4687         }
4688     }
4689
4690     if(IS_INTER(mb_type))
4691         write_back_motion(h, mb_type);
4692
4693     if(!IS_INTRA16x16(mb_type)){
4694         cbp= get_ue_golomb(&s->gb);
4695         if(cbp > 47){
4696             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4697             return -1;
4698         }
4699
4700         if(CHROMA){
4701             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4702             else                     cbp= golomb_to_inter_cbp   [cbp];
4703         }else{
4704             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4705             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4706         }
4707     }
4708     h->cbp = cbp;
4709
4710     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4711         if(get_bits1(&s->gb)){
4712             mb_type |= MB_TYPE_8x8DCT;
4713             h->cbp_table[mb_xy]= cbp;
4714         }
4715     }
4716     s->current_picture.mb_type[mb_xy]= mb_type;
4717
4718     if(cbp || IS_INTRA16x16(mb_type)){
4719         int i8x8, i4x4, chroma_idx;
4720         int dquant;
4721         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4722         const uint8_t *scan, *scan8x8, *dc_scan;
4723
4724 //        fill_non_zero_count_cache(h);
4725
4726         if(IS_INTERLACED(mb_type)){
4727             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4728             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4729             dc_scan= luma_dc_field_scan;
4730         }else{
4731             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4732             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4733             dc_scan= luma_dc_zigzag_scan;
4734         }
4735
4736         dquant= get_se_golomb(&s->gb);
4737
4738         if( dquant > 25 || dquant < -26 ){
4739             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4740             return -1;
4741         }
4742
4743         s->qscale += dquant;
4744         if(((unsigned)s->qscale) > 51){
4745             if(s->qscale<0) s->qscale+= 52;
4746             else            s->qscale-= 52;
4747         }
4748
4749         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4750         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4751         if(IS_INTRA16x16(mb_type)){
4752             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4753                 return -1; //FIXME continue if partitioned and other return -1 too
4754             }
4755
4756             assert((cbp&15) == 0 || (cbp&15) == 15);
4757
4758             if(cbp&15){
4759                 for(i8x8=0; i8x8<4; i8x8++){
4760                     for(i4x4=0; i4x4<4; i4x4++){
4761                         const int index= i4x4 + 4*i8x8;
4762                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4763                             return -1;
4764                         }
4765                     }
4766                 }
4767             }else{
4768                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4769             }
4770         }else{
4771             for(i8x8=0; i8x8<4; i8x8++){
4772                 if(cbp & (1<<i8x8)){
4773                     if(IS_8x8DCT(mb_type)){
4774                         DCTELEM *buf = &h->mb[64*i8x8];
4775                         uint8_t *nnz;
4776                         for(i4x4=0; i4x4<4; i4x4++){
4777                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4778                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4779                                 return -1;
4780                         }
4781                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4782                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4783                     }else{
4784                         for(i4x4=0; i4x4<4; i4x4++){
4785                             const int index= i4x4 + 4*i8x8;
4786
4787                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4788                                 return -1;
4789                             }
4790                         }
4791                     }
4792                 }else{
4793                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4794                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4795                 }
4796             }
4797         }
4798
4799         if(cbp&0x30){
4800             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4801                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4802                     return -1;
4803                 }
4804         }
4805
4806         if(cbp&0x20){
4807             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4808                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4809                 for(i4x4=0; i4x4<4; i4x4++){
4810                     const int index= 16 + 4*chroma_idx + i4x4;
4811                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4812                         return -1;
4813                     }
4814                 }
4815             }
4816         }else{
4817             uint8_t * const nnz= &h->non_zero_count_cache[0];
4818             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4819             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4820         }
4821     }else{
4822         uint8_t * const nnz= &h->non_zero_count_cache[0];
4823         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4824         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4825         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4826     }
4827     s->current_picture.qscale_table[mb_xy]= s->qscale;
4828     write_back_non_zero_count(h);
4829
4830     if(MB_MBAFF){
4831         h->ref_count[0] >>= 1;
4832         h->ref_count[1] >>= 1;
4833     }
4834
4835     return 0;
4836 }
4837
4838 static int decode_cabac_field_decoding_flag(H264Context *h) {
4839     MpegEncContext * const s = &h->s;
4840     const int mb_x = s->mb_x;
4841     const int mb_y = s->mb_y & ~1;
4842     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4843     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4844
4845     unsigned int ctx = 0;
4846
4847     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4848         ctx += 1;
4849     }
4850     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4851         ctx += 1;
4852     }
4853
4854     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4855 }
4856
4857 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4858     uint8_t *state= &h->cabac_state[ctx_base];
4859     int mb_type;
4860
4861     if(intra_slice){
4862         MpegEncContext * const s = &h->s;
4863         const int mba_xy = h->left_mb_xy[0];
4864         const int mbb_xy = h->top_mb_xy;
4865         int ctx=0;
4866         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4867             ctx++;
4868         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4869             ctx++;
4870         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4871             return 0;   /* I4x4 */
4872         state += 2;
4873     }else{
4874         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4875             return 0;   /* I4x4 */
4876     }
4877
4878     if( get_cabac_terminate( &h->cabac ) )
4879         return 25;  /* PCM */
4880
4881     mb_type = 1; /* I16x16 */
4882     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4883     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4884         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4885     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4886     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4887     return mb_type;
4888 }
4889
4890 static int decode_cabac_mb_type_b( H264Context *h ) {
4891     MpegEncContext * const s = &h->s;
4892
4893         const int mba_xy = h->left_mb_xy[0];
4894         const int mbb_xy = h->top_mb_xy;
4895         int ctx = 0;
4896         int bits;
4897         assert(h->slice_type_nos == FF_B_TYPE);
4898
4899         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4900             ctx++;
4901         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4902             ctx++;
4903
4904         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4905             return 0; /* B_Direct_16x16 */
4906
4907         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4908             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4909         }
4910
4911         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4912         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4913         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4914         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4915         if( bits < 8 )
4916             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4917         else if( bits == 13 ) {
4918             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4919         } else if( bits == 14 )
4920             return 11; /* B_L1_L0_8x16 */
4921         else if( bits == 15 )
4922             return 22; /* B_8x8 */
4923
4924         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4925         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4926 }
4927
4928 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4929     MpegEncContext * const s = &h->s;
4930     int mba_xy, mbb_xy;
4931     int ctx = 0;
4932
4933     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4934         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4935         mba_xy = mb_xy - 1;
4936         if( (mb_y&1)
4937             && h->slice_table[mba_xy] == h->slice_num
4938             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4939             mba_xy += s->mb_stride;
4940         if( MB_FIELD ){
4941             mbb_xy = mb_xy - s->mb_stride;
4942             if( !(mb_y&1)
4943                 && h->slice_table[mbb_xy] == h->slice_num
4944                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4945                 mbb_xy -= s->mb_stride;
4946         }else
4947             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4948     }else{
4949         int mb_xy = h->mb_xy;
4950         mba_xy = mb_xy - 1;
4951         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4952     }
4953
4954     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4955         ctx++;
4956     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4957         ctx++;
4958
4959     if( h->slice_type_nos == FF_B_TYPE )
4960         ctx += 13;
4961     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4962 }
4963
4964 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4965     int mode = 0;
4966
4967     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4968         return pred_mode;
4969
4970     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4971     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4972     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4973
4974     if( mode >= pred_mode )
4975         return mode + 1;
4976     else
4977         return mode;
4978 }
4979
4980 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4981     const int mba_xy = h->left_mb_xy[0];
4982     const int mbb_xy = h->top_mb_xy;
4983
4984     int ctx = 0;
4985
4986     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4987     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4988         ctx++;
4989
4990     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4991         ctx++;
4992
4993     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4994         return 0;
4995
4996     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4997         return 1;
4998     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4999         return 2;
5000     else
5001         return 3;
5002 }
5003
5004 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5005     int cbp_b, cbp_a, ctx, cbp = 0;
5006
5007     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5008     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5009
5010     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5011     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5012     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5013     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5014     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5015     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5016     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5017     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5018     return cbp;
5019 }
5020 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5021     int ctx;
5022     int cbp_a, cbp_b;
5023
5024     cbp_a = (h->left_cbp>>4)&0x03;
5025     cbp_b = (h-> top_cbp>>4)&0x03;
5026
5027     ctx = 0;
5028     if( cbp_a > 0 ) ctx++;
5029     if( cbp_b > 0 ) ctx += 2;
5030     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5031         return 0;
5032
5033     ctx = 4;
5034     if( cbp_a == 2 ) ctx++;
5035     if( cbp_b == 2 ) ctx += 2;
5036     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5037 }
5038 static int decode_cabac_mb_dqp( H264Context *h) {
5039     int   ctx= h->last_qscale_diff != 0;
5040     int   val = 0;
5041
5042     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5043         ctx= 2+(ctx>>1);
5044         val++;
5045         if(val > 102) //prevent infinite loop
5046             return INT_MIN;
5047     }
5048
5049     if( val&0x01 )
5050         return   (val + 1)>>1 ;
5051     else
5052         return -((val + 1)>>1);
5053 }
5054 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5055     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5056         return 0;   /* 8x8 */
5057     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5058         return 1;   /* 8x4 */
5059     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5060         return 2;   /* 4x8 */
5061     return 3;       /* 4x4 */
5062 }
5063 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5064     int type;
5065     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5066         return 0;   /* B_Direct_8x8 */
5067     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5068         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5069     type = 3;
5070     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5071         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5072             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5073         type += 4;
5074     }
5075     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5076     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5077     return type;
5078 }
5079
5080 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5081     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5082 }
5083
5084 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5085     int refa = h->ref_cache[list][scan8[n] - 1];
5086     int refb = h->ref_cache[list][scan8[n] - 8];
5087     int ref  = 0;
5088     int ctx  = 0;
5089
5090     if( h->slice_type_nos == FF_B_TYPE) {
5091         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5092             ctx++;
5093         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5094             ctx += 2;
5095     } else {
5096         if( refa > 0 )
5097             ctx++;
5098         if( refb > 0 )
5099             ctx += 2;
5100     }
5101
5102     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5103         ref++;
5104         ctx = (ctx>>2)+4;
5105         if(ref >= 32 /*h->ref_list[list]*/){
5106             return -1;
5107         }
5108     }
5109     return ref;
5110 }
5111
5112 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5113     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5114                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5115     int ctxbase = (l == 0) ? 40 : 47;
5116     int mvd;
5117     int ctx = (amvd>2) + (amvd>32);
5118
5119     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5120         return 0;
5121
5122     mvd= 1;
5123     ctx= 3;
5124     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5125         mvd++;
5126         if( ctx < 6 )
5127             ctx++;
5128     }
5129
5130     if( mvd >= 9 ) {
5131         int k = 3;
5132         while( get_cabac_bypass( &h->cabac ) ) {
5133             mvd += 1 << k;
5134             k++;
5135             if(k>24){
5136                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5137                 return INT_MIN;
5138             }
5139         }
5140         while( k-- ) {
5141             if( get_cabac_bypass( &h->cabac ) )
5142                 mvd += 1 << k;
5143         }
5144     }
5145     return get_cabac_bypass_sign( &h->cabac, -mvd );
5146 }
5147
5148 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5149     int nza, nzb;
5150     int ctx = 0;
5151
5152     if( is_dc ) {
5153         if( cat == 0 ) {
5154             nza = h->left_cbp&0x100;
5155             nzb = h-> top_cbp&0x100;
5156         } else {
5157             nza = (h->left_cbp>>(6+idx))&0x01;
5158             nzb = (h-> top_cbp>>(6+idx))&0x01;
5159         }
5160     } else {
5161         assert(cat == 1 || cat == 2 || cat == 4);
5162         nza = h->non_zero_count_cache[scan8[idx] - 1];
5163         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5164     }
5165
5166     if( nza > 0 )
5167         ctx++;
5168
5169     if( nzb > 0 )
5170         ctx += 2;
5171
5172     return ctx + 4 * cat;
5173 }
5174
5175 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5176     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5177     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5178     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5179     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5180 };
5181
5182 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5183     static const int significant_coeff_flag_offset[2][6] = {
5184       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5185       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5186     };
5187     static const int last_coeff_flag_offset[2][6] = {
5188       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5189       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5190     };
5191     static const int coeff_abs_level_m1_offset[6] = {
5192         227+0, 227+10, 227+20, 227+30, 227+39, 426
5193     };
5194     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5195       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5196         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5197         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5198        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5199       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5200         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5201         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5202         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5203     };
5204     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5205      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5206      * map node ctx => cabac ctx for level=1 */
5207     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5208     /* map node ctx => cabac ctx for level>1 */
5209     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5210     static const uint8_t coeff_abs_level_transition[2][8] = {
5211     /* update node ctx after decoding a level=1 */
5212         { 1, 2, 3, 3, 4, 5, 6, 7 },
5213     /* update node ctx after decoding a level>1 */
5214         { 4, 4, 4, 4, 5, 6, 7, 7 }
5215     };
5216
5217     int index[64];
5218
5219     int av_unused last;
5220     int coeff_count = 0;
5221     int node_ctx = 0;
5222
5223     uint8_t *significant_coeff_ctx_base;
5224     uint8_t *last_coeff_ctx_base;
5225     uint8_t *abs_level_m1_ctx_base;
5226
5227 #ifndef ARCH_X86
5228 #define CABAC_ON_STACK
5229 #endif
5230 #ifdef CABAC_ON_STACK
5231 #define CC &cc
5232     CABACContext cc;
5233     cc.range     = h->cabac.range;
5234     cc.low       = h->cabac.low;
5235     cc.bytestream= h->cabac.bytestream;
5236 #else
5237 #define CC &h->cabac
5238 #endif
5239
5240
5241     /* cat: 0-> DC 16x16  n = 0
5242      *      1-> AC 16x16  n = luma4x4idx
5243      *      2-> Luma4x4   n = luma4x4idx
5244      *      3-> DC Chroma n = iCbCr
5245      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5246      *      5-> Luma8x8   n = 4 * luma8x8idx
5247      */
5248
5249     /* read coded block flag */
5250     if( is_dc || cat != 5 ) {
5251         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5252             if( !is_dc )
5253                 h->non_zero_count_cache[scan8[n]] = 0;
5254
5255 #ifdef CABAC_ON_STACK
5256             h->cabac.range     = cc.range     ;
5257             h->cabac.low       = cc.low       ;
5258             h->cabac.bytestream= cc.bytestream;
5259 #endif
5260             return;
5261         }
5262     }
5263
5264     significant_coeff_ctx_base = h->cabac_state
5265         + significant_coeff_flag_offset[MB_FIELD][cat];
5266     last_coeff_ctx_base = h->cabac_state
5267         + last_coeff_flag_offset[MB_FIELD][cat];
5268     abs_level_m1_ctx_base = h->cabac_state
5269         + coeff_abs_level_m1_offset[cat];
5270
5271     if( !is_dc && cat == 5 ) {
5272 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5273         for(last= 0; last < coefs; last++) { \
5274             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5275             if( get_cabac( CC, sig_ctx )) { \
5276                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5277                 index[coeff_count++] = last; \
5278                 if( get_cabac( CC, last_ctx ) ) { \
5279                     last= max_coeff; \
5280                     break; \
5281                 } \
5282             } \
5283         }\
5284         if( last == max_coeff -1 ) {\
5285             index[coeff_count++] = last;\
5286         }
5287         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5288 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5289         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5290     } else {
5291         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5292 #else
5293         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5294     } else {
5295         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5296 #endif
5297     }
5298     assert(coeff_count > 0);
5299
5300     if( is_dc ) {
5301         if( cat == 0 )
5302             h->cbp_table[h->mb_xy] |= 0x100;
5303         else
5304             h->cbp_table[h->mb_xy] |= 0x40 << n;
5305     } else {
5306         if( cat == 5 )
5307             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5308         else {
5309             assert( cat == 1 || cat == 2 || cat == 4 );
5310             h->non_zero_count_cache[scan8[n]] = coeff_count;
5311         }
5312     }
5313
5314     do {
5315         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5316
5317         int j= scantable[index[--coeff_count]];
5318
5319         if( get_cabac( CC, ctx ) == 0 ) {
5320             node_ctx = coeff_abs_level_transition[0][node_ctx];
5321             if( is_dc ) {
5322                 block[j] = get_cabac_bypass_sign( CC, -1);
5323             }else{
5324                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5325             }
5326         } else {
5327             int coeff_abs = 2;
5328             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5329             node_ctx = coeff_abs_level_transition[1][node_ctx];
5330
5331             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5332                 coeff_abs++;
5333             }
5334
5335             if( coeff_abs >= 15 ) {
5336                 int j = 0;
5337                 while( get_cabac_bypass( CC ) ) {
5338                     j++;
5339                 }
5340
5341                 coeff_abs=1;
5342                 while( j-- ) {
5343                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5344                 }
5345                 coeff_abs+= 14;
5346             }
5347
5348             if( is_dc ) {
5349                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5350             }else{
5351                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5352             }
5353         }
5354     } while( coeff_count );
5355 #ifdef CABAC_ON_STACK
5356             h->cabac.range     = cc.range     ;
5357             h->cabac.low       = cc.low       ;
5358             h->cabac.bytestream= cc.bytestream;
5359 #endif
5360
5361 }
5362
5363 #ifndef CONFIG_SMALL
5364 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5365     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5366 }
5367
5368 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5369     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5370 }
5371 #endif
5372
5373 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5374 #ifdef CONFIG_SMALL
5375     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5376 #else
5377     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5378     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5379 #endif
5380 }
5381
5382 static inline void compute_mb_neighbors(H264Context *h)
5383 {
5384     MpegEncContext * const s = &h->s;
5385     const int mb_xy  = h->mb_xy;
5386     h->top_mb_xy     = mb_xy - s->mb_stride;
5387     h->left_mb_xy[0] = mb_xy - 1;
5388     if(FRAME_MBAFF){
5389         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5390         const int top_pair_xy      = pair_xy     - s->mb_stride;
5391         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5392         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5393         const int curr_mb_field_flag = MB_FIELD;
5394         const int bottom = (s->mb_y & 1);
5395
5396         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5397             h->top_mb_xy -= s->mb_stride;
5398         }
5399         if (!left_mb_field_flag == curr_mb_field_flag) {
5400             h->left_mb_xy[0] = pair_xy - 1;
5401         }
5402     } else if (FIELD_PICTURE) {
5403         h->top_mb_xy -= s->mb_stride;
5404     }
5405     return;
5406 }
5407
5408 /**
5409  * decodes a macroblock
5410  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5411  */
5412 static int decode_mb_cabac(H264Context *h) {
5413     MpegEncContext * const s = &h->s;
5414     int mb_xy;
5415     int mb_type, partition_count, cbp = 0;
5416     int dct8x8_allowed= h->pps.transform_8x8_mode;
5417
5418     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5419
5420     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5421     if( h->slice_type_nos != FF_I_TYPE ) {
5422         int skip;
5423         /* a skipped mb needs the aff flag from the following mb */
5424         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5425             predict_field_decoding_flag(h);
5426         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5427             skip = h->next_mb_skipped;
5428         else
5429             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5430         /* read skip flags */
5431         if( skip ) {
5432             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5433                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5434                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5435                 if(!h->next_mb_skipped)
5436                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5437             }
5438
5439             decode_mb_skip(h);
5440
5441             h->cbp_table[mb_xy] = 0;
5442             h->chroma_pred_mode_table[mb_xy] = 0;
5443             h->last_qscale_diff = 0;
5444
5445             return 0;
5446
5447         }
5448     }
5449     if(FRAME_MBAFF){
5450         if( (s->mb_y&1) == 0 )
5451             h->mb_mbaff =
5452             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5453     }
5454
5455     h->prev_mb_skipped = 0;
5456
5457     compute_mb_neighbors(h);
5458
5459     if( h->slice_type_nos == FF_B_TYPE ) {
5460         mb_type = decode_cabac_mb_type_b( h );
5461         if( mb_type < 23 ){
5462             partition_count= b_mb_type_info[mb_type].partition_count;
5463             mb_type=         b_mb_type_info[mb_type].type;
5464         }else{
5465             mb_type -= 23;
5466             goto decode_intra_mb;
5467         }
5468     } else if( h->slice_type_nos == FF_P_TYPE ) {
5469         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5470             /* P-type */
5471             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5472                 /* P_L0_D16x16, P_8x8 */
5473                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5474             } else {
5475                 /* P_L0_D8x16, P_L0_D16x8 */
5476                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5477             }
5478             partition_count= p_mb_type_info[mb_type].partition_count;
5479             mb_type=         p_mb_type_info[mb_type].type;
5480         } else {
5481             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5482             goto decode_intra_mb;
5483         }
5484     } else {
5485         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5486         if(h->slice_type == FF_SI_TYPE && mb_type)
5487             mb_type--;
5488         assert(h->slice_type_nos == FF_I_TYPE);
5489 decode_intra_mb:
5490         partition_count = 0;
5491         cbp= i_mb_type_info[mb_type].cbp;
5492         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5493         mb_type= i_mb_type_info[mb_type].type;
5494     }
5495     if(MB_FIELD)
5496         mb_type |= MB_TYPE_INTERLACED;
5497
5498     h->slice_table[ mb_xy ]= h->slice_num;
5499
5500     if(IS_INTRA_PCM(mb_type)) {
5501         const uint8_t *ptr;
5502
5503         // We assume these blocks are very rare so we do not optimize it.
5504         // FIXME The two following lines get the bitstream position in the cabac
5505         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5506         ptr= h->cabac.bytestream;
5507         if(h->cabac.low&0x1) ptr--;
5508         if(CABAC_BITS==16){
5509             if(h->cabac.low&0x1FF) ptr--;
5510         }
5511
5512         // The pixels are stored in the same order as levels in h->mb array.
5513         memcpy(h->mb, ptr, 256); ptr+=256;
5514         if(CHROMA){
5515             memcpy(h->mb+128, ptr, 128); ptr+=128;
5516         }
5517
5518         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5519
5520         // All blocks are present
5521         h->cbp_table[mb_xy] = 0x1ef;
5522         h->chroma_pred_mode_table[mb_xy] = 0;
5523         // In deblocking, the quantizer is 0
5524         s->current_picture.qscale_table[mb_xy]= 0;
5525         // All coeffs are present
5526         memset(h->non_zero_count[mb_xy], 16, 16);
5527         s->current_picture.mb_type[mb_xy]= mb_type;
5528         h->last_qscale_diff = 0;
5529         return 0;
5530     }
5531
5532     if(MB_MBAFF){
5533         h->ref_count[0] <<= 1;
5534         h->ref_count[1] <<= 1;
5535     }
5536
5537     fill_caches(h, mb_type, 0);
5538
5539     if( IS_INTRA( mb_type ) ) {
5540         int i, pred_mode;
5541         if( IS_INTRA4x4( mb_type ) ) {
5542             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5543                 mb_type |= MB_TYPE_8x8DCT;
5544                 for( i = 0; i < 16; i+=4 ) {
5545                     int pred = pred_intra_mode( h, i );
5546                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5547                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5548                 }
5549             } else {
5550                 for( i = 0; i < 16; i++ ) {
5551                     int pred = pred_intra_mode( h, i );
5552                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5553
5554                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5555                 }
5556             }
5557             write_back_intra_pred_mode(h);
5558             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5559         } else {
5560             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5561             if( h->intra16x16_pred_mode < 0 ) return -1;
5562         }
5563         if(CHROMA){
5564             h->chroma_pred_mode_table[mb_xy] =
5565             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5566
5567             pred_mode= check_intra_pred_mode( h, pred_mode );
5568             if( pred_mode < 0 ) return -1;
5569             h->chroma_pred_mode= pred_mode;
5570         }
5571     } else if( partition_count == 4 ) {
5572         int i, j, sub_partition_count[4], list, ref[2][4];
5573
5574         if( h->slice_type_nos == FF_B_TYPE ) {
5575             for( i = 0; i < 4; i++ ) {
5576                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5577                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5578                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5579             }
5580             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5581                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5582                 pred_direct_motion(h, &mb_type);
5583                 h->ref_cache[0][scan8[4]] =
5584                 h->ref_cache[1][scan8[4]] =
5585                 h->ref_cache[0][scan8[12]] =
5586                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5587                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5588                     for( i = 0; i < 4; i++ )
5589                         if( IS_DIRECT(h->sub_mb_type[i]) )
5590                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5591                 }
5592             }
5593         } else {
5594             for( i = 0; i < 4; i++ ) {
5595                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5596                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5597                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5598             }
5599         }
5600
5601         for( list = 0; list < h->list_count; list++ ) {
5602                 for( i = 0; i < 4; i++ ) {
5603                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5604                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5605                         if( h->ref_count[list] > 1 ){
5606                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5607                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5608                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5609                                 return -1;
5610                             }
5611                         }else
5612                             ref[list][i] = 0;
5613                     } else {
5614                         ref[list][i] = -1;
5615                     }
5616                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5617                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5618                 }
5619         }
5620
5621         if(dct8x8_allowed)
5622             dct8x8_allowed = get_dct8x8_allowed(h);
5623
5624         for(list=0; list<h->list_count; list++){
5625             for(i=0; i<4; i++){
5626                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5627                 if(IS_DIRECT(h->sub_mb_type[i])){
5628                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5629                     continue;
5630                 }
5631
5632                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5633                     const int sub_mb_type= h->sub_mb_type[i];
5634                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5635                     for(j=0; j<sub_partition_count[i]; j++){
5636                         int mpx, mpy;
5637                         int mx, my;
5638                         const int index= 4*i + block_width*j;
5639                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5640                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5641                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5642
5643                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5644                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5645                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5646
5647                         if(IS_SUB_8X8(sub_mb_type)){
5648                             mv_cache[ 1 ][0]=
5649                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5650                             mv_cache[ 1 ][1]=
5651                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5652
5653                             mvd_cache[ 1 ][0]=
5654                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5655                             mvd_cache[ 1 ][1]=
5656                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5657                         }else if(IS_SUB_8X4(sub_mb_type)){
5658                             mv_cache[ 1 ][0]= mx;
5659                             mv_cache[ 1 ][1]= my;
5660
5661                             mvd_cache[ 1 ][0]= mx - mpx;
5662                             mvd_cache[ 1 ][1]= my - mpy;
5663                         }else if(IS_SUB_4X8(sub_mb_type)){
5664                             mv_cache[ 8 ][0]= mx;
5665                             mv_cache[ 8 ][1]= my;
5666
5667                             mvd_cache[ 8 ][0]= mx - mpx;
5668                             mvd_cache[ 8 ][1]= my - mpy;
5669                         }
5670                         mv_cache[ 0 ][0]= mx;
5671                         mv_cache[ 0 ][1]= my;
5672
5673                         mvd_cache[ 0 ][0]= mx - mpx;
5674                         mvd_cache[ 0 ][1]= my - mpy;
5675                     }
5676                 }else{
5677                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5678                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5679                     p[0] = p[1] = p[8] = p[9] = 0;
5680                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5681                 }
5682             }
5683         }
5684     } else if( IS_DIRECT(mb_type) ) {
5685         pred_direct_motion(h, &mb_type);
5686         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5687         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5688         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5689     } else {
5690         int list, mx, my, i, mpx, mpy;
5691         if(IS_16X16(mb_type)){
5692             for(list=0; list<h->list_count; list++){
5693                 if(IS_DIR(mb_type, 0, list)){
5694                     int ref;
5695                     if(h->ref_count[list] > 1){
5696                         ref= decode_cabac_mb_ref(h, list, 0);
5697                         if(ref >= (unsigned)h->ref_count[list]){
5698                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5699                             return -1;
5700                         }
5701                     }else
5702                         ref=0;
5703                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5704                 }else
5705                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5706             }
5707             for(list=0; list<h->list_count; list++){
5708                 if(IS_DIR(mb_type, 0, list)){
5709                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5710
5711                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5712                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5713                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5714
5715                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5716                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5717                 }else
5718                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5719             }
5720         }
5721         else if(IS_16X8(mb_type)){
5722             for(list=0; list<h->list_count; list++){
5723                     for(i=0; i<2; i++){
5724                         if(IS_DIR(mb_type, i, list)){
5725                             int ref;
5726                             if(h->ref_count[list] > 1){
5727                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5728                                 if(ref >= (unsigned)h->ref_count[list]){
5729                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5730                                     return -1;
5731                                 }
5732                             }else
5733                                 ref=0;
5734                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5735                         }else
5736                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5737                     }
5738             }
5739             for(list=0; list<h->list_count; list++){
5740                 for(i=0; i<2; i++){
5741                     if(IS_DIR(mb_type, i, list)){
5742                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5743                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5744                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5745                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5746
5747                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5748                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5749                     }else{
5750                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5751                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5752                     }
5753                 }
5754             }
5755         }else{
5756             assert(IS_8X16(mb_type));
5757             for(list=0; list<h->list_count; list++){
5758                     for(i=0; i<2; i++){
5759                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5760                             int ref;
5761                             if(h->ref_count[list] > 1){
5762                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5763                                 if(ref >= (unsigned)h->ref_count[list]){
5764                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5765                                     return -1;
5766                                 }
5767                             }else
5768                                 ref=0;
5769                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5770                         }else
5771                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5772                     }
5773             }
5774             for(list=0; list<h->list_count; list++){
5775                 for(i=0; i<2; i++){
5776                     if(IS_DIR(mb_type, i, list)){
5777                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5778                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5779                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5780
5781                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5782                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5783                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5784                     }else{
5785                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5786                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5787                     }
5788                 }
5789             }
5790         }
5791     }
5792
5793    if( IS_INTER( mb_type ) ) {
5794         h->chroma_pred_mode_table[mb_xy] = 0;
5795         write_back_motion( h, mb_type );
5796    }
5797
5798     if( !IS_INTRA16x16( mb_type ) ) {
5799         cbp  = decode_cabac_mb_cbp_luma( h );
5800         if(CHROMA)
5801             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5802     }
5803
5804     h->cbp_table[mb_xy] = h->cbp = cbp;
5805
5806     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5807         if( decode_cabac_mb_transform_size( h ) )
5808             mb_type |= MB_TYPE_8x8DCT;
5809     }
5810     s->current_picture.mb_type[mb_xy]= mb_type;
5811
5812     if( cbp || IS_INTRA16x16( mb_type ) ) {
5813         const uint8_t *scan, *scan8x8, *dc_scan;
5814         const uint32_t *qmul;
5815         int dqp;
5816
5817         if(IS_INTERLACED(mb_type)){
5818             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5819             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5820             dc_scan= luma_dc_field_scan;
5821         }else{
5822             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5823             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5824             dc_scan= luma_dc_zigzag_scan;
5825         }
5826
5827         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5828         if( dqp == INT_MIN ){
5829             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5830             return -1;
5831         }
5832         s->qscale += dqp;
5833         if(((unsigned)s->qscale) > 51){
5834             if(s->qscale<0) s->qscale+= 52;
5835             else            s->qscale-= 52;
5836         }
5837         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5838         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5839
5840         if( IS_INTRA16x16( mb_type ) ) {
5841             int i;
5842             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5843             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5844
5845             if( cbp&15 ) {
5846                 qmul = h->dequant4_coeff[0][s->qscale];
5847                 for( i = 0; i < 16; i++ ) {
5848                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5849                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5850                 }
5851             } else {
5852                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5853             }
5854         } else {
5855             int i8x8, i4x4;
5856             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5857                 if( cbp & (1<<i8x8) ) {
5858                     if( IS_8x8DCT(mb_type) ) {
5859                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5860                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5861                     } else {
5862                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5863                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5864                             const int index = 4*i8x8 + i4x4;
5865                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5866 //START_TIMER
5867                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5868 //STOP_TIMER("decode_residual")
5869                         }
5870                     }
5871                 } else {
5872                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5873                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5874                 }
5875             }
5876         }
5877
5878         if( cbp&0x30 ){
5879             int c;
5880             for( c = 0; c < 2; c++ ) {
5881                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5882                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5883             }
5884         }
5885
5886         if( cbp&0x20 ) {
5887             int c, i;
5888             for( c = 0; c < 2; c++ ) {
5889                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5890                 for( i = 0; i < 4; i++ ) {
5891                     const int index = 16 + 4 * c + i;
5892                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5893                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5894                 }
5895             }
5896         } else {
5897             uint8_t * const nnz= &h->non_zero_count_cache[0];
5898             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5899             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5900         }
5901     } else {
5902         uint8_t * const nnz= &h->non_zero_count_cache[0];
5903         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5904         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5905         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5906         h->last_qscale_diff = 0;
5907     }
5908
5909     s->current_picture.qscale_table[mb_xy]= s->qscale;
5910     write_back_non_zero_count(h);
5911
5912     if(MB_MBAFF){
5913         h->ref_count[0] >>= 1;
5914         h->ref_count[1] >>= 1;
5915     }
5916
5917     return 0;
5918 }
5919
5920
5921 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5922     const int index_a = qp + h->slice_alpha_c0_offset;
5923     const int alpha = (alpha_table+52)[index_a];
5924     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5925
5926     if( bS[0] < 4 ) {
5927         int8_t tc[4];
5928         tc[0] = (tc0_table+52)[index_a][bS[0]];
5929         tc[1] = (tc0_table+52)[index_a][bS[1]];
5930         tc[2] = (tc0_table+52)[index_a][bS[2]];
5931         tc[3] = (tc0_table+52)[index_a][bS[3]];
5932         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5933     } else {
5934         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5935     }
5936 }
5937 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5938     const int index_a = qp + h->slice_alpha_c0_offset;
5939     const int alpha = (alpha_table+52)[index_a];
5940     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5941
5942     if( bS[0] < 4 ) {
5943         int8_t tc[4];
5944         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5945         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5946         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5947         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5948         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5949     } else {
5950         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5951     }
5952 }
5953
5954 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5955     int i;
5956     for( i = 0; i < 16; i++, pix += stride) {
5957         int index_a;
5958         int alpha;
5959         int beta;
5960
5961         int qp_index;
5962         int bS_index = (i >> 1);
5963         if (!MB_FIELD) {
5964             bS_index &= ~1;
5965             bS_index |= (i & 1);
5966         }
5967
5968         if( bS[bS_index] == 0 ) {
5969             continue;
5970         }
5971
5972         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5973         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5974         alpha = (alpha_table+52)[index_a];
5975         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5976
5977         if( bS[bS_index] < 4 ) {
5978             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5979             const int p0 = pix[-1];
5980             const int p1 = pix[-2];
5981             const int p2 = pix[-3];
5982             const int q0 = pix[0];
5983             const int q1 = pix[1];
5984             const int q2 = pix[2];
5985
5986             if( FFABS( p0 - q0 ) < alpha &&
5987                 FFABS( p1 - p0 ) < beta &&
5988                 FFABS( q1 - q0 ) < beta ) {
5989                 int tc = tc0;
5990                 int i_delta;
5991
5992                 if( FFABS( p2 - p0 ) < beta ) {
5993                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5994                     tc++;
5995                 }
5996                 if( FFABS( q2 - q0 ) < beta ) {
5997                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5998                     tc++;
5999                 }
6000
6001                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6002                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6003                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6004                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6005             }
6006         }else{
6007             const int p0 = pix[-1];
6008             const int p1 = pix[-2];
6009             const int p2 = pix[-3];
6010
6011             const int q0 = pix[0];
6012             const int q1 = pix[1];
6013             const int q2 = pix[2];
6014
6015             if( FFABS( p0 - q0 ) < alpha &&
6016                 FFABS( p1 - p0 ) < beta &&
6017                 FFABS( q1 - q0 ) < beta ) {
6018
6019                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6020                     if( FFABS( p2 - p0 ) < beta)
6021                     {
6022                         const int p3 = pix[-4];
6023                         /* p0', p1', p2' */
6024                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6025                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6026                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6027                     } else {
6028                         /* p0' */
6029                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6030                     }
6031                     if( FFABS( q2 - q0 ) < beta)
6032                     {
6033                         const int q3 = pix[3];
6034                         /* q0', q1', q2' */
6035                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6036                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6037                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6038                     } else {
6039                         /* q0' */
6040                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6041                     }
6042                 }else{
6043                     /* p0', q0' */
6044                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6045                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6046                 }
6047                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6048             }
6049         }
6050     }
6051 }
6052 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6053     int i;
6054     for( i = 0; i < 8; i++, pix += stride) {
6055         int index_a;
6056         int alpha;
6057         int beta;
6058
6059         int qp_index;
6060         int bS_index = i;
6061
6062         if( bS[bS_index] == 0 ) {
6063             continue;
6064         }
6065
6066         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6067         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6068         alpha = (alpha_table+52)[index_a];
6069         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6070
6071         if( bS[bS_index] < 4 ) {
6072             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6073             const int p0 = pix[-1];
6074             const int p1 = pix[-2];
6075             const int q0 = pix[0];
6076             const int q1 = pix[1];
6077
6078             if( FFABS( p0 - q0 ) < alpha &&
6079                 FFABS( p1 - p0 ) < beta &&
6080                 FFABS( q1 - q0 ) < beta ) {
6081                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6082
6083                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6084                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6085                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6086             }
6087         }else{
6088             const int p0 = pix[-1];
6089             const int p1 = pix[-2];
6090             const int q0 = pix[0];
6091             const int q1 = pix[1];
6092
6093             if( FFABS( p0 - q0 ) < alpha &&
6094                 FFABS( p1 - p0 ) < beta &&
6095                 FFABS( q1 - q0 ) < beta ) {
6096
6097                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6098                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6099                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6100             }
6101         }
6102     }
6103 }
6104
6105 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6106     const int index_a = qp + h->slice_alpha_c0_offset;
6107     const int alpha = (alpha_table+52)[index_a];
6108     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6109
6110     if( bS[0] < 4 ) {
6111         int8_t tc[4];
6112         tc[0] = (tc0_table+52)[index_a][bS[0]];
6113         tc[1] = (tc0_table+52)[index_a][bS[1]];
6114         tc[2] = (tc0_table+52)[index_a][bS[2]];
6115         tc[3] = (tc0_table+52)[index_a][bS[3]];
6116         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6117     } else {
6118         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6119     }
6120 }
6121
6122 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6123     const int index_a = qp + h->slice_alpha_c0_offset;
6124     const int alpha = (alpha_table+52)[index_a];
6125     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6126
6127     if( bS[0] < 4 ) {
6128         int8_t tc[4];
6129         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6130         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6131         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6132         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6133         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6134     } else {
6135         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6136     }
6137 }
6138
6139 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6140     MpegEncContext * const s = &h->s;
6141     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6142     int mb_xy, mb_type;
6143     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6144
6145     mb_xy = h->mb_xy;
6146
6147     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6148         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6149        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6150                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6151         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6152         return;
6153     }
6154     assert(!FRAME_MBAFF);
6155
6156     mb_type = s->current_picture.mb_type[mb_xy];
6157     qp = s->current_picture.qscale_table[mb_xy];
6158     qp0 = s->current_picture.qscale_table[mb_xy-1];
6159     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6160     qpc = get_chroma_qp( h, 0, qp );
6161     qpc0 = get_chroma_qp( h, 0, qp0 );
6162     qpc1 = get_chroma_qp( h, 0, qp1 );
6163     qp0 = (qp + qp0 + 1) >> 1;
6164     qp1 = (qp + qp1 + 1) >> 1;
6165     qpc0 = (qpc + qpc0 + 1) >> 1;
6166     qpc1 = (qpc + qpc1 + 1) >> 1;
6167     qp_thresh = 15 - h->slice_alpha_c0_offset;
6168     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6169        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6170         return;
6171
6172     if( IS_INTRA(mb_type) ) {
6173         int16_t bS4[4] = {4,4,4,4};
6174         int16_t bS3[4] = {3,3,3,3};
6175         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6176         if( IS_8x8DCT(mb_type) ) {
6177             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6178             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6179             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6180             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6181         } else {
6182             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6183             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6184             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6185             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6186             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6187             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6188             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6189             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6190         }
6191         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6192         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6193         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6194         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6195         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6196         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6197         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6198         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6199         return;
6200     } else {
6201         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6202         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6203         int edges;
6204         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6205             edges = 4;
6206             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6207         } else {
6208             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6209                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6210             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6211                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6212                              ? 3 : 0;
6213             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6214             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6215             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6216                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6217         }
6218         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6219             bSv[0][0] = 0x0004000400040004ULL;
6220         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6221             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6222
6223 #define FILTER(hv,dir,edge)\
6224         if(bSv[dir][edge]) {\
6225             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6226             if(!(edge&1)) {\
6227                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6228                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6229             }\
6230         }
6231         if( edges == 1 ) {
6232             FILTER(v,0,0);
6233             FILTER(h,1,0);
6234         } else if( IS_8x8DCT(mb_type) ) {
6235             FILTER(v,0,0);
6236             FILTER(v,0,2);
6237             FILTER(h,1,0);
6238             FILTER(h,1,2);
6239         } else {
6240             FILTER(v,0,0);
6241             FILTER(v,0,1);
6242             FILTER(v,0,2);
6243             FILTER(v,0,3);
6244             FILTER(h,1,0);
6245             FILTER(h,1,1);
6246             FILTER(h,1,2);
6247             FILTER(h,1,3);
6248         }
6249 #undef FILTER
6250     }
6251 }
6252
6253
6254 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6255     MpegEncContext * const s = &h->s;
6256     int edge;
6257     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6258     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6259     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6260     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6261     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6262
6263     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6264                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6265     // how often to recheck mv-based bS when iterating between edges
6266     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6267                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6268     // how often to recheck mv-based bS when iterating along each edge
6269     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6270
6271     if (first_vertical_edge_done) {
6272         start = 1;
6273     }
6274
6275     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6276         start = 1;
6277
6278     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6279         && !IS_INTERLACED(mb_type)
6280         && IS_INTERLACED(mbm_type)
6281         ) {
6282         // This is a special case in the norm where the filtering must
6283         // be done twice (one each of the field) even if we are in a
6284         // frame macroblock.
6285         //
6286         static const int nnz_idx[4] = {4,5,6,3};
6287         unsigned int tmp_linesize   = 2 *   linesize;
6288         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6289         int mbn_xy = mb_xy - 2 * s->mb_stride;
6290         int qp;
6291         int i, j;
6292         int16_t bS[4];
6293
6294         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6295             if( IS_INTRA(mb_type) ||
6296                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6297                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6298             } else {
6299                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6300                 for( i = 0; i < 4; i++ ) {
6301                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6302                         mbn_nnz[nnz_idx[i]] != 0 )
6303                         bS[i] = 2;
6304                     else
6305                         bS[i] = 1;
6306                 }
6307             }
6308             // Do not use s->qscale as luma quantizer because it has not the same
6309             // value in IPCM macroblocks.
6310             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6311             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6312             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6313             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6314             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6315                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6316             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6317                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6318         }
6319
6320         start = 1;
6321     }
6322
6323     /* Calculate bS */
6324     for( edge = start; edge < edges; edge++ ) {
6325         /* mbn_xy: neighbor macroblock */
6326         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6327         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6328         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6329         int16_t bS[4];
6330         int qp;
6331
6332         if( (edge&1) && IS_8x8DCT(mb_type) )
6333             continue;
6334
6335         if( IS_INTRA(mb_type) ||
6336             IS_INTRA(mbn_type) ) {
6337             int value;
6338             if (edge == 0) {
6339                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6340                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6341                 ) {
6342                     value = 4;
6343                 } else {
6344                     value = 3;
6345                 }
6346             } else {
6347                 value = 3;
6348             }
6349             bS[0] = bS[1] = bS[2] = bS[3] = value;
6350         } else {
6351             int i, l;
6352             int mv_done;
6353
6354             if( edge & mask_edge ) {
6355                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6356                 mv_done = 1;
6357             }
6358             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6359                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6360                 mv_done = 1;
6361             }
6362             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6363                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6364                 int bn_idx= b_idx - (dir ? 8:1);
6365                 int v = 0;
6366
6367                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6368                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6369                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6370                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6371                 }
6372
6373                 if(h->slice_type_nos == FF_B_TYPE && v){
6374                     v=0;
6375                     for( l = 0; !v && l < 2; l++ ) {
6376                         int ln= 1-l;
6377                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6378                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6379                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6380                     }
6381                 }
6382
6383                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6384                 mv_done = 1;
6385             }
6386             else
6387                 mv_done = 0;
6388
6389             for( i = 0; i < 4; i++ ) {
6390                 int x = dir == 0 ? edge : i;
6391                 int y = dir == 0 ? i    : edge;
6392                 int b_idx= 8 + 4 + x + 8*y;
6393                 int bn_idx= b_idx - (dir ? 8:1);
6394
6395                 if( h->non_zero_count_cache[b_idx] |
6396                     h->non_zero_count_cache[bn_idx] ) {
6397                     bS[i] = 2;
6398                 }
6399                 else if(!mv_done)
6400                 {
6401                     bS[i] = 0;
6402                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6403                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6404                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6405                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6406                             bS[i] = 1;
6407                             break;
6408                         }
6409                     }
6410
6411                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6412                         bS[i] = 0;
6413                         for( l = 0; l < 2; l++ ) {
6414                             int ln= 1-l;
6415                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6416                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6417                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6418                                 bS[i] = 1;
6419                                 break;
6420                             }
6421                         }
6422                     }
6423                 }
6424             }
6425
6426             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6427                 continue;
6428         }
6429
6430         /* Filter edge */
6431         // Do not use s->qscale as luma quantizer because it has not the same
6432         // value in IPCM macroblocks.
6433         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6434         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6435         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6436         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6437         if( dir == 0 ) {
6438             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6439             if( (edge&1) == 0 ) {
6440                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6441                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6442                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6443                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6444             }
6445         } else {
6446             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6447             if( (edge&1) == 0 ) {
6448                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6449                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6450                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6451                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6452             }
6453         }
6454     }
6455 }
6456
6457 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6458     MpegEncContext * const s = &h->s;
6459     const int mb_xy= mb_x + mb_y*s->mb_stride;
6460     const int mb_type = s->current_picture.mb_type[mb_xy];
6461     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6462     int first_vertical_edge_done = 0;
6463     int dir;
6464
6465     //for sufficiently low qp, filtering wouldn't do anything
6466     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6467     if(!FRAME_MBAFF){
6468         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6469         int qp = s->current_picture.qscale_table[mb_xy];
6470         if(qp <= qp_thresh
6471            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6472            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6473             return;
6474         }
6475     }
6476
6477     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6478     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6479         int top_type, left_type[2];
6480         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6481         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6482         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6483
6484         if(IS_8x8DCT(top_type)){
6485             h->non_zero_count_cache[4+8*0]=
6486             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6487             h->non_zero_count_cache[6+8*0]=
6488             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6489         }
6490         if(IS_8x8DCT(left_type[0])){
6491             h->non_zero_count_cache[3+8*1]=
6492             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6493         }
6494         if(IS_8x8DCT(left_type[1])){
6495             h->non_zero_count_cache[3+8*3]=
6496             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6497         }
6498
6499         if(IS_8x8DCT(mb_type)){
6500             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6501             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6502
6503             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6504             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6505
6506             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6507             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6508
6509             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6510             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6511         }
6512     }
6513
6514     if (FRAME_MBAFF
6515             // left mb is in picture
6516             && h->slice_table[mb_xy-1] != 0xFFFF
6517             // and current and left pair do not have the same interlaced type
6518             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6519             // and left mb is in the same slice if deblocking_filter == 2
6520             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6521         /* First vertical edge is different in MBAFF frames
6522          * There are 8 different bS to compute and 2 different Qp
6523          */
6524         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6525         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6526         int16_t bS[8];
6527         int qp[2];
6528         int bqp[2];
6529         int rqp[2];
6530         int mb_qp, mbn0_qp, mbn1_qp;
6531         int i;
6532         first_vertical_edge_done = 1;
6533
6534         if( IS_INTRA(mb_type) )
6535             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6536         else {
6537             for( i = 0; i < 8; i++ ) {
6538                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6539
6540                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6541                     bS[i] = 4;
6542                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6543                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6544                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6545                                                                        :
6546                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6547                     bS[i] = 2;
6548                 else
6549                     bS[i] = 1;
6550             }
6551         }
6552
6553         mb_qp = s->current_picture.qscale_table[mb_xy];
6554         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6555         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6556         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6557         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6558                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6559         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6560                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6561         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6562         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6563                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6564         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6565                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6566
6567         /* Filter edge */
6568         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6569         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6570         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6571         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6572         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6573     }
6574
6575 #ifdef CONFIG_SMALL
6576     for( dir = 0; dir < 2; dir++ )
6577         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6578 #else
6579     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6580     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6581 #endif
6582 }
6583
6584 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6585     H264Context *h = *(void**)arg;
6586     MpegEncContext * const s = &h->s;
6587     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6588
6589     s->mb_skip_run= -1;
6590
6591     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6592                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6593
6594     if( h->pps.cabac ) {
6595         int i;
6596
6597         /* realign */
6598         align_get_bits( &s->gb );
6599
6600         /* init cabac */
6601         ff_init_cabac_states( &h->cabac);
6602         ff_init_cabac_decoder( &h->cabac,
6603                                s->gb.buffer + get_bits_count(&s->gb)/8,
6604                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6605         /* calculate pre-state */
6606         for( i= 0; i < 460; i++ ) {
6607             int pre;
6608             if( h->slice_type_nos == FF_I_TYPE )
6609                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6610             else
6611                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6612
6613             if( pre <= 63 )
6614                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6615             else
6616                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6617         }
6618
6619         for(;;){
6620 //START_TIMER
6621             int ret = decode_mb_cabac(h);
6622             int eos;
6623 //STOP_TIMER("decode_mb_cabac")
6624
6625             if(ret>=0) hl_decode_mb(h);
6626
6627             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6628                 s->mb_y++;
6629
6630                 ret = decode_mb_cabac(h);
6631
6632                 if(ret>=0) hl_decode_mb(h);
6633                 s->mb_y--;
6634             }
6635             eos = get_cabac_terminate( &h->cabac );
6636
6637             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6638                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6639                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6640                 return -1;
6641             }
6642
6643             if( ++s->mb_x >= s->mb_width ) {
6644                 s->mb_x = 0;
6645                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6646                 ++s->mb_y;
6647                 if(FIELD_OR_MBAFF_PICTURE) {
6648                     ++s->mb_y;
6649                 }
6650             }
6651
6652             if( eos || s->mb_y >= s->mb_height ) {
6653                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6654                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6655                 return 0;
6656             }
6657         }
6658
6659     } else {
6660         for(;;){
6661             int ret = decode_mb_cavlc(h);
6662
6663             if(ret>=0) hl_decode_mb(h);
6664
6665             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6666                 s->mb_y++;
6667                 ret = decode_mb_cavlc(h);
6668
6669                 if(ret>=0) hl_decode_mb(h);
6670                 s->mb_y--;
6671             }
6672
6673             if(ret<0){
6674                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6675                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6676
6677                 return -1;
6678             }
6679
6680             if(++s->mb_x >= s->mb_width){
6681                 s->mb_x=0;
6682                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6683                 ++s->mb_y;
6684                 if(FIELD_OR_MBAFF_PICTURE) {
6685                     ++s->mb_y;
6686                 }
6687                 if(s->mb_y >= s->mb_height){
6688                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6689
6690                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6691                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6692
6693                         return 0;
6694                     }else{
6695                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6696
6697                         return -1;
6698                     }
6699                 }
6700             }
6701
6702             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6703                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6704                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6705                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6706
6707                     return 0;
6708                 }else{
6709                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6710
6711                     return -1;
6712                 }
6713             }
6714         }
6715     }
6716
6717 #if 0
6718     for(;s->mb_y < s->mb_height; s->mb_y++){
6719         for(;s->mb_x < s->mb_width; s->mb_x++){
6720             int ret= decode_mb(h);
6721
6722             hl_decode_mb(h);
6723
6724             if(ret<0){
6725                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6726                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6727
6728                 return -1;
6729             }
6730
6731             if(++s->mb_x >= s->mb_width){
6732                 s->mb_x=0;
6733                 if(++s->mb_y >= s->mb_height){
6734                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6735                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6736
6737                         return 0;
6738                     }else{
6739                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6740
6741                         return -1;
6742                     }
6743                 }
6744             }
6745
6746             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6747                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6748                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6749
6750                     return 0;
6751                 }else{
6752                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6753
6754                     return -1;
6755                 }
6756             }
6757         }
6758         s->mb_x=0;
6759         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6760     }
6761 #endif
6762     return -1; //not reached
6763 }
6764
6765 static int decode_picture_timing(H264Context *h){
6766     MpegEncContext * const s = &h->s;
6767     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6768         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6769         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6770     }
6771     if(h->sps.pic_struct_present_flag){
6772         unsigned int i, num_clock_ts;
6773         h->sei_pic_struct = get_bits(&s->gb, 4);
6774
6775         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6776             return -1;
6777
6778         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6779
6780         for (i = 0 ; i < num_clock_ts ; i++){
6781             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6782                 unsigned int full_timestamp_flag;
6783                 skip_bits(&s->gb, 2);                 /* ct_type */
6784                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6785                 skip_bits(&s->gb, 5);                 /* counting_type */
6786                 full_timestamp_flag = get_bits(&s->gb, 1);
6787                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6788                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6789                 skip_bits(&s->gb, 8);                 /* n_frames */
6790                 if(full_timestamp_flag){
6791                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6792                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6793                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6794                 }else{
6795                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6796                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6797                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6798                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6799                             if(get_bits(&s->gb, 1))   /* hours_flag */
6800                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6801                         }
6802                     }
6803                 }
6804                 if(h->sps.time_offset_length > 0)
6805                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6806             }
6807         }
6808     }
6809     return 0;
6810 }
6811
6812 static int decode_unregistered_user_data(H264Context *h, int size){
6813     MpegEncContext * const s = &h->s;
6814     uint8_t user_data[16+256];
6815     int e, build, i;
6816
6817     if(size<16)
6818         return -1;
6819
6820     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6821         user_data[i]= get_bits(&s->gb, 8);
6822     }
6823
6824     user_data[i]= 0;
6825     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6826     if(e==1 && build>=0)
6827         h->x264_build= build;
6828
6829     if(s->avctx->debug & FF_DEBUG_BUGS)
6830         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6831
6832     for(; i<size; i++)
6833         skip_bits(&s->gb, 8);
6834
6835     return 0;
6836 }
6837
6838 static int decode_sei(H264Context *h){
6839     MpegEncContext * const s = &h->s;
6840
6841     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6842         int size, type;
6843
6844         type=0;
6845         do{
6846             type+= show_bits(&s->gb, 8);
6847         }while(get_bits(&s->gb, 8) == 255);
6848
6849         size=0;
6850         do{
6851             size+= show_bits(&s->gb, 8);
6852         }while(get_bits(&s->gb, 8) == 255);
6853
6854         switch(type){
6855         case 1: // Picture timing SEI
6856             if(decode_picture_timing(h) < 0)
6857                 return -1;
6858             break;
6859         case 5:
6860             if(decode_unregistered_user_data(h, size) < 0)
6861                 return -1;
6862             break;
6863         default:
6864             skip_bits(&s->gb, 8*size);
6865         }
6866
6867         //FIXME check bits here
6868         align_get_bits(&s->gb);
6869     }
6870
6871     return 0;
6872 }
6873
6874 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6875     MpegEncContext * const s = &h->s;
6876     int cpb_count, i;
6877     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6878
6879     if(cpb_count > 32U){
6880         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6881         return -1;
6882     }
6883
6884     get_bits(&s->gb, 4); /* bit_rate_scale */
6885     get_bits(&s->gb, 4); /* cpb_size_scale */
6886     for(i=0; i<cpb_count; i++){
6887         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6888         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6889         get_bits1(&s->gb);     /* cbr_flag */
6890     }
6891     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6892     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6893     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6894     sps->time_offset_length = get_bits(&s->gb, 5);
6895     return 0;
6896 }
6897
6898 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6899     MpegEncContext * const s = &h->s;
6900     int aspect_ratio_info_present_flag;
6901     unsigned int aspect_ratio_idc;
6902
6903     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6904
6905     if( aspect_ratio_info_present_flag ) {
6906         aspect_ratio_idc= get_bits(&s->gb, 8);
6907         if( aspect_ratio_idc == EXTENDED_SAR ) {
6908             sps->sar.num= get_bits(&s->gb, 16);
6909             sps->sar.den= get_bits(&s->gb, 16);
6910         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6911             sps->sar=  pixel_aspect[aspect_ratio_idc];
6912         }else{
6913             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6914             return -1;
6915         }
6916     }else{
6917         sps->sar.num=
6918         sps->sar.den= 0;
6919     }
6920 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6921
6922     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6923         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6924     }
6925
6926     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6927         get_bits(&s->gb, 3);    /* video_format */
6928         get_bits1(&s->gb);      /* video_full_range_flag */
6929         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6930             get_bits(&s->gb, 8); /* colour_primaries */
6931             get_bits(&s->gb, 8); /* transfer_characteristics */
6932             get_bits(&s->gb, 8); /* matrix_coefficients */
6933         }
6934     }
6935
6936     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6937         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6938         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6939     }
6940
6941     sps->timing_info_present_flag = get_bits1(&s->gb);
6942     if(sps->timing_info_present_flag){
6943         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6944         sps->time_scale = get_bits_long(&s->gb, 32);
6945         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6946     }
6947
6948     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6949     if(sps->nal_hrd_parameters_present_flag)
6950         if(decode_hrd_parameters(h, sps) < 0)
6951             return -1;
6952     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6953     if(sps->vcl_hrd_parameters_present_flag)
6954         if(decode_hrd_parameters(h, sps) < 0)
6955             return -1;
6956     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6957         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6958     sps->pic_struct_present_flag = get_bits1(&s->gb);
6959
6960     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6961     if(sps->bitstream_restriction_flag){
6962         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6963         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6964         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6965         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6966         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6967         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6968         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6969
6970         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6971             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6972             return -1;
6973         }
6974     }
6975
6976     return 0;
6977 }
6978
6979 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6980                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6981     MpegEncContext * const s = &h->s;
6982     int i, last = 8, next = 8;
6983     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6984     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6985         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6986     else
6987     for(i=0;i<size;i++){
6988         if(next)
6989             next = (last + get_se_golomb(&s->gb)) & 0xff;
6990         if(!i && !next){ /* matrix not written, we use the preset one */
6991             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6992             break;
6993         }
6994         last = factors[scan[i]] = next ? next : last;
6995     }
6996 }
6997
6998 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6999                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7000     MpegEncContext * const s = &h->s;
7001     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7002     const uint8_t *fallback[4] = {
7003         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7004         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7005         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7006         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7007     };
7008     if(get_bits1(&s->gb)){
7009         sps->scaling_matrix_present |= is_sps;
7010         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7011         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7012         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7013         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7014         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7015         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7016         if(is_sps || pps->transform_8x8_mode){
7017             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7018             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7019         }
7020     }
7021 }
7022
7023 static inline int decode_seq_parameter_set(H264Context *h){
7024     MpegEncContext * const s = &h->s;
7025     int profile_idc, level_idc;
7026     unsigned int sps_id;
7027     int i;
7028     SPS *sps;
7029
7030     profile_idc= get_bits(&s->gb, 8);
7031     get_bits1(&s->gb);   //constraint_set0_flag
7032     get_bits1(&s->gb);   //constraint_set1_flag
7033     get_bits1(&s->gb);   //constraint_set2_flag
7034     get_bits1(&s->gb);   //constraint_set3_flag
7035     get_bits(&s->gb, 4); // reserved
7036     level_idc= get_bits(&s->gb, 8);
7037     sps_id= get_ue_golomb_31(&s->gb);
7038
7039     if(sps_id >= MAX_SPS_COUNT) {
7040         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7041         return -1;
7042     }
7043     sps= av_mallocz(sizeof(SPS));
7044     if(sps == NULL)
7045         return -1;
7046
7047     sps->profile_idc= profile_idc;
7048     sps->level_idc= level_idc;
7049
7050     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7051     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7052     sps->scaling_matrix_present = 0;
7053
7054     if(sps->profile_idc >= 100){ //high profile
7055         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7056         if(sps->chroma_format_idc == 3)
7057             get_bits1(&s->gb);  //residual_color_transform_flag
7058         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7059         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7060         sps->transform_bypass = get_bits1(&s->gb);
7061         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7062     }else{
7063         sps->chroma_format_idc= 1;
7064     }
7065
7066     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7067     sps->poc_type= get_ue_golomb_31(&s->gb);
7068
7069     if(sps->poc_type == 0){ //FIXME #define
7070         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7071     } else if(sps->poc_type == 1){//FIXME #define
7072         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7073         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7074         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7075         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7076
7077         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7078             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7079             goto fail;
7080         }
7081
7082         for(i=0; i<sps->poc_cycle_length; i++)
7083             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7084     }else if(sps->poc_type != 2){
7085         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7086         goto fail;
7087     }
7088
7089     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7090     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7091         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7092         goto fail;
7093     }
7094     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7095     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7096     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7097     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7098        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7099         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7100         goto fail;
7101     }
7102
7103     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7104     if(!sps->frame_mbs_only_flag)
7105         sps->mb_aff= get_bits1(&s->gb);
7106     else
7107         sps->mb_aff= 0;
7108
7109     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7110
7111 #ifndef ALLOW_INTERLACE
7112     if(sps->mb_aff)
7113         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7114 #endif
7115     sps->crop= get_bits1(&s->gb);
7116     if(sps->crop){
7117         sps->crop_left  = get_ue_golomb(&s->gb);
7118         sps->crop_right = get_ue_golomb(&s->gb);
7119         sps->crop_top   = get_ue_golomb(&s->gb);
7120         sps->crop_bottom= get_ue_golomb(&s->gb);
7121         if(sps->crop_left || sps->crop_top){
7122             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7123         }
7124         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7125             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7126         }
7127     }else{
7128         sps->crop_left  =
7129         sps->crop_right =
7130         sps->crop_top   =
7131         sps->crop_bottom= 0;
7132     }
7133
7134     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7135     if( sps->vui_parameters_present_flag )
7136         decode_vui_parameters(h, sps);
7137
7138     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7139         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7140                sps_id, sps->profile_idc, sps->level_idc,
7141                sps->poc_type,
7142                sps->ref_frame_count,
7143                sps->mb_width, sps->mb_height,
7144                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7145                sps->direct_8x8_inference_flag ? "8B8" : "",
7146                sps->crop_left, sps->crop_right,
7147                sps->crop_top, sps->crop_bottom,
7148                sps->vui_parameters_present_flag ? "VUI" : "",
7149                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7150                );
7151     }
7152     av_free(h->sps_buffers[sps_id]);
7153     h->sps_buffers[sps_id]= sps;
7154     return 0;
7155 fail:
7156     av_free(sps);
7157     return -1;
7158 }
7159
7160 static void
7161 build_qp_table(PPS *pps, int t, int index)
7162 {
7163     int i;
7164     for(i = 0; i < 52; i++)
7165         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7166 }
7167
7168 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7169     MpegEncContext * const s = &h->s;
7170     unsigned int pps_id= get_ue_golomb(&s->gb);
7171     PPS *pps;
7172
7173     if(pps_id >= MAX_PPS_COUNT) {
7174         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7175         return -1;
7176     }
7177
7178     pps= av_mallocz(sizeof(PPS));
7179     if(pps == NULL)
7180         return -1;
7181     pps->sps_id= get_ue_golomb_31(&s->gb);
7182     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7183         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7184         goto fail;
7185     }
7186
7187     pps->cabac= get_bits1(&s->gb);
7188     pps->pic_order_present= get_bits1(&s->gb);
7189     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7190     if(pps->slice_group_count > 1 ){
7191         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7192         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7193         switch(pps->mb_slice_group_map_type){
7194         case 0:
7195 #if 0
7196 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7197 |    run_length[ i ]                                |1  |ue(v)   |
7198 #endif
7199             break;
7200         case 2:
7201 #if 0
7202 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7203 |{                                                  |   |        |
7204 |    top_left_mb[ i ]                               |1  |ue(v)   |
7205 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7206 |   }                                               |   |        |
7207 #endif
7208             break;
7209         case 3:
7210         case 4:
7211         case 5:
7212 #if 0
7213 |   slice_group_change_direction_flag               |1  |u(1)    |
7214 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7215 #endif
7216             break;
7217         case 6:
7218 #if 0
7219 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7220 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7221 |)                                                  |   |        |
7222 |    slice_group_id[ i ]                            |1  |u(v)    |
7223 #endif
7224             break;
7225         }
7226     }
7227     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7228     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7229     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7230         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7231         goto fail;
7232     }
7233
7234     pps->weighted_pred= get_bits1(&s->gb);
7235     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7236     pps->init_qp= get_se_golomb(&s->gb) + 26;
7237     pps->init_qs= get_se_golomb(&s->gb) + 26;
7238     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7239     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7240     pps->constrained_intra_pred= get_bits1(&s->gb);
7241     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7242
7243     pps->transform_8x8_mode= 0;
7244     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7245     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7246     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7247
7248     if(get_bits_count(&s->gb) < bit_length){
7249         pps->transform_8x8_mode= get_bits1(&s->gb);
7250         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7251         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7252     } else {
7253         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7254     }
7255
7256     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7257     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7258     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7259         h->pps.chroma_qp_diff= 1;
7260
7261     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7262         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7263                pps_id, pps->sps_id,
7264                pps->cabac ? "CABAC" : "CAVLC",
7265                pps->slice_group_count,
7266                pps->ref_count[0], pps->ref_count[1],
7267                pps->weighted_pred ? "weighted" : "",
7268                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7269                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7270                pps->constrained_intra_pred ? "CONSTR" : "",
7271                pps->redundant_pic_cnt_present ? "REDU" : "",
7272                pps->transform_8x8_mode ? "8x8DCT" : ""
7273                );
7274     }
7275
7276     av_free(h->pps_buffers[pps_id]);
7277     h->pps_buffers[pps_id]= pps;
7278     return 0;
7279 fail:
7280     av_free(pps);
7281     return -1;
7282 }
7283
7284 /**
7285  * Call decode_slice() for each context.
7286  *
7287  * @param h h264 master context
7288  * @param context_count number of contexts to execute
7289  */
7290 static void execute_decode_slices(H264Context *h, int context_count){
7291     MpegEncContext * const s = &h->s;
7292     AVCodecContext * const avctx= s->avctx;
7293     H264Context *hx;
7294     int i;
7295
7296     if(avctx->codec_id == CODEC_ID_H264_VDPAU)
7297         return;
7298     if(context_count == 1) {
7299         decode_slice(avctx, &h);
7300     } else {
7301         for(i = 1; i < context_count; i++) {
7302             hx = h->thread_context[i];
7303             hx->s.error_recognition = avctx->error_recognition;
7304             hx->s.error_count = 0;
7305         }
7306
7307         avctx->execute(avctx, (void *)decode_slice,
7308                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7309
7310         /* pull back stuff from slices to master context */
7311         hx = h->thread_context[context_count - 1];
7312         s->mb_x = hx->s.mb_x;
7313         s->mb_y = hx->s.mb_y;
7314         s->dropable = hx->s.dropable;
7315         s->picture_structure = hx->s.picture_structure;
7316         for(i = 1; i < context_count; i++)
7317             h->s.error_count += h->thread_context[i]->s.error_count;
7318     }
7319 }
7320
7321
7322 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7323     MpegEncContext * const s = &h->s;
7324     AVCodecContext * const avctx= s->avctx;
7325     int buf_index=0;
7326     H264Context *hx; ///< thread context
7327     int context_count = 0;
7328
7329     h->max_contexts = avctx->thread_count;
7330 #if 0
7331     int i;
7332     for(i=0; i<50; i++){
7333         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7334     }
7335 #endif
7336     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7337         h->current_slice = 0;
7338         if (!s->first_field)
7339             s->current_picture_ptr= NULL;
7340     }
7341
7342     for(;;){
7343         int consumed;
7344         int dst_length;
7345         int bit_length;
7346         const uint8_t *ptr;
7347         int i, nalsize = 0;
7348         int err;
7349
7350         if(h->is_avc) {
7351             if(buf_index >= buf_size) break;
7352             nalsize = 0;
7353             for(i = 0; i < h->nal_length_size; i++)
7354                 nalsize = (nalsize << 8) | buf[buf_index++];
7355             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7356                 if(nalsize == 1){
7357                     buf_index++;
7358                     continue;
7359                 }else{
7360                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7361                     break;
7362                 }
7363             }
7364         } else {
7365             // start code prefix search
7366             for(; buf_index + 3 < buf_size; buf_index++){
7367                 // This should always succeed in the first iteration.
7368                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7369                     break;
7370             }
7371
7372             if(buf_index+3 >= buf_size) break;
7373
7374             buf_index+=3;
7375         }
7376
7377         hx = h->thread_context[context_count];
7378
7379         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7380         if (ptr==NULL || dst_length < 0){
7381             return -1;
7382         }
7383         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7384             dst_length--;
7385         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7386
7387         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7388             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7389         }
7390
7391         if (h->is_avc && (nalsize != consumed)){
7392             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7393             consumed= nalsize;
7394         }
7395
7396         buf_index += consumed;
7397
7398         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7399            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7400             continue;
7401
7402       again:
7403         err = 0;
7404         switch(hx->nal_unit_type){
7405         case NAL_IDR_SLICE:
7406             if (h->nal_unit_type != NAL_IDR_SLICE) {
7407                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7408                 return -1;
7409             }
7410             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7411         case NAL_SLICE:
7412             init_get_bits(&hx->s.gb, ptr, bit_length);
7413             hx->intra_gb_ptr=
7414             hx->inter_gb_ptr= &hx->s.gb;
7415             hx->s.data_partitioning = 0;
7416
7417             if((err = decode_slice_header(hx, h)))
7418                break;
7419
7420             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7421             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7422                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7423                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7424                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7425                && avctx->skip_frame < AVDISCARD_ALL){
7426                 if(ENABLE_H264_VDPAU_DECODER && avctx->codec_id == CODEC_ID_H264_VDPAU){
7427                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7428                     ff_vdpau_h264_add_data_chunk(h, start_code, sizeof(start_code));
7429                     ff_vdpau_h264_add_data_chunk(h, &buf[buf_index - consumed], consumed );
7430                 }else
7431                     context_count++;
7432             }
7433             break;
7434         case NAL_DPA:
7435             init_get_bits(&hx->s.gb, ptr, bit_length);
7436             hx->intra_gb_ptr=
7437             hx->inter_gb_ptr= NULL;
7438             hx->s.data_partitioning = 1;
7439
7440             err = decode_slice_header(hx, h);
7441             break;
7442         case NAL_DPB:
7443             init_get_bits(&hx->intra_gb, ptr, bit_length);
7444             hx->intra_gb_ptr= &hx->intra_gb;
7445             break;
7446         case NAL_DPC:
7447             init_get_bits(&hx->inter_gb, ptr, bit_length);
7448             hx->inter_gb_ptr= &hx->inter_gb;
7449
7450             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7451                && s->context_initialized
7452                && s->hurry_up < 5
7453                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7454                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7455                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7456                && avctx->skip_frame < AVDISCARD_ALL)
7457                 context_count++;
7458             break;
7459         case NAL_SEI:
7460             init_get_bits(&s->gb, ptr, bit_length);
7461             decode_sei(h);
7462             break;
7463         case NAL_SPS:
7464             init_get_bits(&s->gb, ptr, bit_length);
7465             decode_seq_parameter_set(h);
7466
7467             if(s->flags& CODEC_FLAG_LOW_DELAY)
7468                 s->low_delay=1;
7469
7470             if(avctx->has_b_frames < 2)
7471                 avctx->has_b_frames= !s->low_delay;
7472             break;
7473         case NAL_PPS:
7474             init_get_bits(&s->gb, ptr, bit_length);
7475
7476             decode_picture_parameter_set(h, bit_length);
7477
7478             break;
7479         case NAL_AUD:
7480         case NAL_END_SEQUENCE:
7481         case NAL_END_STREAM:
7482         case NAL_FILLER_DATA:
7483         case NAL_SPS_EXT:
7484         case NAL_AUXILIARY_SLICE:
7485             break;
7486         default:
7487             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7488         }
7489
7490         if(context_count == h->max_contexts) {
7491             execute_decode_slices(h, context_count);
7492             context_count = 0;
7493         }
7494
7495         if (err < 0)
7496             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7497         else if(err == 1) {
7498             /* Slice could not be decoded in parallel mode, copy down
7499              * NAL unit stuff to context 0 and restart. Note that
7500              * rbsp_buffer is not transferred, but since we no longer
7501              * run in parallel mode this should not be an issue. */
7502             h->nal_unit_type = hx->nal_unit_type;
7503             h->nal_ref_idc   = hx->nal_ref_idc;
7504             hx = h;
7505             goto again;
7506         }
7507     }
7508     if(context_count)
7509         execute_decode_slices(h, context_count);
7510     return buf_index;
7511 }
7512
7513 /**
7514  * returns the number of bytes consumed for building the current frame
7515  */
7516 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7517         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7518         if(pos+10>buf_size) pos=buf_size; // oops ;)
7519
7520         return pos;
7521 }
7522
7523 static int decode_frame(AVCodecContext *avctx,
7524                              void *data, int *data_size,
7525                              const uint8_t *buf, int buf_size)
7526 {
7527     H264Context *h = avctx->priv_data;
7528     MpegEncContext *s = &h->s;
7529     AVFrame *pict = data;
7530     int buf_index;
7531
7532     s->flags= avctx->flags;
7533     s->flags2= avctx->flags2;
7534
7535    /* end of stream, output what is still in the buffers */
7536     if (buf_size == 0) {
7537         Picture *out;
7538         int i, out_idx;
7539
7540 //FIXME factorize this with the output code below
7541         out = h->delayed_pic[0];
7542         out_idx = 0;
7543         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7544             if(h->delayed_pic[i]->poc < out->poc){
7545                 out = h->delayed_pic[i];
7546                 out_idx = i;
7547             }
7548
7549         for(i=out_idx; h->delayed_pic[i]; i++)
7550             h->delayed_pic[i] = h->delayed_pic[i+1];
7551
7552         if(out){
7553             *data_size = sizeof(AVFrame);
7554             *pict= *(AVFrame*)out;
7555         }
7556
7557         return 0;
7558     }
7559
7560     if(h->is_avc && !h->got_avcC) {
7561         int i, cnt, nalsize;
7562         unsigned char *p = avctx->extradata;
7563         if(avctx->extradata_size < 7) {
7564             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7565             return -1;
7566         }
7567         if(*p != 1) {
7568             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7569             return -1;
7570         }
7571         /* sps and pps in the avcC always have length coded with 2 bytes,
7572            so put a fake nal_length_size = 2 while parsing them */
7573         h->nal_length_size = 2;
7574         // Decode sps from avcC
7575         cnt = *(p+5) & 0x1f; // Number of sps
7576         p += 6;
7577         for (i = 0; i < cnt; i++) {
7578             nalsize = AV_RB16(p) + 2;
7579             if(decode_nal_units(h, p, nalsize) < 0) {
7580                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7581                 return -1;
7582             }
7583             p += nalsize;
7584         }
7585         // Decode pps from avcC
7586         cnt = *(p++); // Number of pps
7587         for (i = 0; i < cnt; i++) {
7588             nalsize = AV_RB16(p) + 2;
7589             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7590                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7591                 return -1;
7592             }
7593             p += nalsize;
7594         }
7595         // Now store right nal length size, that will be use to parse all other nals
7596         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7597         // Do not reparse avcC
7598         h->got_avcC = 1;
7599     }
7600
7601     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7602         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7603             return -1;
7604         h->got_avcC = 1;
7605     }
7606
7607     buf_index=decode_nal_units(h, buf, buf_size);
7608     if(buf_index < 0)
7609         return -1;
7610
7611     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7612         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7613         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7614         return -1;
7615     }
7616
7617     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7618         Picture *out = s->current_picture_ptr;
7619         Picture *cur = s->current_picture_ptr;
7620         int i, pics, cross_idr, out_of_order, out_idx;
7621
7622         s->mb_y= 0;
7623
7624         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7625         s->current_picture_ptr->pict_type= s->pict_type;
7626
7627         if (ENABLE_H264_VDPAU_DECODER && avctx->codec_id == CODEC_ID_H264_VDPAU)
7628             ff_vdpau_h264_set_reference_frames(h);
7629
7630         if(!s->dropable) {
7631             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7632             h->prev_poc_msb= h->poc_msb;
7633             h->prev_poc_lsb= h->poc_lsb;
7634         }
7635         h->prev_frame_num_offset= h->frame_num_offset;
7636         h->prev_frame_num= h->frame_num;
7637
7638         if (ENABLE_H264_VDPAU_DECODER && avctx->codec_id == CODEC_ID_H264_VDPAU)
7639             ff_vdpau_h264_picture_complete(h);
7640
7641         /*
7642          * FIXME: Error handling code does not seem to support interlaced
7643          * when slices span multiple rows
7644          * The ff_er_add_slice calls don't work right for bottom
7645          * fields; they cause massive erroneous error concealing
7646          * Error marking covers both fields (top and bottom).
7647          * This causes a mismatched s->error_count
7648          * and a bad error table. Further, the error count goes to
7649          * INT_MAX when called for bottom field, because mb_y is
7650          * past end by one (callers fault) and resync_mb_y != 0
7651          * causes problems for the first MB line, too.
7652          */
7653         if (!FIELD_PICTURE)
7654             ff_er_frame_end(s);
7655
7656         MPV_frame_end(s);
7657
7658         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7659             /* Wait for second field. */
7660             *data_size = 0;
7661
7662         } else {
7663             cur->repeat_pict = 0;
7664
7665             /* Signal interlacing information externally. */
7666             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7667             if(h->sps.pic_struct_present_flag){
7668                 switch (h->sei_pic_struct)
7669                 {
7670                 case SEI_PIC_STRUCT_FRAME:
7671                     cur->interlaced_frame = 0;
7672                     break;
7673                 case SEI_PIC_STRUCT_TOP_FIELD:
7674                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7675                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7676                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7677                     cur->interlaced_frame = 1;
7678                     break;
7679                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7680                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7681                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7682                     // From these hints, let the applications decide if they apply deinterlacing.
7683                     cur->repeat_pict = 1;
7684                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7685                     break;
7686                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7687                     // Force progressive here, as doubling interlaced frame is a bad idea.
7688                     cur->interlaced_frame = 0;
7689                     cur->repeat_pict = 2;
7690                     break;
7691                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7692                     cur->interlaced_frame = 0;
7693                     cur->repeat_pict = 4;
7694                     break;
7695                 }
7696             }else{
7697                 /* Derive interlacing flag from used decoding process. */
7698                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7699             }
7700
7701             if (cur->field_poc[0] != cur->field_poc[1]){
7702                 /* Derive top_field_first from field pocs. */
7703                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7704             }else{
7705                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7706                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7707                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7708                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7709                         cur->top_field_first = 1;
7710                     else
7711                         cur->top_field_first = 0;
7712                 }else{
7713                     /* Most likely progressive */
7714                     cur->top_field_first = 0;
7715                 }
7716             }
7717
7718         //FIXME do something with unavailable reference frames
7719
7720             /* Sort B-frames into display order */
7721
7722             if(h->sps.bitstream_restriction_flag
7723                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7724                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7725                 s->low_delay = 0;
7726             }
7727
7728             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7729                && !h->sps.bitstream_restriction_flag){
7730                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7731                 s->low_delay= 0;
7732             }
7733
7734             pics = 0;
7735             while(h->delayed_pic[pics]) pics++;
7736
7737             assert(pics <= MAX_DELAYED_PIC_COUNT);
7738
7739             h->delayed_pic[pics++] = cur;
7740             if(cur->reference == 0)
7741                 cur->reference = DELAYED_PIC_REF;
7742
7743             out = h->delayed_pic[0];
7744             out_idx = 0;
7745             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7746                 if(h->delayed_pic[i]->poc < out->poc){
7747                     out = h->delayed_pic[i];
7748                     out_idx = i;
7749                 }
7750             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7751
7752             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7753
7754             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7755                 { }
7756             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7757                || (s->low_delay &&
7758                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7759                  || cur->pict_type == FF_B_TYPE)))
7760             {
7761                 s->low_delay = 0;
7762                 s->avctx->has_b_frames++;
7763             }
7764
7765             if(out_of_order || pics > s->avctx->has_b_frames){
7766                 out->reference &= ~DELAYED_PIC_REF;
7767                 for(i=out_idx; h->delayed_pic[i]; i++)
7768                     h->delayed_pic[i] = h->delayed_pic[i+1];
7769             }
7770             if(!out_of_order && pics > s->avctx->has_b_frames){
7771                 *data_size = sizeof(AVFrame);
7772
7773                 h->outputed_poc = out->poc;
7774                 *pict= *(AVFrame*)out;
7775             }else{
7776                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7777             }
7778         }
7779     }
7780
7781     assert(pict->data[0] || !*data_size);
7782     ff_print_debug_info(s, pict);
7783 //printf("out %d\n", (int)pict->data[0]);
7784 #if 0 //?
7785
7786     /* Return the Picture timestamp as the frame number */
7787     /* we subtract 1 because it is added on utils.c     */
7788     avctx->frame_number = s->picture_number - 1;
7789 #endif
7790     return get_consumed_bytes(s, buf_index, buf_size);
7791 }
7792 #if 0
7793 static inline void fill_mb_avail(H264Context *h){
7794     MpegEncContext * const s = &h->s;
7795     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7796
7797     if(s->mb_y){
7798         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7799         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7800         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7801     }else{
7802         h->mb_avail[0]=
7803         h->mb_avail[1]=
7804         h->mb_avail[2]= 0;
7805     }
7806     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7807     h->mb_avail[4]= 1; //FIXME move out
7808     h->mb_avail[5]= 0; //FIXME move out
7809 }
7810 #endif
7811
7812 #ifdef TEST
7813 #undef printf
7814 #undef random
7815 #define COUNT 8000
7816 #define SIZE (COUNT*40)
7817 int main(void){
7818     int i;
7819     uint8_t temp[SIZE];
7820     PutBitContext pb;
7821     GetBitContext gb;
7822 //    int int_temp[10000];
7823     DSPContext dsp;
7824     AVCodecContext avctx;
7825
7826     dsputil_init(&dsp, &avctx);
7827
7828     init_put_bits(&pb, temp, SIZE);
7829     printf("testing unsigned exp golomb\n");
7830     for(i=0; i<COUNT; i++){
7831         START_TIMER
7832         set_ue_golomb(&pb, i);
7833         STOP_TIMER("set_ue_golomb");
7834     }
7835     flush_put_bits(&pb);
7836
7837     init_get_bits(&gb, temp, 8*SIZE);
7838     for(i=0; i<COUNT; i++){
7839         int j, s;
7840
7841         s= show_bits(&gb, 24);
7842
7843         START_TIMER
7844         j= get_ue_golomb(&gb);
7845         if(j != i){
7846             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7847 //            return -1;
7848         }
7849         STOP_TIMER("get_ue_golomb");
7850     }
7851
7852
7853     init_put_bits(&pb, temp, SIZE);
7854     printf("testing signed exp golomb\n");
7855     for(i=0; i<COUNT; i++){
7856         START_TIMER
7857         set_se_golomb(&pb, i - COUNT/2);
7858         STOP_TIMER("set_se_golomb");
7859     }
7860     flush_put_bits(&pb);
7861
7862     init_get_bits(&gb, temp, 8*SIZE);
7863     for(i=0; i<COUNT; i++){
7864         int j, s;
7865
7866         s= show_bits(&gb, 24);
7867
7868         START_TIMER
7869         j= get_se_golomb(&gb);
7870         if(j != i - COUNT/2){
7871             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7872 //            return -1;
7873         }
7874         STOP_TIMER("get_se_golomb");
7875     }
7876
7877 #if 0
7878     printf("testing 4x4 (I)DCT\n");
7879
7880     DCTELEM block[16];
7881     uint8_t src[16], ref[16];
7882     uint64_t error= 0, max_error=0;
7883
7884     for(i=0; i<COUNT; i++){
7885         int j;
7886 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7887         for(j=0; j<16; j++){
7888             ref[j]= random()%255;
7889             src[j]= random()%255;
7890         }
7891
7892         h264_diff_dct_c(block, src, ref, 4);
7893
7894         //normalize
7895         for(j=0; j<16; j++){
7896 //            printf("%d ", block[j]);
7897             block[j]= block[j]*4;
7898             if(j&1) block[j]= (block[j]*4 + 2)/5;
7899             if(j&4) block[j]= (block[j]*4 + 2)/5;
7900         }
7901 //        printf("\n");
7902
7903         s->dsp.h264_idct_add(ref, block, 4);
7904 /*        for(j=0; j<16; j++){
7905             printf("%d ", ref[j]);
7906         }
7907         printf("\n");*/
7908
7909         for(j=0; j<16; j++){
7910             int diff= FFABS(src[j] - ref[j]);
7911
7912             error+= diff*diff;
7913             max_error= FFMAX(max_error, diff);
7914         }
7915     }
7916     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7917     printf("testing quantizer\n");
7918     for(qp=0; qp<52; qp++){
7919         for(i=0; i<16; i++)
7920             src1_block[i]= src2_block[i]= random()%255;
7921
7922     }
7923     printf("Testing NAL layer\n");
7924
7925     uint8_t bitstream[COUNT];
7926     uint8_t nal[COUNT*2];
7927     H264Context h;
7928     memset(&h, 0, sizeof(H264Context));
7929
7930     for(i=0; i<COUNT; i++){
7931         int zeros= i;
7932         int nal_length;
7933         int consumed;
7934         int out_length;
7935         uint8_t *out;
7936         int j;
7937
7938         for(j=0; j<COUNT; j++){
7939             bitstream[j]= (random() % 255) + 1;
7940         }
7941
7942         for(j=0; j<zeros; j++){
7943             int pos= random() % COUNT;
7944             while(bitstream[pos] == 0){
7945                 pos++;
7946                 pos %= COUNT;
7947             }
7948             bitstream[pos]=0;
7949         }
7950
7951         START_TIMER
7952
7953         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7954         if(nal_length<0){
7955             printf("encoding failed\n");
7956             return -1;
7957         }
7958
7959         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7960
7961         STOP_TIMER("NAL")
7962
7963         if(out_length != COUNT){
7964             printf("incorrect length %d %d\n", out_length, COUNT);
7965             return -1;
7966         }
7967
7968         if(consumed != nal_length){
7969             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7970             return -1;
7971         }
7972
7973         if(memcmp(bitstream, out, COUNT)){
7974             printf("mismatch\n");
7975             return -1;
7976         }
7977     }
7978 #endif
7979
7980     printf("Testing RBSP\n");
7981
7982
7983     return 0;
7984 }
7985 #endif /* TEST */
7986
7987
7988 static av_cold int decode_end(AVCodecContext *avctx)
7989 {
7990     H264Context *h = avctx->priv_data;
7991     MpegEncContext *s = &h->s;
7992     int i;
7993
7994     av_freep(&h->rbsp_buffer[0]);
7995     av_freep(&h->rbsp_buffer[1]);
7996     free_tables(h); //FIXME cleanup init stuff perhaps
7997
7998     for(i = 0; i < MAX_SPS_COUNT; i++)
7999         av_freep(h->sps_buffers + i);
8000
8001     for(i = 0; i < MAX_PPS_COUNT; i++)
8002         av_freep(h->pps_buffers + i);
8003
8004     MPV_common_end(s);
8005
8006 //    memset(h, 0, sizeof(H264Context));
8007
8008     return 0;
8009 }
8010
8011
8012 AVCodec h264_decoder = {
8013     "h264",
8014     CODEC_TYPE_VIDEO,
8015     CODEC_ID_H264,
8016     sizeof(H264Context),
8017     decode_init,
8018     NULL,
8019     decode_end,
8020     decode_frame,
8021     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8022     .flush= flush_dpb,
8023     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8024 };
8025
8026 #ifdef CONFIG_H264_VDPAU_DECODER
8027 AVCodec h264_vdpau_decoder = {
8028     "h264_vdpau",
8029     CODEC_TYPE_VIDEO,
8030     CODEC_ID_H264_VDPAU,
8031     sizeof(H264Context),
8032     decode_init,
8033     NULL,
8034     decode_end,
8035     decode_frame,
8036     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8037     .flush= flush_dpb,
8038     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8039 };
8040 #endif
8041
8042 #ifdef CONFIG_SVQ3_DECODER
8043 #include "svq3.c"
8044 #endif