git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "mathops.h"
  36 #include "rectangle.h"
  37 #include "vdpau_internal.h"
  38
  39 #include "cabac.h"
  40 #if ARCH_X86
  41 #include "x86/h264_i386.h"
  42 #endif
  43
  44 //#undef NDEBUG
  45 #include <assert.h>
  46
  47 /**
  48  * Value of Picture.reference when Picture is not a reference picture, but
  49  * is held for delayed output.
  50  */
  51 #define DELAYED_PIC_REF 4
  52
     /*
      * VLC objects together with statically sized arrays and constants holding
      * the arrays' entry counts. The arrays are presumably the backing storage
      * handed to the VLC initialisation code elsewhere in this file (not visible
      * here) -- TODO confirm.
      */

     /* Four coeff_token tables share one array: 520+332+280+256 entries,
      * with the per-table entry counts listed in coeff_token_vlc_tables_size. */
  53 static VLC coeff_token_vlc[4];
  54 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  55 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  56
     /* Single chroma DC coeff_token table with 256 entries. */
  57 static VLC chroma_dc_coeff_token_vlc;
  58 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  59 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  60
     /* 15 total_zeros tables of 512 entries each. */
  61 static VLC total_zeros_vlc[15];
  62 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  63 static const int total_zeros_vlc_tables_size = 512;
  64
     /* 3 chroma DC total_zeros tables of 8 entries each. */
  65 static VLC chroma_dc_total_zeros_vlc[3];
  66 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  67 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  68
     /* 6 run tables of 8 entries each. */
  69 static VLC run_vlc[6];
  70 static VLC_TYPE run_vlc_tables[6][8][2];
  71 static const int run_vlc_tables_size = 8;
  72
     /* Separate, larger (96 entries) run table. */
  73 static VLC run7_vlc;
  74 static VLC_TYPE run7_vlc_table[96][2];
  75 static const int run7_vlc_table_size = 96;
  76
     /* Forward declarations of static functions that are used before their
      * definitions appear in this translation unit. */
  77 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  78 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  79 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  80 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  82
  83 static av_always_inline uint32_t pack16to32(int a, int b){
  84 #ifdef WORDS_BIGENDIAN
  85    return (b&0xFFFF) + (a<<16);
  86 #else
  87    return (a&0xFFFF) + (b<<16);
  88 #endif
  89 }
  90
  91 static const uint8_t rem6[52]={
  92 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  93 };
  94
  95 static const uint8_t div6[52]={
  96 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  97 };
  98
  99 static const uint8_t left_block_options[4][8]={
 100     {0,1,2,3,7,10,8,11},
 101     {2,2,3,3,8,11,8,11},
 102     {0,0,1,1,7,10,7,10},
 103     {0,2,0,2,7,10,7,10}
 104 };
 105
     /* Number of index bits of cavlc_level_tab: each of its 7 sub-tables has
      * 1<<LEVEL_TAB_BITS (256) entries, and every entry is a pair of int8_t. */
 106 #define LEVEL_TAB_BITS 8
 107 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 108
     /**
      * Loads the neighbour-dependent caches of the current macroblock (h->mb_xy)
      * from its top, left, top-left and top-right neighbours.
      *
      * Depending on the path taken this writes h->top_mb_xy and h->left_mb_xy[],
      * the intra sample-availability masks and intra4x4_pred_mode_cache,
      * non_zero_count_cache, top_cbp / left_cbp (CABAC only), mv_cache and
      * ref_cache, mvd_cache and direct_cache (CABAC only) and finally
      * neighbor_transform_size.
      *
      * @param mb_type     type flags of the current macroblock
      * @param for_deblock non-zero when called for the loop filter: neighbour
      *                    presence is then tested with slice_table < 0xFFFF instead
      *                    of equality with the current slice number, the intra
      *                    prediction part is skipped, and the top-left, top-right,
      *                    mvd and direct parts are not loaded. The function may also
      *                    return early without touching anything (see below).
      */
 109 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 110     MpegEncContext * const s = &h->s;
 111     const int mb_xy= h->mb_xy;
 112     int topleft_xy, top_xy, topright_xy, left_xy[2];
 113     int topleft_type, top_type, topright_type, left_type[2];
 114     const uint8_t * left_block;
 115     int topleft_partition= -1;
 116     int i;
 117
         /* macroblock above; the stride is shifted left by FIELD_PICTURE */
 118     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 119
 120     //FIXME deblocking could skip the intra and nnz parts.
         /* Early out for deblocking outside MBAFF when slice_num is 1 or the top
          * macroblock has the same slice_table entry as the current one. */
 121     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 122         return;
 123
 124     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 125      * stuff, I can't imagine that these complex rules are worth it. */
 126
         /* Defaults: the directly adjacent macroblocks and row 0 of
          * left_block_options; adjusted below for MBAFF frames. */
 127     topleft_xy = top_xy - 1;
 128     topright_xy= top_xy + 1;
 129     left_xy[1] = left_xy[0] = mb_xy-1;
 130     left_block = left_block_options[0];
 131     if(FRAME_MBAFF){
             /* pair_xy addresses the even row (mb_y & ~1) of the current column;
              * the field flags of the neighbouring pairs are read at the
              * corresponding addresses. */
 132         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 133         const int top_pair_xy      = pair_xy     - s->mb_stride;
 134         const int topleft_pair_xy  = top_pair_xy - 1;
 135         const int topright_pair_xy = top_pair_xy + 1;
 136         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 137         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 138         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 139         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 140         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 141         const int bottom = (s->mb_y & 1);
 142         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 143
             /* A field macroblock looks one row further up when it is in an odd
              * row (bottom) or when the respective neighbouring pair is a field
              * pair. */
 144         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 145             top_xy -= s->mb_stride;
 146         }
 147         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 155             topright_xy -= s->mb_stride;
 156         }
             /* Left pair coded differently from the current macroblock: restart
              * from the left pair's even row and pick another left_block row. */
 157         if (left_mb_field_flag != curr_mb_field_flag) {
 158             left_xy[1] = left_xy[0] = pair_xy - 1;
 159             if (curr_mb_field_flag) {
 160                 left_xy[1] += s->mb_stride;
 161                 left_block = left_block_options[3];
 162             } else {
 163                 left_block= left_block_options[2 - bottom];
 164             }
 165         }
 166     }
 167
         /* Remember the neighbour addresses that were finally chosen. */
 168     h->top_mb_xy = top_xy;
 169     h->left_mb_xy[0] = left_xy[0];
 170     h->left_mb_xy[1] = left_xy[1];
 171     if(for_deblock){
             /* Deblocking: top-left/top-right are not used; top/left count as
              * present when their slice_table entry is below 0xFFFF. */
 172         topleft_type = 0;
 173         topright_type = 0;
 174         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 175         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 176         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 177
 178         if(MB_MBAFF && !IS_INTRA(mb_type)){
 179             int list;
 180             for(list=0; list<h->list_count; list++){
 181                 //These values were changed for ease of performing MC, we need to change them back
 182                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 183                 //the MC code from changing ref_cache and rather use a temporary array.
 184                 if(USES_LIST(mb_type,list)){
                         /* Reload the four 8x8 reference indices from the picture
                          * and replicate each one over two adjacent cache bytes
                          * (mask to one byte per 16-bit half, then *0x0101). */
 185                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 186                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 188                     ref += h->b8_stride;
 189                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 190                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 191                 }
 192             }
 193         }
 194     }else{
             /* Decoding: a neighbour is only used when it belongs to the current
              * slice; otherwise its type is treated as 0. */
 195         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 196         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 197         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 198         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 199         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 200
 201     if(IS_INTRA(mb_type)){
             /* With constrained_intra_pred only the intra bits of a neighbour's
              * type are considered, so non-intra neighbours test as unavailable. */
 202         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
             /* Availability masks start at their "everything present" value and
              * get bits cleared for every missing neighbour. */
 203         h->topleft_samples_available=
 204         h->top_samples_available=
 205         h->left_samples_available= 0xFFFF;
 206         h->topright_samples_available= 0xEEEA;
 207
 208         if(!(top_type & type_mask)){
 209             h->topleft_samples_available= 0xB3FF;
 210             h->top_samples_available= 0x33FF;
 211             h->topright_samples_available= 0x26EA;
 212         }
 213         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 214             if(IS_INTERLACED(mb_type)){
                     /* the two left macroblocks are checked independently */
 215                 if(!(left_type[0] & type_mask)){
 216                     h->topleft_samples_available&= 0xDFFF;
 217                     h->left_samples_available&= 0x5FFF;
 218                 }
 219                 if(!(left_type[1] & type_mask)){
 220                     h->topleft_samples_available&= 0xFF5F;
 221                     h->left_samples_available&= 0xFF5F;
 222                 }
 223             }else{
                     /* both left_xy[0] and the macroblock one row below it must
                      * pass the mask for the left samples to stay available */
 224                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 225                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 226                 assert(left_xy[0] == left_xy[1]);
 227                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 228                     h->topleft_samples_available&= 0xDF5F;
 229                     h->left_samples_available&= 0x5F5F;
 230                 }
 231             }
 232         }else{
 233             if(!(left_type[0] & type_mask)){
 234                 h->topleft_samples_available&= 0xDF5F;
 235                 h->left_samples_available&= 0x5F5F;
 236             }
 237         }
 238
 239         if(!(topleft_type & type_mask))
 240             h->topleft_samples_available&= 0x7FFF;
 241
 242         if(!(topright_type & type_mask))
 243             h->topright_samples_available&= 0xFBFF;
 244
 245         if(IS_INTRA4x4(mb_type)){
                 /* Row above the 4x4 grid: modes of the top macroblock if it is
                  * intra4x4; otherwise -1 when it fails the mask, else 2
                  * (presumably DC_PRED -- TODO confirm). */
 246             if(IS_INTRA4x4(top_type)){
 247                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 248                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 249                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 250                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 251             }else{
 252                 int pred;
 253                 if(!(top_type & type_mask))
 254                     pred= -1;
 255                 else{
 256                     pred= 2;
 257                 }
 258                 h->intra4x4_pred_mode_cache[4+8*0]=
 259                 h->intra4x4_pred_mode_cache[5+8*0]=
 260                 h->intra4x4_pred_mode_cache[6+8*0]=
 261                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 262             }
                 /* Column left of the grid: two rows per left macroblock, same
                  * fallback rule as for the top row. */
 263             for(i=0; i<2; i++){
 264                 if(IS_INTRA4x4(left_type[i])){
 265                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 266                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 267                 }else{
 268                     int pred;
 269                     if(!(left_type[i] & type_mask))
 270                         pred= -1;
 271                     else{
 272                         pred= 2;
 273                     }
 274                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 275                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 276                 }
 277             }
 278         }
 279     }
 280     }
 281
 282
 283 /*
 284 0 . T T. T T T T
 285 1 L . .L . . . .
 286 2 L . .L . . . .
 287 3 . T TL . . . .
 288 4 L . .L . . . .
 289 5 L . .. . . . .
 290 */
 291 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         /* Non-zero counts of the top neighbour (the "T" cells above). When there
          * is no top neighbour they are set to 0 for a non-intra macroblock with
          * CABAC, and to 64 otherwise. */
 292     if(top_type){
 293         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 294         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 295         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 296         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 297
 298         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 299         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 300
 301         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 302         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 303
 304     }else{
 305         h->non_zero_count_cache[4+8*0]=
 306         h->non_zero_count_cache[5+8*0]=
 307         h->non_zero_count_cache[6+8*0]=
 308         h->non_zero_count_cache[7+8*0]=
 309
 310         h->non_zero_count_cache[1+8*0]=
 311         h->non_zero_count_cache[2+8*0]=
 312
 313         h->non_zero_count_cache[1+8*3]=
 314         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 315
 316     }
 317
         /* Non-zero counts of the left neighbours (the "L" cells), using the
          * block indices selected in left_block; same fallback values as above. */
 318     for (i=0; i<2; i++) {
 319         if(left_type[i]){
 320             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 321             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 322             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 323             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 324         }else{
 325             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 326             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 327             h->non_zero_count_cache[0+8*1 +   8*i]=
 328             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 329         }
 330     }
 331
         /* CABAC only: coded block patterns of the neighbours. A missing
          * neighbour yields 0x1C0 for an intra macroblock and 0 otherwise. */
 332     if( h->pps.cabac ) {
 333         // top_cbp
 334         if(top_type) {
 335             h->top_cbp = h->cbp_table[top_xy];
 336         } else if(IS_INTRA(mb_type)) {
 337             h->top_cbp = 0x1C0;
 338         } else {
 339             h->top_cbp = 0;
 340         }
 341         // left_cbp
 342         if (left_type[0]) {
 343             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 344         } else if(IS_INTRA(mb_type)) {
 345             h->left_cbp = 0x1C0;
 346         } else {
 347             h->left_cbp = 0;
 348         }
             /* Bits 1 and 3 of left_cbp are taken from one cbp bit of each left
              * macroblock, chosen via left_block[0] / left_block[2]. */
 349         if (left_type[0]) {
 350             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 351         }
 352         if (left_type[1]) {
 353             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 354         }
 355     }
 356
 357 #if 1
         /* Motion vector / reference index caches, per reference list. */
 358     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 359         int list;
 360         for(list=0; list<h->list_count; list++){
                 /* Nothing to load for a list the macroblock does not use, unless
                  * it is direct or the deblocking filter is enabled. */
 361             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 362                 /*if(!h->mv_cache_clean[list]){
 363                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 364                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 365                     h->mv_cache_clean[list]= 1;
 366                 }*/
 367                 continue;
 368             }
 369             h->mv_cache_clean[list]= 0;
 370
                 /* Top row: row 3 of the top macroblock's 4x4 motion vectors and
                  * row 1 of its 8x8 reference indices; otherwise zero vectors and
                  * LIST_NOT_USED (neighbour exists) or PART_NOT_AVAILABLE. */
 371             if(USES_LIST(top_type, list)){
 372                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 373                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 378                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 379                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 380                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 381                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 382             }else{
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 387                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 388             }
 389
                 /* Left column: two cache rows per left macroblock, taken from
                  * the rightmost column (+3 / +1) at the rows given by left_block. */
 390             for(i=0; i<2; i++){
 391                 int cache_idx = scan8[0] - 1 + i*2*8;
 392                 if(USES_LIST(left_type[i], list)){
 393                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 394                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 395                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 396                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 397                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 398                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 399                 }else{
 400                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 401                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 402                     h->ref_cache[list][cache_idx  ]=
 403                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 404                 }
 405             }
 406
                 /* Everything below is skipped for deblocking, and outside MBAFF
                  * for direct macroblocks when direct_spatial_mv_pred is not set. */
 407             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 408                 continue;
 409
                 /* Top-left: topleft_partition is -1 (all bits set) or 0, so the
                  * AND either keeps or drops the extra two-row / one-row offset. */
 410             if(USES_LIST(topleft_type, list)){
 411                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 412                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 413                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 414                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 415             }else{
 416                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 417                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 418             }
 419
                 /* Top-right: first block of the neighbour's last 4x4 row. */
 420             if(USES_LIST(topright_type, list)){
 421                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 422                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 424                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 425             }else{
 426                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 427                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 428             }
 429
                 /* Skip and direct macroblocks need nothing further outside MBAFF. */
 430             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 431                 continue;
 432
                 /* Mark these cache positions as not available and zero their
                  * motion vectors. */
 433             h->ref_cache[list][scan8[5 ]+1] =
 434             h->ref_cache[list][scan8[7 ]+1] =
 435             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 436             h->ref_cache[list][scan8[4 ]] =
 437             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 438             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 439             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 441             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 442             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 443
 444             if( h->pps.cabac ) {
 445                 /* XXX beurk, Load mvd */
                     /* Same top / left layout as for mv_cache, reading from
                      * h->mvd_table; neighbours not using the list give zero. */
 446                 if(USES_LIST(top_type, list)){
 447                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 452                 }else{
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 457                 }
 458                 if(USES_LIST(left_type[0], list)){
 459                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 460                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 462                 }else{
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 465                 }
 466                 if(USES_LIST(left_type[1], list)){
 467                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 470                 }else{
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 473                 }
 474                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 475                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 477                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 478                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 479
 480                 if(h->slice_type_nos == FF_B_TYPE){
                         /* B slices: clear the 4x4 area of direct_cache, then load
                          * the direct flags of the top and left neighbours (1 for
                          * a direct macroblock, direct_table for 8x8 ones, else 0). */
 481                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 482
 483                     if(IS_DIRECT(top_type)){
 484                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 485                     }else if(IS_8X8(top_type)){
 486                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 487                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 488                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 489                     }else{
 490                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 491                     }
 492
 493                     if(IS_DIRECT(left_type[0]))
 494                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 495                     else if(IS_8X8(left_type[0]))
 496                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 497                     else
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 499
 500                     if(IS_DIRECT(left_type[1]))
 501                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 502                     else if(IS_8X8(left_type[1]))
 503                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 506                 }
 507             }
 508
                 /* MBAFF: convert cached neighbour data between frame and field
                  * representation. MAP_MVS applies MAP_F2F to every neighbour
                  * cache position; MAP_F2F is defined differently for a field
                  * current macroblock (frame neighbours: ref doubled, vertical
                  * mv/mvd halved) and a frame one (field neighbours: ref halved,
                  * vertical mv/mvd doubled). */
 509             if(FRAME_MBAFF){
 510 #define MAP_MVS\
 511                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 512                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 517                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 518                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 520                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 521                 if(MB_FIELD){
 522 #define MAP_F2F(idx, mb_type)\
 523                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 524                         h->ref_cache[list][idx] <<= 1;\
 525                         h->mv_cache[list][idx][1] /= 2;\
 526                         h->mvd_cache[list][idx][1] /= 2;\
 527                     }
 528                     MAP_MVS
 529 #undef MAP_F2F
 530                 }else{
 531 #define MAP_F2F(idx, mb_type)\
 532                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 533                         h->ref_cache[list][idx] >>= 1;\
 534                         h->mv_cache[list][idx][1] <<= 1;\
 535                         h->mvd_cache[list][idx][1] <<= 1;\
 536                     }
 537                     MAP_MVS
 538 #undef MAP_F2F
 539                 }
 540             }
 541         }
 542     }
 543 #endif
 544
         /* Number (0..2) of the top and first left neighbours using the 8x8
          * transform. */
 545     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 546 }
 547
 548 static inline void write_back_intra_pred_mode(H264Context *h){
 549     const int mb_xy= h->mb_xy;
 550
 551     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 552     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 553     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 554     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 555     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 556     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 557     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 558 }
 559
 560 /**
 561  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 562  */
 563 static inline int check_intra4x4_pred_mode(H264Context *h){
 564     MpegEncContext * const s = &h->s;
 565     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 566     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 567     int i;
 568
 569     if(!(h->top_samples_available&0x8000)){
 570         for(i=0; i<4; i++){
 571             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 572             if(status<0){
 573                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 574                 return -1;
 575             } else if(status){
 576                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 577             }
 578         }
 579     }
 580
 581     if((h->left_samples_available&0x8888)!=0x8888){
 582         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 583         for(i=0; i<4; i++){
 584             if(!(h->left_samples_available&mask[i])){
 585                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 586                 if(status<0){
 587                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 588                     return -1;
 589                 } else if(status){
 590                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 591                 }
 592             }
 593         }
 594     }
 595
 596     return 0;
 597 } //FIXME cleanup like next
 598
 599 /**
 600  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 601  */
 602 static inline int check_intra_pred_mode(H264Context *h, int mode){
 603     MpegEncContext * const s = &h->s;
 604     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 605     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 606
 607     if(mode > 6U) {
 608         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 609         return -1;
 610     }
 611
 612     if(!(h->top_samples_available&0x8000)){
 613         mode= top[ mode ];
 614         if(mode<0){
 615             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 616             return -1;
 617         }
 618     }
 619
 620     if((h->left_samples_available&0x8080) != 0x8080){
 621         mode= left[ mode ];
 622         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 623             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 624         }
 625         if(mode<0){
 626             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 627             return -1;
 628         }
 629     }
 630
 631     return mode;
 632 }
 633
 634 /**
 635  * gets the predicted intra4x4 prediction mode.
 636  */
 637 static inline int pred_intra_mode(H264Context *h, int n){
 638     const int index8= scan8[n];
 639     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 640     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 641     const int min= FFMIN(left, top);
 642
 643     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 644
 645     if(min<0) return DC_PRED;
 646     else      return min;
 647 }
 648
 649 static inline void write_back_non_zero_count(H264Context *h){
 650     const int mb_xy= h->mb_xy;
 651
 652     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 653     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 654     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 655     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 656     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 657     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 658     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 659
 660     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 661     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 662     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 663
 664     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 665     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 666     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 667 }
 668
 669 /**
 670  * gets the predicted number of non-zero coefficients.
 671  * @param n block index
 672  */
 673 static inline int pred_non_zero_count(H264Context *h, int n){
 674     const int index8= scan8[n];
 675     const int left= h->non_zero_count_cache[index8 - 1];
 676     const int top = h->non_zero_count_cache[index8 - 8];
 677     int i= left + top;
 678
 679     if(i<64) i= (i+1)>>1;
 680
 681     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 682
 683     return i&31;
 684 }
 685
 686 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 687     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 688     MpegEncContext *s = &h->s;
 689
 690     /* there is no consistent mapping of mvs to neighboring locations that will
 691      * make mbaff happy, so we can't move all this logic to fill_caches */
 692     if(FRAME_MBAFF){
 693         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 694         const int16_t *mv;
 695         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 696         *C = h->mv_cache[list][scan8[0]-2];
 697
 698         if(!MB_FIELD
 699            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 700             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 701             if(IS_INTERLACED(mb_types[topright_xy])){
 702 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 703                 const int x4 = X4, y4 = Y4;\
 704                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 705                 if(!USES_LIST(mb_type,list))\
 706                     return LIST_NOT_USED;\
 707                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 708                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 709                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 710                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 711
 712                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 713             }
 714         }
 715         if(topright_ref == PART_NOT_AVAILABLE
 716            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 717            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 718             if(!MB_FIELD
 719                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 720                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 721             }
 722             if(MB_FIELD
 723                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 724                && i >= scan8[0]+8){
 725                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 726                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 727             }
 728         }
 729 #undef SET_DIAG_MV
 730     }
 731
 732     if(topright_ref != PART_NOT_AVAILABLE){
 733         *C= h->mv_cache[list][ i - 8 + part_width ];
 734         return topright_ref;
 735     }else{
 736         tprintf(s->avctx, "topright MV not available\n");
 737
 738         *C= h->mv_cache[list][ i - 8 - 1 ];
 739         return h->ref_cache[list][ i - 8 - 1 ];
 740     }
 741 }
 742
 743 /**
 744  * gets the predicted MV.
 745  * @param n the block index
 746  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 747  * @param mx the x component of the predicted motion vector
 748  * @param my the y component of the predicted motion vector
 749  */
 750 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 751     const int index8= scan8[n];
 752     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 753     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 754     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 755     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 756     const int16_t * C;
 757     int diagonal_ref, match_count;
 758
 759     assert(part_width==1 || part_width==2 || part_width==4);
 760
 761 /* mv_cache
 762   B . . A T T T T
 763   U . . L . . , .
 764   U . . L . . . .
 765   U . . L . . , .
 766   . . . L . . . .
 767 */
 768
 769     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 770     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 771     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 772     if(match_count > 1){ //most common
 773         *mx= mid_pred(A[0], B[0], C[0]);
 774         *my= mid_pred(A[1], B[1], C[1]);
 775     }else if(match_count==1){
 776         if(left_ref==ref){
 777             *mx= A[0];
 778             *my= A[1];
 779         }else if(top_ref==ref){
 780             *mx= B[0];
 781             *my= B[1];
 782         }else{
 783             *mx= C[0];
 784             *my= C[1];
 785         }
 786     }else{
 787         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 788             *mx= A[0];
 789             *my= A[1];
 790         }else{
 791             *mx= mid_pred(A[0], B[0], C[0]);
 792             *my= mid_pred(A[1], B[1], C[1]);
 793         }
 794     }
 795
 796     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 797 }
 798
 799 /**
 800  * gets the directionally predicted 16x8 MV.
 801  * @param n the block index
 802  * @param mx the x component of the predicted motion vector
 803  * @param my the y component of the predicted motion vector
 804  */
 805 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 806     if(n==0){
 807         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 808         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 809
 810         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 811
 812         if(top_ref == ref){
 813             *mx= B[0];
 814             *my= B[1];
 815             return;
 816         }
 817     }else{
 818         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 819         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 820
 821         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 822
 823         if(left_ref == ref){
 824             *mx= A[0];
 825             *my= A[1];
 826             return;
 827         }
 828     }
 829
 830     //RARE
 831     pred_motion(h, n, 4, list, ref, mx, my);
 832 }
 833
 834 /**
 835  * gets the directionally predicted 8x16 MV.
 836  * @param n the block index
 837  * @param mx the x component of the predicted motion vector
 838  * @param my the y component of the predicted motion vector
 839  */
 840 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 841     if(n==0){
 842         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 843         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 844
 845         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 846
 847         if(left_ref == ref){
 848             *mx= A[0];
 849             *my= A[1];
 850             return;
 851         }
 852     }else{
 853         const int16_t * C;
 854         int diagonal_ref;
 855
 856         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 857
 858         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 859
 860         if(diagonal_ref == ref){
 861             *mx= C[0];
 862             *my= C[1];
 863             return;
 864         }
 865     }
 866
 867     //RARE
 868     pred_motion(h, n, 2, list, ref, mx, my);
 869 }
 870
 871 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 872     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 873     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 874
 875     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 876
 877     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 878        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 879        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 880
 881         *mx = *my = 0;
 882         return;
 883     }
 884
 885     pred_motion(h, 0, 4, 0, 0, mx, my);
 886
 887     return;
 888 }
 889
 890 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 891     int poc0 = h->ref_list[0][i].poc;
 892     int td = av_clip(poc1 - poc0, -128, 127);
 893     if(td == 0 || h->ref_list[0][i].long_ref){
 894         return 256;
 895     }else{
 896         int tb = av_clip(poc - poc0, -128, 127);
 897         int tx = (16384 + (FFABS(td) >> 1)) / td;
 898         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 899     }
 900 }
 901
 902 static inline void direct_dist_scale_factor(H264Context * const h){
 903     MpegEncContext * const s = &h->s;
 904     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 905     const int poc1 = h->ref_list[1][0].poc;
 906     int i, field;
 907     for(field=0; field<2; field++){
 908         const int poc  = h->s.current_picture_ptr->field_poc[field];
 909         const int poc1 = h->ref_list[1][0].field_poc[field];
 910         for(i=0; i < 2*h->ref_count[0]; i++)
 911             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 912     }
 913
 914     for(i=0; i<h->ref_count[0]; i++){
 915         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 916     }
 917 }
 918
 919 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 920     MpegEncContext * const s = &h->s;
 921     Picture * const ref1 = &h->ref_list[1][0];
 922     int j, old_ref, rfield;
 923     int start= mbafi ? 16                      : 0;
 924     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 925     int interl= mbafi || s->picture_structure != PICT_FRAME;
 926
 927     /* bogus; fills in for missing frames */
 928     memset(map[list], 0, sizeof(map[list]));
 929
 930     for(rfield=0; rfield<2; rfield++){
 931         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 932             int poc = ref1->ref_poc[colfield][list][old_ref];
 933
 934             if     (!interl)
 935                 poc |= 3;
 936             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 937                 poc= (poc&~3) + rfield + 1;
 938
 939             for(j=start; j<end; j++){
 940                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 941                     int cur_ref= mbafi ? (j-16)^field : j;
 942                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 943                     if(rfield == field)
 944                         map[list][old_ref] = cur_ref;
 945                     break;
 946                 }
 947             }
 948         }
 949     }
 950 }
 951
 952 static inline void direct_ref_list_init(H264Context * const h){
 953     MpegEncContext * const s = &h->s;
 954     Picture * const ref1 = &h->ref_list[1][0];
 955     Picture * const cur = s->current_picture_ptr;
 956     int list, j, field;
 957     int sidx= (s->picture_structure&1)^1;
 958     int ref1sidx= (ref1->reference&1)^1;
 959
 960     for(list=0; list<2; list++){
 961         cur->ref_count[sidx][list] = h->ref_count[list];
 962         for(j=0; j<h->ref_count[list]; j++)
 963             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 964     }
 965
 966     if(s->picture_structure == PICT_FRAME){
 967         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 968         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 969     }
 970
 971     cur->mbaff= FRAME_MBAFF;
 972
 973     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 974         return;
 975
 976     for(list=0; list<2; list++){
 977         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 978         for(field=0; field<2; field++)
 979             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 980     }
 981 }
 982
 983 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 984     MpegEncContext * const s = &h->s;
 985     int b8_stride = h->b8_stride;
 986     int b4_stride = h->b_stride;
 987     int mb_xy = h->mb_xy;
 988     int mb_type_col[2];
 989     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 990     const int8_t *l1ref0, *l1ref1;
 991     const int is_b8x8 = IS_8X8(*mb_type);
 992     unsigned int sub_mb_type;
 993     int i8, i4;
 994
 995 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 996
 997     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 998         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
 999             int cur_poc = s->current_picture_ptr->poc;
1000             int *col_poc = h->ref_list[1]->field_poc;
1001             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1002             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1003             b8_stride = 0;
1004         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1005             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1006             mb_xy += s->mb_stride*fieldoff;
1007         }
1008         goto single_col;
1009     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1010         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1011             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1012             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1013             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1014             b8_stride *= 3;
1015             b4_stride *= 6;
1016             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1017             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1018                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1019                 && !is_b8x8){
1020                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1021                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1022             }else{
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1025             }
1026         }else{                                           //     AFR/FR    -> AFR/FR
1027 single_col:
1028             mb_type_col[0] =
1029             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1030             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1031                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1032                 * so we know exactly what block size to use */
1033                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1034                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1035             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1036                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1037                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1038             }else{
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1041             }
1042         }
1043     }
1044
1045     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1046     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1047     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1048     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1049     if(!b8_stride){
1050         if(s->mb_y&1){
1051             l1ref0 += h->b8_stride;
1052             l1ref1 += h->b8_stride;
1053             l1mv0  +=  2*b4_stride;
1054             l1mv1  +=  2*b4_stride;
1055         }
1056     }
1057
1058     if(h->direct_spatial_mv_pred){
1059         int ref[2];
1060         int mv[2][2];
1061         int list;
1062
1063         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1064
1065         /* ref = min(neighbors) */
1066         for(list=0; list<2; list++){
1067             int refa = h->ref_cache[list][scan8[0] - 1];
1068             int refb = h->ref_cache[list][scan8[0] - 8];
1069             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1070             if(refc == PART_NOT_AVAILABLE)
1071                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1072             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1073             if(ref[list] < 0)
1074                 ref[list] = -1;
1075         }
1076
1077         if(ref[0] < 0 && ref[1] < 0){
1078             ref[0] = ref[1] = 0;
1079             mv[0][0] = mv[0][1] =
1080             mv[1][0] = mv[1][1] = 0;
1081         }else{
1082             for(list=0; list<2; list++){
1083                 if(ref[list] >= 0)
1084                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1085                 else
1086                     mv[list][0] = mv[list][1] = 0;
1087             }
1088         }
1089
1090         if(ref[1] < 0){
1091             if(!is_b8x8)
1092                 *mb_type &= ~MB_TYPE_L1;
1093             sub_mb_type &= ~MB_TYPE_L1;
1094         }else if(ref[0] < 0){
1095             if(!is_b8x8)
1096                 *mb_type &= ~MB_TYPE_L0;
1097             sub_mb_type &= ~MB_TYPE_L0;
1098         }
1099
1100         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1101             for(i8=0; i8<4; i8++){
1102                 int x8 = i8&1;
1103                 int y8 = i8>>1;
1104                 int xy8 = x8+y8*b8_stride;
1105                 int xy4 = 3*x8+y8*b4_stride;
1106                 int a=0, b=0;
1107
1108                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1109                     continue;
1110                 h->sub_mb_type[i8] = sub_mb_type;
1111
1112                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1113                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1114                 if(!IS_INTRA(mb_type_col[y8])
1115                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1116                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1117                     if(ref[0] > 0)
1118                         a= pack16to32(mv[0][0],mv[0][1]);
1119                     if(ref[1] > 0)
1120                         b= pack16to32(mv[1][0],mv[1][1]);
1121                 }else{
1122                     a= pack16to32(mv[0][0],mv[0][1]);
1123                     b= pack16to32(mv[1][0],mv[1][1]);
1124                 }
1125                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1126                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1127             }
1128         }else if(IS_16X16(*mb_type)){
1129             int a=0, b=0;
1130
1131             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1132             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1133             if(!IS_INTRA(mb_type_col[0])
1134                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1135                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1136                        && (h->x264_build>33 || !h->x264_build)))){
1137                 if(ref[0] > 0)
1138                     a= pack16to32(mv[0][0],mv[0][1]);
1139                 if(ref[1] > 0)
1140                     b= pack16to32(mv[1][0],mv[1][1]);
1141             }else{
1142                 a= pack16to32(mv[0][0],mv[0][1]);
1143                 b= pack16to32(mv[1][0],mv[1][1]);
1144             }
1145             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1146             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1147         }else{
1148             for(i8=0; i8<4; i8++){
1149                 const int x8 = i8&1;
1150                 const int y8 = i8>>1;
1151
1152                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1153                     continue;
1154                 h->sub_mb_type[i8] = sub_mb_type;
1155
1156                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1157                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1158                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1159                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1160
1161                 /* col_zero_flag */
1162                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1163                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1164                                                   && (h->x264_build>33 || !h->x264_build)))){
1165                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1166                     if(IS_SUB_8X8(sub_mb_type)){
1167                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1168                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1169                             if(ref[0] == 0)
1170                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1171                             if(ref[1] == 0)
1172                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1173                         }
1174                     }else
1175                     for(i4=0; i4<4; i4++){
1176                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1177                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1178                             if(ref[0] == 0)
1179                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1180                             if(ref[1] == 0)
1181                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1182                         }
1183                     }
1184                 }
1185             }
1186         }
1187     }else{ /* direct temporal mv pred */
1188         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1189         const int *dist_scale_factor = h->dist_scale_factor;
1190         int ref_offset= 0;
1191
1192         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1193             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1194             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1195             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1196         }
1197         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1198             ref_offset += 16;
1199
1200         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1201             /* FIXME assumes direct_8x8_inference == 1 */
1202             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1203
1204             for(i8=0; i8<4; i8++){
1205                 const int x8 = i8&1;
1206                 const int y8 = i8>>1;
1207                 int ref0, scale;
1208                 const int16_t (*l1mv)[2]= l1mv0;
1209
1210                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1211                     continue;
1212                 h->sub_mb_type[i8] = sub_mb_type;
1213
1214                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1215                 if(IS_INTRA(mb_type_col[y8])){
1216                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1217                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1218                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     continue;
1220                 }
1221
1222                 ref0 = l1ref0[x8 + y8*b8_stride];
1223                 if(ref0 >= 0)
1224                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1225                 else{
1226                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1227                     l1mv= l1mv1;
1228                 }
1229                 scale = dist_scale_factor[ref0];
1230                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1231
1232                 {
1233                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1234                     int my_col = (mv_col[1]<<y_shift)/2;
1235                     int mx = (scale * mv_col[0] + 128) >> 8;
1236                     int my = (scale * my_col + 128) >> 8;
1237                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1238                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1239                 }
1240             }
1241             return;
1242         }
1243
1244         /* one-to-one mv scaling */
1245
1246         if(IS_16X16(*mb_type)){
1247             int ref, mv0, mv1;
1248
1249             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1250             if(IS_INTRA(mb_type_col[0])){
1251                 ref=mv0=mv1=0;
1252             }else{
1253                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1254                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1255                 const int scale = dist_scale_factor[ref0];
1256                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1257                 int mv_l0[2];
1258                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1259                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1260                 ref= ref0;
1261                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1262                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1263             }
1264             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1265             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1266             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1267         }else{
1268             for(i8=0; i8<4; i8++){
1269                 const int x8 = i8&1;
1270                 const int y8 = i8>>1;
1271                 int ref0, scale;
1272                 const int16_t (*l1mv)[2]= l1mv0;
1273
1274                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1275                     continue;
1276                 h->sub_mb_type[i8] = sub_mb_type;
1277                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1278                 if(IS_INTRA(mb_type_col[0])){
1279                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1280                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1281                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     continue;
1283                 }
1284
1285                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1286                 if(ref0 >= 0)
1287                     ref0 = map_col_to_list0[0][ref0];
1288                 else{
1289                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1290                     l1mv= l1mv1;
1291                 }
1292                 scale = dist_scale_factor[ref0];
1293
1294                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1295                 if(IS_SUB_8X8(sub_mb_type)){
1296                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1297                     int mx = (scale * mv_col[0] + 128) >> 8;
1298                     int my = (scale * mv_col[1] + 128) >> 8;
1299                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1300                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1301                 }else
1302                 for(i4=0; i4<4; i4++){
1303                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1304                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1305                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1306                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1307                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1308                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1309                 }
1310             }
1311         }
1312     }
1313 }
1314
1315 static inline void write_back_motion(H264Context *h, int mb_type){
1316     MpegEncContext * const s = &h->s;
1317     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1318     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1319     int list;
1320
1321     if(!USES_LIST(mb_type, 0))
1322         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1323
1324     for(list=0; list<h->list_count; list++){
1325         int y;
1326         if(!USES_LIST(mb_type, list))
1327             continue;
1328
1329         for(y=0; y<4; y++){
1330             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1332         }
1333         if( h->pps.cabac ) {
1334             if(IS_SKIP(mb_type))
1335                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1336             else
1337             for(y=0; y<4; y++){
1338                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1340             }
1341         }
1342
1343         {
1344             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1345             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1346             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1347             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1348             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1349         }
1350     }
1351
1352     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1353         if(IS_8X8(mb_type)){
1354             uint8_t *direct_table = &h->direct_table[b8_xy];
1355             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1356             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1357             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1358         }
1359     }
1360 }
1361
1362 /**
1363  * Decodes a network abstraction layer unit.
1364  * @param consumed is the number of bytes used as input
1365  * @param length is the length of the array
1366  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1367  * @returns decoded bytes, might be src+1 if no escapes
1368  */
1369 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1370     int i, si, di;
1371     uint8_t *dst;
1372     int bufidx;
1373
1374 //    src[0]&0x80;                //forbidden bit
1375     h->nal_ref_idc= src[0]>>5;
1376     h->nal_unit_type= src[0]&0x1F;
1377
1378     src++; length--;
1379 #if 0
1380     for(i=0; i<length; i++)
1381         printf("%2X ", src[i]);
1382 #endif
1383
1384 #if HAVE_FAST_UNALIGNED
1385 # if HAVE_FAST_64BIT
1386 #   define RS 7
1387     for(i=0; i+1<length; i+=9){
1388         if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1389 # else
1390 #   define RS 3
1391     for(i=0; i+1<length; i+=5){
1392         if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1393 # endif
1394             continue;
1395         if(i>0 && !src[i]) i--;
1396         while(src[i]) i++;
1397 #else
1398 #   define RS 0
1399     for(i=0; i+1<length; i+=2){
1400         if(src[i]) continue;
1401         if(i>0 && src[i-1]==0) i--;
1402 #endif
1403         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1404             if(src[i+2]!=3){
1405                 /* startcode, so we must be past the end */
1406                 length=i;
1407             }
1408             break;
1409         }
1410         i-= RS;
1411     }
1412
1413     if(i>=length-1){ //no escaped 0
1414         *dst_length= length;
1415         *consumed= length+1; //+1 for the header
1416         return src;
1417     }
1418
1419     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1420     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1421     dst= h->rbsp_buffer[bufidx];
1422
1423     if (dst == NULL){
1424         return NULL;
1425     }
1426
1427 //printf("decoding esc\n");
1428     memcpy(dst, src, i);
1429     si=di=i;
1430     while(si+2<length){
1431         //remove escapes (very rare 1:2^22)
1432         if(src[si+2]>3){
1433             dst[di++]= src[si++];
1434             dst[di++]= src[si++];
1435         }else if(src[si]==0 && src[si+1]==0){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 goto nsc;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447     while(si<length)
1448         dst[di++]= src[si++];
1449 nsc:
1450
1451     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1452
1453     *dst_length= di;
1454     *consumed= si + 1;//+1 for the header
1455 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1456     return dst;
1457 }
1458
1459 /**
1460  * identifies the exact end of the bitstream
1461  * @return the length of the trailing, or 0 if damaged
1462  */
1463 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1464     int v= *src;
1465     int r;
1466
1467     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1468
1469     for(r=1; r<9; r++){
1470         if(v&1) return r;
1471         v>>=1;
1472     }
1473     return 0;
1474 }
1475
1476 /**
1477  * IDCT transforms the 16 dc values and dequantizes them.
1478  * @param qp quantization parameter
1479  */
1480 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1481 #define stride 16
1482     int i;
1483     int temp[16]; //FIXME check if this is a good idea
1484     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1485     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1486
1487 //memset(block, 64, 2*256);
1488 //return;
1489     for(i=0; i<4; i++){
1490         const int offset= y_offset[i];
1491         const int z0= block[offset+stride*0] + block[offset+stride*4];
1492         const int z1= block[offset+stride*0] - block[offset+stride*4];
1493         const int z2= block[offset+stride*1] - block[offset+stride*5];
1494         const int z3= block[offset+stride*1] + block[offset+stride*5];
1495
1496         temp[4*i+0]= z0+z3;
1497         temp[4*i+1]= z1+z2;
1498         temp[4*i+2]= z1-z2;
1499         temp[4*i+3]= z0-z3;
1500     }
1501
1502     for(i=0; i<4; i++){
1503         const int offset= x_offset[i];
1504         const int z0= temp[4*0+i] + temp[4*2+i];
1505         const int z1= temp[4*0+i] - temp[4*2+i];
1506         const int z2= temp[4*1+i] - temp[4*3+i];
1507         const int z3= temp[4*1+i] + temp[4*3+i];
1508
1509         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1510         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1511         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1512         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1513     }
1514 }
1515
#if 0
/**
 * DCT transforms the 16 dc values.
 *
 * Disabled code (compiled out by the surrounding #if 0). It runs the same
 * two butterfly passes as h264_luma_dc_dequant_idct_c over the same block
 * positions, but halves each result instead of dequantizing it. It depends
 * on the "stride" macro still being defined at this point.
 *
 * @param block coefficient array holding the dc values (modified in place)
 * @param qp quantization parameter ??? FIXME (currently commented out of
 *           the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: one group of four values per iteration, results in temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass across the groups; results are halved and stored back */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1555
1556 #undef xStride
1557 #undef stride
1558
1559 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1560     const int stride= 16*2;
1561     const int xStride= 16;
1562     int a,b,c,d,e;
1563
1564     a= block[stride*0 + xStride*0];
1565     b= block[stride*0 + xStride*1];
1566     c= block[stride*1 + xStride*0];
1567     d= block[stride*1 + xStride*1];
1568
1569     e= a-b;
1570     a= a+b;
1571     b= c-d;
1572     c= c+d;
1573
1574     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1575     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1576     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1577     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1578 }
1579
#if 0
/**
 * Transforms the 2x2 chroma dc values in place.
 *
 * Disabled code (compiled out by the surrounding #if 0). It applies the
 * same butterflies as chroma_dc_dequant_idct_c to block[0], block[16],
 * block[32] and block[48], but stores the results without any scaling.
 *
 * @param block coefficient array holding the dc values (modified in place)
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* horizontal sums/differences; a..c are reused as temporaries */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1602
/**
 * gets the chroma qp.
 * @param h      decoder context providing pps.chroma_qp_table
 * @param t      first index into the table (presumably selects the chroma
 *               plane's table - confirm against callers)
 * @param qscale second index into the table
 * @return the entry h->pps.chroma_qp_table[t][qscale]; no range checking
 *         is done on either index
 */
static inline int get_chroma_qp(H264Context *h, int t, int qscale){
    return h->pps.chroma_qp_table[t][qscale];
}
1609
/**
 * Motion compensates one partition from a single reference picture.
 *
 * The motion vector is taken from h->mv_cache[list][scan8[n]]. Luma is
 * predicted with qpix_op, then (unless gray-only decoding is active) both
 * chroma planes with chroma_op. When the referenced area reaches outside
 * the picture, the source is first copied through ff_emulated_edge_mc into
 * s->edge_emu_buffer.
 *
 * @param pic           reference picture to predict from
 * @param n             block index, mapped through scan8 into mv_cache
 * @param square        if zero, qpix_op is applied a second time at +delta
 * @param chroma_height height passed to chroma_op
 * @param delta         byte offset of the second luma call (used if !square)
 * @param list          reference list whose motion vector is used
 * @param dest_y        luma destination
 * @param dest_cb       cb destination
 * @param dest_cr       cr destination
 * @param src_x_offset  horizontal position, scaled by 8 and added to the mv
 * @param src_y_offset  vertical position, scaled by 8 and added to the mv
 * @param qpix_op       table of luma functions, indexed by the fractional mv
 * @param chroma_op     chroma function
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    /* the two fractional bits of each component select the qpix_op entry */
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* a fractional position shrinks the margin usable without emulation */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    /* source area leaves the picture plus margin: build it in the scratch buffer */
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        /* non-square partition: second half lies delta bytes further */
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* luma-only decoding requested: skip chroma */
    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* edge_emu_buffer is reused for each plane, so every plane must be
     * consumed by chroma_op before the next one is emulated */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1667
1668 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1669                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1670                            int x_offset, int y_offset,
1671                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1672                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1673                            int list0, int list1){
1674     MpegEncContext * const s = &h->s;
1675     qpel_mc_func *qpix_op=  qpix_put;
1676     h264_chroma_mc_func chroma_op= chroma_put;
1677
1678     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1679     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1680     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1681     x_offset += 8*s->mb_x;
1682     y_offset += 8*(s->mb_y >> MB_FIELD);
1683
1684     if(list0){
1685         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689
1690         qpix_op=  qpix_avg;
1691         chroma_op= chroma_avg;
1692     }
1693
1694     if(list1){
1695         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1696         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1697                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1698                            qpix_op, chroma_op);
1699     }
1700 }
1701
1702 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1703                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1704                            int x_offset, int y_offset,
1705                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1706                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1707                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1708                            int list0, int list1){
1709     MpegEncContext * const s = &h->s;
1710
1711     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1712     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1713     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1714     x_offset += 8*s->mb_x;
1715     y_offset += 8*(s->mb_y >> MB_FIELD);
1716
1717     if(list0 && list1){
1718         /* don't optimize for luma-only case, since B-frames usually
1719          * use implicit weights => chroma too. */
1720         uint8_t *tmp_cb = s->obmc_scratchpad;
1721         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1722         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1723         int refn0 = h->ref_cache[0][ scan8[n] ];
1724         int refn1 = h->ref_cache[1][ scan8[n] ];
1725
1726         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1727                     dest_y, dest_cb, dest_cr,
1728                     x_offset, y_offset, qpix_put, chroma_put);
1729         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1730                     tmp_y, tmp_cb, tmp_cr,
1731                     x_offset, y_offset, qpix_put, chroma_put);
1732
1733         if(h->use_weight == 2){
1734             int weight0 = h->implicit_weight[refn0][refn1];
1735             int weight1 = 64 - weight0;
1736             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1737             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1739         }else{
1740             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1742                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1743             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1745                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1746             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1748                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1749         }
1750     }else{
1751         int list = list1 ? 1 : 0;
1752         int refn = h->ref_cache[list][ scan8[n] ];
1753         Picture *ref= &h->ref_list[list][refn];
1754         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1755                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1756                     qpix_put, chroma_put);
1757
1758         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1759                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1760         if(h->use_weight_chroma){
1761             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1762                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1763             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1764                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1765         }
1766     }
1767 }
1768
1769 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1770                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1771                            int x_offset, int y_offset,
1772                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1773                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1774                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1775                            int list0, int list1){
1776     if((h->use_weight==2 && list0 && list1
1777         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1778        || h->use_weight==1)
1779         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1780                          x_offset, y_offset, qpix_put, chroma_put,
1781                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1782     else
1783         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1784                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1785 }
1786
1787 static inline void prefetch_motion(H264Context *h, int list){
1788     /* fetch pixels for estimated mv 4 macroblocks ahead
1789      * optimized for 64byte cache lines */
1790     MpegEncContext * const s = &h->s;
1791     const int refn = h->ref_cache[list][scan8[0]];
1792     if(refn >= 0){
1793         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1794         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1795         uint8_t **src= h->ref_list[list][refn].data;
1796         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1797         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1798         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1799         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1800     }
1801 }
1802
/**
 * Performs inter prediction for the current macroblock.
 *
 * Looks up the macroblock type and calls mc_part once per partition
 * (16x16, 16x8, 8x16, or per sub-partition of each 8x8 block), selecting
 * the function table entries that match the partition size. Prefetches for
 * list 0 are issued before and for list 1 after the prediction.
 *
 * @param dest_y, dest_cb, dest_cr destination planes of the macroblock
 * @param qpix_put, qpix_avg       luma function tables, indexed by size
 * @param chroma_put, chroma_avg   chroma function tables, indexed by size
 * @param weight_op, weight_avg    weighting function tables, indexed by
 *                                 partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one square partition covering the whole macroblock */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions stacked vertically (blocks 0 and 8) */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions side by side (blocks 0 and 4) */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 blocks, each with its own sub-partitioning */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                /* upper and lower half of the 8x8 block */
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                /* left and right half of the 8x8 block */
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the 8x8 block */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1889
1890 static av_cold void init_cavlc_level_tab(void){
1891     int suffix_length, mask;
1892     unsigned int i;
1893
1894     for(suffix_length=0; suffix_length<7; suffix_length++){
1895         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1896             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1897             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1898
1899             mask= -(level_code&1);
1900             level_code= (((2+level_code)>>1) ^ mask) - mask;
1901             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1902                 cavlc_level_tab[suffix_length][i][0]= level_code;
1903                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1904             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1905                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1906                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1907             }else{
1908                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1909                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1910             }
1911         }
1912     }
1913 }
1914
1915 static av_cold void decode_init_vlc(void){
1916     static int done = 0;
1917
1918     if (!done) {
1919         int i;
1920         int offset;
1921         done = 1;
1922
1923         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1924         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1925         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1926                  &chroma_dc_coeff_token_len [0], 1, 1,
1927                  &chroma_dc_coeff_token_bits[0], 1, 1,
1928                  INIT_VLC_USE_NEW_STATIC);
1929
1930         offset = 0;
1931         for(i=0; i<4; i++){
1932             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1933             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1934             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1935                      &coeff_token_len [i][0], 1, 1,
1936                      &coeff_token_bits[i][0], 1, 1,
1937                      INIT_VLC_USE_NEW_STATIC);
1938             offset += coeff_token_vlc_tables_size[i];
1939         }
1940         /*
1941          * This is a one time safety check to make sure that
1942          * the packed static coeff_token_vlc table sizes
1943          * were initialized correctly.
1944          */
1945         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1946
1947         for(i=0; i<3; i++){
1948             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1949             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1950             init_vlc(&chroma_dc_total_zeros_vlc[i],
1951                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1952                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1953                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1954                      INIT_VLC_USE_NEW_STATIC);
1955         }
1956         for(i=0; i<15; i++){
1957             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1958             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1959             init_vlc(&total_zeros_vlc[i],
1960                      TOTAL_ZEROS_VLC_BITS, 16,
1961                      &total_zeros_len [i][0], 1, 1,
1962                      &total_zeros_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965
1966         for(i=0; i<6; i++){
1967             run_vlc[i].table = run_vlc_tables[i];
1968             run_vlc[i].table_allocated = run_vlc_tables_size;
1969             init_vlc(&run_vlc[i],
1970                      RUN_VLC_BITS, 7,
1971                      &run_len [i][0], 1, 1,
1972                      &run_bits[i][0], 1, 1,
1973                      INIT_VLC_USE_NEW_STATIC);
1974         }
1975         run7_vlc.table = run7_vlc_table,
1976         run7_vlc.table_allocated = run7_vlc_table_size;
1977         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1978                  &run_len [6][0], 1, 1,
1979                  &run_bits[6][0], 1, 1,
1980                  INIT_VLC_USE_NEW_STATIC);
1981
1982         init_cavlc_level_tab();
1983     }
1984 }
1985
1986 static void free_tables(H264Context *h){
1987     int i;
1988     H264Context *hx;
1989     av_freep(&h->intra4x4_pred_mode);
1990     av_freep(&h->chroma_pred_mode_table);
1991     av_freep(&h->cbp_table);
1992     av_freep(&h->mvd_table[0]);
1993     av_freep(&h->mvd_table[1]);
1994     av_freep(&h->direct_table);
1995     av_freep(&h->non_zero_count);
1996     av_freep(&h->slice_table_base);
1997     h->slice_table= NULL;
1998
1999     av_freep(&h->mb2b_xy);
2000     av_freep(&h->mb2b8_xy);
2001
2002     for(i = 0; i < h->s.avctx->thread_count; i++) {
2003         hx = h->thread_context[i];
2004         if(!hx) continue;
2005         av_freep(&hx->top_borders[1]);
2006         av_freep(&hx->top_borders[0]);
2007         av_freep(&hx->s.obmc_scratchpad);
2008     }
2009 }
2010
2011 static void init_dequant8_coeff_table(H264Context *h){
2012     int i,q,x;
2013     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2014     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2015     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2016
2017     for(i=0; i<2; i++ ){
2018         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2019             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2020             break;
2021         }
2022
2023         for(q=0; q<52; q++){
2024             int shift = div6[q];
2025             int idx = rem6[q];
2026             for(x=0; x<64; x++)
2027                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2028                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2029                     h->pps.scaling_matrix8[i][x]) << shift;
2030         }
2031     }
2032 }
2033
2034 static void init_dequant4_coeff_table(H264Context *h){
2035     int i,j,q,x;
2036     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2037     for(i=0; i<6; i++ ){
2038         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2039         for(j=0; j<i; j++){
2040             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2041                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2042                 break;
2043             }
2044         }
2045         if(j<i)
2046             continue;
2047
2048         for(q=0; q<52; q++){
2049             int shift = div6[q] + 2;
2050             int idx = rem6[q];
2051             for(x=0; x<16; x++)
2052                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2053                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2054                     h->pps.scaling_matrix4[i][x]) << shift;
2055         }
2056     }
2057 }
2058
2059 static void init_dequant_tables(H264Context *h){
2060     int i,x;
2061     init_dequant4_coeff_table(h);
2062     if(h->pps.transform_8x8_mode)
2063         init_dequant8_coeff_table(h);
2064     if(h->sps.transform_bypass){
2065         for(i=0; i<6; i++)
2066             for(x=0; x<16; x++)
2067                 h->dequant4_coeff[i][0][x] = 1<<6;
2068         if(h->pps.transform_8x8_mode)
2069             for(i=0; i<2; i++)
2070                 for(x=0; x<64; x++)
2071                     h->dequant8_coeff[i][0][x] = 1<<6;
2072     }
2073 }
2074
2075
2076 /**
2077  * allocates tables.
2078  * needs width/height
2079  */
2080 static int alloc_tables(H264Context *h){
2081     MpegEncContext * const s = &h->s;
2082     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2083     int x,y;
2084
2085     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2086
2087     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2088     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2089     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2090
2091     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2092     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2093     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2094     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2095
2096     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2097     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2098
2099     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2100     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2101     for(y=0; y<s->mb_height; y++){
2102         for(x=0; x<s->mb_width; x++){
2103             const int mb_xy= x + y*s->mb_stride;
2104             const int b_xy = 4*x + 4*y*h->b_stride;
2105             const int b8_xy= 2*x + 2*y*h->b8_stride;
2106
2107             h->mb2b_xy [mb_xy]= b_xy;
2108             h->mb2b8_xy[mb_xy]= b8_xy;
2109         }
2110     }
2111
2112     s->obmc_scratchpad = NULL;
2113
2114     if(!h->dequant4_coeff[0])
2115         init_dequant_tables(h);
2116
2117     return 0;
2118 fail:
2119     free_tables(h);
2120     return -1;
2121 }
2122
2123 /**
2124  * Mimic alloc_tables(), but for every context thread.
2125  */
2126 static void clone_tables(H264Context *dst, H264Context *src){
2127     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2128     dst->non_zero_count           = src->non_zero_count;
2129     dst->slice_table              = src->slice_table;
2130     dst->cbp_table                = src->cbp_table;
2131     dst->mb2b_xy                  = src->mb2b_xy;
2132     dst->mb2b8_xy                 = src->mb2b8_xy;
2133     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2134     dst->mvd_table[0]             = src->mvd_table[0];
2135     dst->mvd_table[1]             = src->mvd_table[1];
2136     dst->direct_table             = src->direct_table;
2137
2138     dst->s.obmc_scratchpad = NULL;
2139     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2140 }
2141
2142 /**
2143  * Init context
2144  * Allocate buffers which are not shared amongst multiple threads.
2145  */
2146 static int context_init(H264Context *h){
2147     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2148     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2149
2150     return 0;
2151 fail:
2152     return -1; // free_tables will clean up for us
2153 }
2154
2155 static av_cold void common_init(H264Context *h){
2156     MpegEncContext * const s = &h->s;
2157
2158     s->width = s->avctx->width;
2159     s->height = s->avctx->height;
2160     s->codec_id= s->avctx->codec->id;
2161
2162     ff_h264_pred_init(&h->hpc, s->codec_id);
2163
2164     h->dequant_coeff_pps= -1;
2165     s->unrestricted_mv=1;
2166     s->decode=1; //FIXME
2167
2168     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2169
2170     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2171     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2172 }
2173
2174 static av_cold int decode_init(AVCodecContext *avctx){
2175     H264Context *h= avctx->priv_data;
2176     MpegEncContext * const s = &h->s;
2177
2178     MPV_decode_defaults(s);
2179
2180     s->avctx = avctx;
2181     common_init(h);
2182
2183     s->out_format = FMT_H264;
2184     s->workaround_bugs= avctx->workaround_bugs;
2185
2186     // set defaults
2187 //    s->decode_mb= ff_h263_decode_mb;
2188     s->quarter_sample = 1;
2189     s->low_delay= 1;
2190
2191     if(avctx->codec_id == CODEC_ID_SVQ3)
2192         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2193     else if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2194         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2195     else
2196         avctx->pix_fmt= PIX_FMT_YUV420P;
2197
2198     decode_init_vlc();
2199
2200     if(avctx->extradata_size > 0 && avctx->extradata &&
2201        *(char *)avctx->extradata == 1){
2202         h->is_avc = 1;
2203         h->got_avcC = 0;
2204     } else {
2205         h->is_avc = 0;
2206     }
2207
2208     h->thread_context[0] = h;
2209     h->outputed_poc = INT_MIN;
2210     h->prev_poc_msb= 1<<16;
2211     h->sei_recovery_frame_cnt = -1;
2212     h->sei_dpb_output_delay = 0;
2213     h->sei_cpb_removal_delay = -1;
2214     return 0;
2215 }
2216
2217 static int frame_start(H264Context *h){
2218     MpegEncContext * const s = &h->s;
2219     int i;
2220
2221     if(MPV_frame_start(s, s->avctx) < 0)
2222         return -1;
2223     ff_er_frame_start(s);
2224     /*
2225      * MPV_frame_start uses pict_type to derive key_frame.
2226      * This is incorrect for H.264; IDR markings must be used.
2227      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2228      * See decode_nal_units().
2229      */
2230     s->current_picture_ptr->key_frame= 0;
2231
2232     assert(s->linesize && s->uvlinesize);
2233
2234     for(i=0; i<16; i++){
2235         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2236         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2237     }
2238     for(i=0; i<4; i++){
2239         h->block_offset[16+i]=
2240         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2241         h->block_offset[24+16+i]=
2242         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243     }
2244
2245     /* can't be in alloc_tables because linesize isn't known there.
2246      * FIXME: redo bipred weight to not require extra buffer? */
2247     for(i = 0; i < s->avctx->thread_count; i++)
2248         if(!h->thread_context[i]->s.obmc_scratchpad)
2249             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2250
2251     /* some macroblocks will be accessed before they're available */
2252     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2253         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2254
2255 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2256
2257     // We mark the current picture as non-reference after allocating it, so
2258     // that if we break out due to an error it can be released automatically
2259     // in the next MPV_frame_start().
2260     // SVQ3 as well as most other codecs have only last/next/current and thus
2261     // get released even with set reference, besides SVQ3 and others do not
2262     // mark frames as reference later "naturally".
2263     if(s->codec_id != CODEC_ID_SVQ3)
2264         s->current_picture_ptr->reference= 0;
2265
2266     s->current_picture_ptr->field_poc[0]=
2267     s->current_picture_ptr->field_poc[1]= INT_MAX;
2268     assert(s->current_picture_ptr->long_ref==0);
2269
2270     return 0;
2271 }
2272
2273 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2274     MpegEncContext * const s = &h->s;
2275     int i;
2276     int step    = 1;
2277     int offset  = 1;
2278     int uvoffset= 1;
2279     int top_idx = 1;
2280     int skiplast= 0;
2281
2282     src_y  -=   linesize;
2283     src_cb -= uvlinesize;
2284     src_cr -= uvlinesize;
2285
2286     if(!simple && FRAME_MBAFF){
2287         if(s->mb_y&1){
2288             offset  = MB_MBAFF ? 1 : 17;
2289             uvoffset= MB_MBAFF ? 1 : 9;
2290             if(!MB_MBAFF){
2291                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2292                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2293                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2294                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2295                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2296                 }
2297             }
2298         }else{
2299             if(!MB_MBAFF){
2300                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2301                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2302                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2303                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2304                 }
2305                 skiplast= 1;
2306             }
2307             offset  =
2308             uvoffset=
2309             top_idx = MB_MBAFF ? 0 : 1;
2310         }
2311         step= MB_MBAFF ? 2 : 1;
2312     }
2313
2314     // There are two lines saved, the line above the the top macroblock of a pair,
2315     // and the line above the bottom macroblock
2316     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2317     for(i=1; i<17 - skiplast; i++){
2318         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2319     }
2320
2321     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2322     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2323
2324     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2325         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2326         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2327         for(i=1; i<9 - skiplast; i++){
2328             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2329             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2330         }
2331         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2332         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2333     }
2334 }
2335
2336 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2337     MpegEncContext * const s = &h->s;
2338     int temp8, i;
2339     uint64_t temp64;
2340     int deblock_left;
2341     int deblock_top;
2342     int mb_xy;
2343     int step    = 1;
2344     int offset  = 1;
2345     int uvoffset= 1;
2346     int top_idx = 1;
2347
2348     if(!simple && FRAME_MBAFF){
2349         if(s->mb_y&1){
2350             offset  = MB_MBAFF ? 1 : 17;
2351             uvoffset= MB_MBAFF ? 1 : 9;
2352         }else{
2353             offset  =
2354             uvoffset=
2355             top_idx = MB_MBAFF ? 0 : 1;
2356         }
2357         step= MB_MBAFF ? 2 : 1;
2358     }
2359
2360     if(h->deblocking_filter == 2) {
2361         mb_xy = h->mb_xy;
2362         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2363         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2364     } else {
2365         deblock_left = (s->mb_x > 0);
2366         deblock_top =  (s->mb_y > !!MB_FIELD);
2367     }
2368
2369     src_y  -=   linesize + 1;
2370     src_cb -= uvlinesize + 1;
2371     src_cr -= uvlinesize + 1;
2372
2373 #define XCHG(a,b,t,xchg)\
2374 t= a;\
2375 if(xchg)\
2376     a= b;\
2377 b= t;
2378
2379     if(deblock_left){
2380         for(i = !deblock_top; i<16; i++){
2381             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2382         }
2383         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2384     }
2385
2386     if(deblock_top){
2387         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2388         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2389         if(s->mb_x+1 < s->mb_width){
2390             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2391         }
2392     }
2393
2394     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2395         if(deblock_left){
2396             for(i = !deblock_top; i<8; i++){
2397                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2398                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2399             }
2400             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2401             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2402         }
2403         if(deblock_top){
2404             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2405             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2406         }
2407     }
2408 }
2409
/**
 * Reconstructs the current macroblock into s->current_picture: intra
 * prediction or motion compensation, addition of the residual (idct or
 * transform bypass) and, if h->deblocking_filter is set, border backup
 * and loop filtering.
 * The macroblock position is taken from s->mb_x, s->mb_y and h->mb_xy.
 * @param h      decoder context
 * @param simple nonzero disables every path guarded by !simple below
 *               (field/MBAFF macroblocks, intra PCM, transform bypass,
 *               the CODEC_FLAG_GRAY check and the SVQ3 variants)
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    /* is_h264 should always be true if SVQ3 is disabled. */
    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    /* top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma) */
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: lines are interleaved, so use a doubled stride and
         * the second half of block_offset (computed for doubled linesize) */
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            /* odd row: step back so that dest points one line below the start
             * of the macroblock in the row above */
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* rewrite the cached reference indices of the used lists as
             * (16+ref)^(mb_y&1)
             * NOTE(review): presumably selects the field entries of the
             * reference list -- confirm against the ref_list layout */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        /* PCM: the samples are stored in h->mb and are copied out verbatim */
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* swap the border samples saved by backup_mb_border() into the
             * picture for the duration of intra prediction; they are swapped
             * back by the xchg_mb_border(..., 0, ...) call further down */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        /* four 8x8 blocks: predict, then add the residual per block */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    /* single nonzero coefficient that is the DC one: use the DC-only add */
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        /* sixteen 4x4 blocks: predict, then add the residual per block */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                /* only these two modes get a top-right pointer */
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        /* unavailable: replicate the last sample of the row above */
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                /* intra 16x16: predict the whole luma block; the residual is
                 * added in the !IS_INTRA4x4 section below */
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            /* put the picture's own border samples back (see the xchg=1 call above) */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            /* inter macroblock: motion compensated prediction */
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        /* luma residual for everything except intra 4x4 (handled per block above) */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    /* the low 4 bits of cbp gate the luma residual */
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                /* SVQ3 luma residual */
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* chroma residual: gated by cbp bits 0x30; blocks 16..19 go to cb and
         * 20..23 to cr (selected through (i&4)>>2) */
        if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                /* dequant table index: 1/2 for intra, 4/5 for inter (cb/cr) */
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    /* clear the coefficient buffer for the next macroblock */
    if(h->cbp || IS_INTRA(mb_type))
        s->dsp.clear_blocks(h->mb);

    if(h->deblocking_filter) {
        /* save the borders before filtering, then filter this macroblock */
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2664
/**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * Calls hl_decode_mb_internal() with simple=1, so every path guarded by
 * !simple in that function is skipped.
 */
static void hl_decode_mb_simple(H264Context *h){
    hl_decode_mb_internal(h, 1);
}
2671
/**
 * Process a macroblock; this handles edge cases, such as interlacing.
 * Calls hl_decode_mb_internal() with simple=0, which keeps every path of
 * that function enabled.
 */
static void av_noinline hl_decode_mb_complex(H264Context *h){
    hl_decode_mb_internal(h, 0);
}
2678
2679 static void hl_decode_mb(H264Context *h){
2680     MpegEncContext * const s = &h->s;
2681     const int mb_xy= h->mb_xy;
2682     const int mb_type= s->current_picture.mb_type[mb_xy];
2683     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2684
2685     if (is_complex)
2686         hl_decode_mb_complex(h);
2687     else hl_decode_mb_simple(h);
2688 }
2689
2690 static void pic_as_field(Picture *pic, const int parity){
2691     int i;
2692     for (i = 0; i < 4; ++i) {
2693         if (parity == PICT_BOTTOM_FIELD)
2694             pic->data[i] += pic->linesize[i];
2695         pic->reference = parity;
2696         pic->linesize[i] *= 2;
2697     }
2698     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2699 }
2700
2701 static int split_field_copy(Picture *dest, Picture *src,
2702                             int parity, int id_add){
2703     int match = !!(src->reference & parity);
2704
2705     if (match) {
2706         *dest = *src;
2707         if(parity != PICT_FRAME){
2708             pic_as_field(dest, parity);
2709             dest->pic_id *= 2;
2710             dest->pic_id += id_add;
2711         }
2712     }
2713
2714     return match;
2715 }
2716
2717 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2718     int i[2]={0};
2719     int index=0;
2720
2721     while(i[0]<len || i[1]<len){
2722         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2723             i[0]++;
2724         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2725             i[1]++;
2726         if(i[0] < len){
2727             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2728             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2729         }
2730         if(i[1] < len){
2731             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2732             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2733         }
2734     }
2735
2736     return index;
2737 }
2738
2739 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2740     int i, best_poc;
2741     int out_i= 0;
2742
2743     for(;;){
2744         best_poc= dir ? INT_MIN : INT_MAX;
2745
2746         for(i=0; i<len; i++){
2747             const int poc= src[i]->poc;
2748             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2749                 best_poc= poc;
2750                 sorted[out_i]= src[i];
2751             }
2752         }
2753         if(best_poc == (dir ? INT_MIN : INT_MAX))
2754             break;
2755         limit= sorted[out_i++]->poc - dir;
2756     }
2757     return out_i;
2758 }
2759
2760 /**
2761  * fills the default_ref_list.
2762  */
2763 static int fill_default_ref_list(H264Context *h){
2764     MpegEncContext * const s = &h->s;
2765     int i, len;
2766
2767     if(h->slice_type_nos==FF_B_TYPE){
2768         Picture *sorted[32];
2769         int cur_poc, list;
2770         int lens[2];
2771
2772         if(FIELD_PICTURE)
2773             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2774         else
2775             cur_poc= s->current_picture_ptr->poc;
2776
2777         for(list= 0; list<2; list++){
2778             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2779             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2780             assert(len<=32);
2781             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2782             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2783             assert(len<=32);
2784
2785             if(len < h->ref_count[list])
2786                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2787             lens[list]= len;
2788         }
2789
2790         if(lens[0] == lens[1] && lens[1] > 1){
2791             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2792             if(i == lens[0])
2793                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2794         }
2795     }else{
2796         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2797         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2798         assert(len <= 32);
2799         if(len < h->ref_count[0])
2800             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2801     }
2802 #ifdef TRACE
2803     for (i=0; i<h->ref_count[0]; i++) {
2804         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2805     }
2806     if(h->slice_type_nos==FF_B_TYPE){
2807         for (i=0; i<h->ref_count[1]; i++) {
2808             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2809         }
2810     }
2811 #endif
2812     return 0;
2813 }
2814
2815 static void print_short_term(H264Context *h);
2816 static void print_long_term(H264Context *h);
2817
2818 /**
2819  * Extract structure information about the picture described by pic_num in
2820  * the current decoding context (frame or field). Note that pic_num is
2821  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2822  * @param pic_num picture number for which to extract structure information
2823  * @param structure one of PICT_XXX describing structure of picture
2824  *                      with pic_num
2825  * @return frame number (short term) or long term index of picture
2826  *         described by pic_num
2827  */
2828 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2829     MpegEncContext * const s = &h->s;
2830
2831     *structure = s->picture_structure;
2832     if(FIELD_PICTURE){
2833         if (!(pic_num & 1))
2834             /* opposite field */
2835             *structure ^= PICT_FRAME;
2836         pic_num >>= 1;
2837     }
2838
2839     return pic_num;
2840 }
2841
2842 static int decode_ref_pic_list_reordering(H264Context *h){
2843     MpegEncContext * const s = &h->s;
2844     int list, index, pic_structure;
2845
2846     print_short_term(h);
2847     print_long_term(h);
2848
2849     for(list=0; list<h->list_count; list++){
2850         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2851
2852         if(get_bits1(&s->gb)){
2853             int pred= h->curr_pic_num;
2854
2855             for(index=0; ; index++){
2856                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2857                 unsigned int pic_id;
2858                 int i;
2859                 Picture *ref = NULL;
2860
2861                 if(reordering_of_pic_nums_idc==3)
2862                     break;
2863
2864                 if(index >= h->ref_count[list]){
2865                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2866                     return -1;
2867                 }
2868
2869                 if(reordering_of_pic_nums_idc<3){
2870                     if(reordering_of_pic_nums_idc<2){
2871                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2872                         int frame_num;
2873
2874                         if(abs_diff_pic_num > h->max_pic_num){
2875                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2876                             return -1;
2877                         }
2878
2879                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2880                         else                                pred+= abs_diff_pic_num;
2881                         pred &= h->max_pic_num - 1;
2882
2883                         frame_num = pic_num_extract(h, pred, &pic_structure);
2884
2885                         for(i= h->short_ref_count-1; i>=0; i--){
2886                             ref = h->short_ref[i];
2887                             assert(ref->reference);
2888                             assert(!ref->long_ref);
2889                             if(
2890                                    ref->frame_num == frame_num &&
2891                                    (ref->reference & pic_structure)
2892                               )
2893                                 break;
2894                         }
2895                         if(i>=0)
2896                             ref->pic_id= pred;
2897                     }else{
2898                         int long_idx;
2899                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2900
2901                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2902
2903                         if(long_idx>31){
2904                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2905                             return -1;
2906                         }
2907                         ref = h->long_ref[long_idx];
2908                         assert(!(ref && !ref->reference));
2909                         if(ref && (ref->reference & pic_structure)){
2910                             ref->pic_id= pic_id;
2911                             assert(ref->long_ref);
2912                             i=0;
2913                         }else{
2914                             i=-1;
2915                         }
2916                     }
2917
2918                     if (i < 0) {
2919                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2920                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2921                     } else {
2922                         for(i=index; i+1<h->ref_count[list]; i++){
2923                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2924                                 break;
2925                         }
2926                         for(; i > index; i--){
2927                             h->ref_list[list][i]= h->ref_list[list][i-1];
2928                         }
2929                         h->ref_list[list][index]= *ref;
2930                         if (FIELD_PICTURE){
2931                             pic_as_field(&h->ref_list[list][index], pic_structure);
2932                         }
2933                     }
2934                 }else{
2935                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2936                     return -1;
2937                 }
2938             }
2939         }
2940     }
2941     for(list=0; list<h->list_count; list++){
2942         for(index= 0; index < h->ref_count[list]; index++){
2943             if(!h->ref_list[list][index].data[0]){
2944                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2945                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2946             }
2947         }
2948     }
2949
2950     return 0;
2951 }
2952
2953 static void fill_mbaff_ref_list(H264Context *h){
2954     int list, i, j;
2955     for(list=0; list<2; list++){ //FIXME try list_count
2956         for(i=0; i<h->ref_count[list]; i++){
2957             Picture *frame = &h->ref_list[list][i];
2958             Picture *field = &h->ref_list[list][16+2*i];
2959             field[0] = *frame;
2960             for(j=0; j<3; j++)
2961                 field[0].linesize[j] <<= 1;
2962             field[0].reference = PICT_TOP_FIELD;
2963             field[0].poc= field[0].field_poc[0];
2964             field[1] = field[0];
2965             for(j=0; j<3; j++)
2966                 field[1].data[j] += frame->linesize[j];
2967             field[1].reference = PICT_BOTTOM_FIELD;
2968             field[1].poc= field[1].field_poc[1];
2969
2970             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2971             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2972             for(j=0; j<2; j++){
2973                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2974                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2975             }
2976         }
2977     }
2978     for(j=0; j<h->ref_count[1]; j++){
2979         for(i=0; i<h->ref_count[0]; i++)
2980             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2981         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2982         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2983     }
2984 }
2985
2986 static int pred_weight_table(H264Context *h){
2987     MpegEncContext * const s = &h->s;
2988     int list, i;
2989     int luma_def, chroma_def;
2990
2991     h->use_weight= 0;
2992     h->use_weight_chroma= 0;
2993     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2994     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2995     luma_def = 1<<h->luma_log2_weight_denom;
2996     chroma_def = 1<<h->chroma_log2_weight_denom;
2997
2998     for(list=0; list<2; list++){
2999         h->luma_weight_flag[list]   = 0;
3000         h->chroma_weight_flag[list] = 0;
3001         for(i=0; i<h->ref_count[list]; i++){
3002             int luma_weight_flag, chroma_weight_flag;
3003
3004             luma_weight_flag= get_bits1(&s->gb);
3005             if(luma_weight_flag){
3006                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3007                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3008                 if(   h->luma_weight[list][i] != luma_def
3009                    || h->luma_offset[list][i] != 0) {
3010                     h->use_weight= 1;
3011                     h->luma_weight_flag[list]= 1;
3012                 }
3013             }else{
3014                 h->luma_weight[list][i]= luma_def;
3015                 h->luma_offset[list][i]= 0;
3016             }
3017
3018             if(CHROMA){
3019                 chroma_weight_flag= get_bits1(&s->gb);
3020                 if(chroma_weight_flag){
3021                     int j;
3022                     for(j=0; j<2; j++){
3023                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3024                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3025                         if(   h->chroma_weight[list][i][j] != chroma_def
3026                            || h->chroma_offset[list][i][j] != 0) {
3027                             h->use_weight_chroma= 1;
3028                             h->chroma_weight_flag[list]= 1;
3029                         }
3030                     }
3031                 }else{
3032                     int j;
3033                     for(j=0; j<2; j++){
3034                         h->chroma_weight[list][i][j]= chroma_def;
3035                         h->chroma_offset[list][i][j]= 0;
3036                     }
3037                 }
3038             }
3039         }
3040         if(h->slice_type_nos != FF_B_TYPE) break;
3041     }
3042     h->use_weight= h->use_weight || h->use_weight_chroma;
3043     return 0;
3044 }
3045
3046 static void implicit_weight_table(H264Context *h){
3047     MpegEncContext * const s = &h->s;
3048     int ref0, ref1, i;
3049     int cur_poc = s->current_picture_ptr->poc;
3050
3051     for (i = 0; i < 2; i++) {
3052         h->luma_weight_flag[i]   = 0;
3053         h->chroma_weight_flag[i] = 0;
3054     }
3055
3056     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3057        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3058         h->use_weight= 0;
3059         h->use_weight_chroma= 0;
3060         return;
3061     }
3062
3063     h->use_weight= 2;
3064     h->use_weight_chroma= 2;
3065     h->luma_log2_weight_denom= 5;
3066     h->chroma_log2_weight_denom= 5;
3067
3068     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3069         int poc0 = h->ref_list[0][ref0].poc;
3070         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3071             int poc1 = h->ref_list[1][ref1].poc;
3072             int td = av_clip(poc1 - poc0, -128, 127);
3073             if(td){
3074                 int tb = av_clip(cur_poc - poc0, -128, 127);
3075                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3076                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3077                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3078                     h->implicit_weight[ref0][ref1] = 32;
3079                 else
3080                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3081             }else
3082                 h->implicit_weight[ref0][ref1] = 32;
3083         }
3084     }
3085 }
3086
3087 /**
3088  * Mark a picture as no longer needed for reference. The refmask
3089  * argument allows unreferencing of individual fields or the whole frame.
3090  * If the picture becomes entirely unreferenced, but is being held for
3091  * display purposes, it is marked as such.
3092  * @param refmask mask of fields to unreference; the mask is bitwise
3093  *                anded with the reference marking of pic
3094  * @return non-zero if pic becomes entirely unreferenced (except possibly
3095  *         for display purposes) zero if one of the fields remains in
3096  *         reference
3097  */
3098 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3099     int i;
3100     if (pic->reference &= refmask) {
3101         return 0;
3102     } else {
3103         for(i = 0; h->delayed_pic[i]; i++)
3104             if(pic == h->delayed_pic[i]){
3105                 pic->reference=DELAYED_PIC_REF;
3106                 break;
3107             }
3108         return 1;
3109     }
3110 }
3111
3112 /**
3113  * instantaneous decoder refresh.
3114  */
3115 static void idr(H264Context *h){
3116     int i;
3117
3118     for(i=0; i<16; i++){
3119         remove_long(h, i, 0);
3120     }
3121     assert(h->long_ref_count==0);
3122
3123     for(i=0; i<h->short_ref_count; i++){
3124         unreference_pic(h, h->short_ref[i], 0);
3125         h->short_ref[i]= NULL;
3126     }
3127     h->short_ref_count=0;
3128     h->prev_frame_num= 0;
3129     h->prev_frame_num_offset= 0;
3130     h->prev_poc_msb=
3131     h->prev_poc_lsb= 0;
3132 }
3133
3134 /* forget old pics after a seek */
3135 static void flush_dpb(AVCodecContext *avctx){
3136     H264Context *h= avctx->priv_data;
3137     int i;
3138     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3139         if(h->delayed_pic[i])
3140             h->delayed_pic[i]->reference= 0;
3141         h->delayed_pic[i]= NULL;
3142     }
3143     h->outputed_poc= INT_MIN;
3144     idr(h);
3145     if(h->s.current_picture_ptr)
3146         h->s.current_picture_ptr->reference= 0;
3147     h->s.first_field= 0;
3148     h->sei_recovery_frame_cnt = -1;
3149     h->sei_dpb_output_delay = 0;
3150     h->sei_cpb_removal_delay = -1;
3151     ff_mpeg_flush(avctx);
3152 }
3153
3154 /**
3155  * Find a Picture in the short term reference list by frame number.
3156  * @param frame_num frame number to search for
3157  * @param idx the index into h->short_ref where returned picture is found
3158  *            undefined if no picture found.
3159  * @return pointer to the found picture, or NULL if no pic with the provided
3160  *                 frame number is found
3161  */
3162 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3163     MpegEncContext * const s = &h->s;
3164     int i;
3165
3166     for(i=0; i<h->short_ref_count; i++){
3167         Picture *pic= h->short_ref[i];
3168         if(s->avctx->debug&FF_DEBUG_MMCO)
3169             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3170         if(pic->frame_num == frame_num) {
3171             *idx = i;
3172             return pic;
3173         }
3174     }
3175     return NULL;
3176 }
3177
3178 /**
3179  * Remove a picture from the short term reference list by its index in
3180  * that list.  This does no checking on the provided index; it is assumed
3181  * to be valid. Other list entries are shifted down.
3182  * @param i index into h->short_ref of picture to remove.
3183  */
3184 static void remove_short_at_index(H264Context *h, int i){
3185     assert(i >= 0 && i < h->short_ref_count);
3186     h->short_ref[i]= NULL;
3187     if (--h->short_ref_count)
3188         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3189 }
3190
3191 /**
3192  *
3193  * @return the removed picture or NULL if an error occurs
3194  */
3195 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3196     MpegEncContext * const s = &h->s;
3197     Picture *pic;
3198     int i;
3199
3200     if(s->avctx->debug&FF_DEBUG_MMCO)
3201         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3202
3203     pic = find_short(h, frame_num, &i);
3204     if (pic){
3205         if(unreference_pic(h, pic, ref_mask))
3206         remove_short_at_index(h, i);
3207     }
3208
3209     return pic;
3210 }
3211
3212 /**
3213  * Remove a picture from the long term reference list by its index in
3214  * that list.
3215  * @return the removed picture or NULL if an error occurs
3216  */
3217 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3218     Picture *pic;
3219
3220     pic= h->long_ref[i];
3221     if (pic){
3222         if(unreference_pic(h, pic, ref_mask)){
3223             assert(h->long_ref[i]->long_ref == 1);
3224             h->long_ref[i]->long_ref= 0;
3225             h->long_ref[i]= NULL;
3226             h->long_ref_count--;
3227         }
3228     }
3229
3230     return pic;
3231 }
3232
3233 /**
3234  * print short term list
3235  */
3236 static void print_short_term(H264Context *h) {
3237     uint32_t i;
3238     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3239         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3240         for(i=0; i<h->short_ref_count; i++){
3241             Picture *pic= h->short_ref[i];
3242             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3243         }
3244     }
3245 }
3246
3247 /**
3248  * print long term list
3249  */
3250 static void print_long_term(H264Context *h) {
3251     uint32_t i;
3252     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3253         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3254         for(i = 0; i < 16; i++){
3255             Picture *pic= h->long_ref[i];
3256             if (pic) {
3257                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3258             }
3259         }
3260     }
3261 }
3262
3263 /**
3264  * Executes the reference picture marking (memory management control operations).
3265  */
3266 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3267     MpegEncContext * const s = &h->s;
3268     int i, j;
3269     int current_ref_assigned=0;
3270     Picture *pic;
3271
3272     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3273         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3274
3275     for(i=0; i<mmco_count; i++){
3276         int structure, frame_num;
3277         if(s->avctx->debug&FF_DEBUG_MMCO)
3278             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3279
3280         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3281            || mmco[i].opcode == MMCO_SHORT2LONG){
3282             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3283             pic = find_short(h, frame_num, &j);
3284             if(!pic){
3285                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3286                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3287                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3288                 continue;
3289             }
3290         }
3291
3292         switch(mmco[i].opcode){
3293         case MMCO_SHORT2UNUSED:
3294             if(s->avctx->debug&FF_DEBUG_MMCO)
3295                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3296             remove_short(h, frame_num, structure ^ PICT_FRAME);
3297             break;
3298         case MMCO_SHORT2LONG:
3299                 if (h->long_ref[mmco[i].long_arg] != pic)
3300                     remove_long(h, mmco[i].long_arg, 0);
3301
3302                 remove_short_at_index(h, j);
3303                 h->long_ref[ mmco[i].long_arg ]= pic;
3304                 if (h->long_ref[ mmco[i].long_arg ]){
3305                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3306                     h->long_ref_count++;
3307                 }
3308             break;
3309         case MMCO_LONG2UNUSED:
3310             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3311             pic = h->long_ref[j];
3312             if (pic) {
3313                 remove_long(h, j, structure ^ PICT_FRAME);
3314             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3315                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3316             break;
3317         case MMCO_LONG:
3318                     // Comment below left from previous code as it is an interresting note.
3319                     /* First field in pair is in short term list or
3320                      * at a different long term index.
3321                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3322                      * Report the problem and keep the pair where it is,
3323                      * and mark this field valid.
3324                      */
3325
3326             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3327                 remove_long(h, mmco[i].long_arg, 0);
3328
3329                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3330                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3331                 h->long_ref_count++;
3332             }
3333
3334             s->current_picture_ptr->reference |= s->picture_structure;
3335             current_ref_assigned=1;
3336             break;
3337         case MMCO_SET_MAX_LONG:
3338             assert(mmco[i].long_arg <= 16);
3339             // just remove the long term which index is greater than new max
3340             for(j = mmco[i].long_arg; j<16; j++){
3341                 remove_long(h, j, 0);
3342             }
3343             break;
3344         case MMCO_RESET:
3345             while(h->short_ref_count){
3346                 remove_short(h, h->short_ref[0]->frame_num, 0);
3347             }
3348             for(j = 0; j < 16; j++) {
3349                 remove_long(h, j, 0);
3350             }
3351             s->current_picture_ptr->poc=
3352             s->current_picture_ptr->field_poc[0]=
3353             s->current_picture_ptr->field_poc[1]=
3354             h->poc_lsb=
3355             h->poc_msb=
3356             h->frame_num=
3357             s->current_picture_ptr->frame_num= 0;
3358             break;
3359         default: assert(0);
3360         }
3361     }
3362
3363     if (!current_ref_assigned) {
3364         /* Second field of complementary field pair; the first field of
3365          * which is already referenced. If short referenced, it
3366          * should be first entry in short_ref. If not, it must exist
3367          * in long_ref; trying to put it on the short list here is an
3368          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3369          */
3370         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3371             /* Just mark the second field valid */
3372             s->current_picture_ptr->reference = PICT_FRAME;
3373         } else if (s->current_picture_ptr->long_ref) {
3374             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3375                                              "assignment for second field "
3376                                              "in complementary field pair "
3377                                              "(first field is long term)\n");
3378         } else {
3379             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3380             if(pic){
3381                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3382             }
3383
3384             if(h->short_ref_count)
3385                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3386
3387             h->short_ref[0]= s->current_picture_ptr;
3388             h->short_ref_count++;
3389             s->current_picture_ptr->reference |= s->picture_structure;
3390         }
3391     }
3392
3393     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3394
3395         /* We have too many reference frames, probably due to corrupted
3396          * stream. Need to discard one frame. Prevents overrun of the
3397          * short_ref and long_ref buffers.
3398          */
3399         av_log(h->s.avctx, AV_LOG_ERROR,
3400                "number of reference frames exceeds max (probably "
3401                "corrupt input), discarding one\n");
3402
3403         if (h->long_ref_count && !h->short_ref_count) {
3404             for (i = 0; i < 16; ++i)
3405                 if (h->long_ref[i])
3406                     break;
3407
3408             assert(i < 16);
3409             remove_long(h, i, 0);
3410         } else {
3411             pic = h->short_ref[h->short_ref_count - 1];
3412             remove_short(h, pic->frame_num, 0);
3413         }
3414     }
3415
3416     print_short_term(h);
3417     print_long_term(h);
3418     return 0;
3419 }
3420
3421 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3422     MpegEncContext * const s = &h->s;
3423     int i;
3424
3425     h->mmco_index= 0;
3426     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3427         s->broken_link= get_bits1(gb) -1;
3428         if(get_bits1(gb)){
3429             h->mmco[0].opcode= MMCO_LONG;
3430             h->mmco[0].long_arg= 0;
3431             h->mmco_index= 1;
3432         }
3433     }else{
3434         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3435             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3436                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3437
3438                 h->mmco[i].opcode= opcode;
3439                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3440                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3441 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3442                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3443                         return -1;
3444                     }*/
3445                 }
3446                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3447                     unsigned int long_arg= get_ue_golomb_31(gb);
3448                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3449                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3450                         return -1;
3451                     }
3452                     h->mmco[i].long_arg= long_arg;
3453                 }
3454
3455                 if(opcode > (unsigned)MMCO_LONG){
3456                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3457                     return -1;
3458                 }
3459                 if(opcode == MMCO_END)
3460                     break;
3461             }
3462             h->mmco_index= i;
3463         }else{
3464             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3465
3466             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3467                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3468                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3469                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3470                 h->mmco_index= 1;
3471                 if (FIELD_PICTURE) {
3472                     h->mmco[0].short_pic_num *= 2;
3473                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3474                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3475                     h->mmco_index= 2;
3476                 }
3477             }
3478         }
3479     }
3480
3481     return 0;
3482 }
3483
3484 static int init_poc(H264Context *h){
3485     MpegEncContext * const s = &h->s;
3486     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3487     int field_poc[2];
3488     Picture *cur = s->current_picture_ptr;
3489
3490     h->frame_num_offset= h->prev_frame_num_offset;
3491     if(h->frame_num < h->prev_frame_num)
3492         h->frame_num_offset += max_frame_num;
3493
3494     if(h->sps.poc_type==0){
3495         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3496
3497         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3498             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3499         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3500             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3501         else
3502             h->poc_msb = h->prev_poc_msb;
3503 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3504         field_poc[0] =
3505         field_poc[1] = h->poc_msb + h->poc_lsb;
3506         if(s->picture_structure == PICT_FRAME)
3507             field_poc[1] += h->delta_poc_bottom;
3508     }else if(h->sps.poc_type==1){
3509         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3510         int i;
3511
3512         if(h->sps.poc_cycle_length != 0)
3513             abs_frame_num = h->frame_num_offset + h->frame_num;
3514         else
3515             abs_frame_num = 0;
3516
3517         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3518             abs_frame_num--;
3519
3520         expected_delta_per_poc_cycle = 0;
3521         for(i=0; i < h->sps.poc_cycle_length; i++)
3522             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3523
3524         if(abs_frame_num > 0){
3525             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3526             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3527
3528             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3529             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3530                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3531         } else
3532             expectedpoc = 0;
3533
3534         if(h->nal_ref_idc == 0)
3535             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3536
3537         field_poc[0] = expectedpoc + h->delta_poc[0];
3538         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3539
3540         if(s->picture_structure == PICT_FRAME)
3541             field_poc[1] += h->delta_poc[1];
3542     }else{
3543         int poc= 2*(h->frame_num_offset + h->frame_num);
3544
3545         if(!h->nal_ref_idc)
3546             poc--;
3547
3548         field_poc[0]= poc;
3549         field_poc[1]= poc;
3550     }
3551
3552     if(s->picture_structure != PICT_BOTTOM_FIELD)
3553         s->current_picture_ptr->field_poc[0]= field_poc[0];
3554     if(s->picture_structure != PICT_TOP_FIELD)
3555         s->current_picture_ptr->field_poc[1]= field_poc[1];
3556     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3557
3558     return 0;
3559 }
3560
3561
3562 /**
3563  * initialize scan tables
3564  */
3565 static void init_scan_tables(H264Context *h){
3566     MpegEncContext * const s = &h->s;
3567     int i;
3568     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3569         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3570         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3571     }else{
3572         for(i=0; i<16; i++){
3573 #define T(x) (x>>2) | ((x<<2) & 0xF)
3574             h->zigzag_scan[i] = T(zigzag_scan[i]);
3575             h-> field_scan[i] = T( field_scan[i]);
3576 #undef T
3577         }
3578     }
3579     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3580         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3581         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3582         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3583         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3584     }else{
3585         for(i=0; i<64; i++){
3586 #define T(x) (x>>3) | ((x&7)<<3)
3587             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3588             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3589             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3590             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3591 #undef T
3592         }
3593     }
3594     if(h->sps.transform_bypass){ //FIXME same ugly
3595         h->zigzag_scan_q0          = zigzag_scan;
3596         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3597         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3598         h->field_scan_q0           = field_scan;
3599         h->field_scan8x8_q0        = field_scan8x8;
3600         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3601     }else{
3602         h->zigzag_scan_q0          = h->zigzag_scan;
3603         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3604         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3605         h->field_scan_q0           = h->field_scan;
3606         h->field_scan8x8_q0        = h->field_scan8x8;
3607         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3608     }
3609 }
3610
3611 /**
3612  * Replicates H264 "master" context to thread contexts.
3613  */
3614 static void clone_slice(H264Context *dst, H264Context *src)
3615 {
3616     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3617     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3618     dst->s.current_picture      = src->s.current_picture;
3619     dst->s.linesize             = src->s.linesize;
3620     dst->s.uvlinesize           = src->s.uvlinesize;
3621     dst->s.first_field          = src->s.first_field;
3622
3623     dst->prev_poc_msb           = src->prev_poc_msb;
3624     dst->prev_poc_lsb           = src->prev_poc_lsb;
3625     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3626     dst->prev_frame_num         = src->prev_frame_num;
3627     dst->short_ref_count        = src->short_ref_count;
3628
3629     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3630     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3631     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3632     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3633
3634     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3635     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3636 }
3637
3638 /**
3639  * decodes a slice header.
3640  * This will also call MPV_common_init() and frame_start() as needed.
3641  *
3642  * @param h h264context
3643  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3644  *
3645  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3646  */
3647 static int decode_slice_header(H264Context *h, H264Context *h0){
3648     MpegEncContext * const s = &h->s;
3649     MpegEncContext * const s0 = &h0->s;
3650     unsigned int first_mb_in_slice;
3651     unsigned int pps_id;
3652     int num_ref_idx_active_override_flag;
3653     unsigned int slice_type, tmp, i, j;
3654     int default_ref_list_done = 0;
3655     int last_pic_structure;
3656
3657     s->dropable= h->nal_ref_idc == 0;
3658
3659     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3660         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3661         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3662     }else{
3663         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3664         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3665     }
3666
3667     first_mb_in_slice= get_ue_golomb(&s->gb);
3668
3669     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3670         h0->current_slice = 0;
3671         if (!s0->first_field)
3672             s->current_picture_ptr= NULL;
3673     }
3674
3675     slice_type= get_ue_golomb_31(&s->gb);
3676     if(slice_type > 9){
3677         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3678         return -1;
3679     }
3680     if(slice_type > 4){
3681         slice_type -= 5;
3682         h->slice_type_fixed=1;
3683     }else
3684         h->slice_type_fixed=0;
3685
3686     slice_type= golomb_to_pict_type[ slice_type ];
3687     if (slice_type == FF_I_TYPE
3688         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3689         default_ref_list_done = 1;
3690     }
3691     h->slice_type= slice_type;
3692     h->slice_type_nos= slice_type & 3;
3693
3694     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3695     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3696         av_log(h->s.avctx, AV_LOG_ERROR,
3697                "B picture before any references, skipping\n");
3698         return -1;
3699     }
3700
3701     pps_id= get_ue_golomb(&s->gb);
3702     if(pps_id>=MAX_PPS_COUNT){
3703         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3704         return -1;
3705     }
3706     if(!h0->pps_buffers[pps_id]) {
3707         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3708         return -1;
3709     }
3710     h->pps= *h0->pps_buffers[pps_id];
3711
3712     if(!h0->sps_buffers[h->pps.sps_id]) {
3713         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3714         return -1;
3715     }
3716     h->sps = *h0->sps_buffers[h->pps.sps_id];
3717
3718     if(h == h0 && h->dequant_coeff_pps != pps_id){
3719         h->dequant_coeff_pps = pps_id;
3720         init_dequant_tables(h);
3721     }
3722
3723     s->mb_width= h->sps.mb_width;
3724     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3725
3726     h->b_stride=  s->mb_width*4;
3727     h->b8_stride= s->mb_width*2;
3728
3729     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3730     if(h->sps.frame_mbs_only_flag)
3731         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3732     else
3733         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3734
3735     if (s->context_initialized
3736         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3737         if(h != h0)
3738             return -1;   // width / height changed during parallelized decoding
3739         free_tables(h);
3740         flush_dpb(s->avctx);
3741         MPV_common_end(s);
3742     }
3743     if (!s->context_initialized) {
3744         if(h != h0)
3745             return -1;  // we cant (re-)initialize context during parallel decoding
3746         if (MPV_common_init(s) < 0)
3747             return -1;
3748         s->first_field = 0;
3749
3750         init_scan_tables(h);
3751         alloc_tables(h);
3752
3753         for(i = 1; i < s->avctx->thread_count; i++) {
3754             H264Context *c;
3755             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3756             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3757             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3758             c->sps = h->sps;
3759             c->pps = h->pps;
3760             init_scan_tables(c);
3761             clone_tables(c, h);
3762         }
3763
3764         for(i = 0; i < s->avctx->thread_count; i++)
3765             if(context_init(h->thread_context[i]) < 0)
3766                 return -1;
3767
3768         s->avctx->width = s->width;
3769         s->avctx->height = s->height;
3770         s->avctx->sample_aspect_ratio= h->sps.sar;
3771         if(!s->avctx->sample_aspect_ratio.den)
3772             s->avctx->sample_aspect_ratio.den = 1;
3773
3774         if(h->sps.timing_info_present_flag){
3775             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3776             if(h->x264_build > 0 && h->x264_build < 44)
3777                 s->avctx->time_base.den *= 2;
3778             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3779                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3780         }
3781     }
3782
3783     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3784
3785     h->mb_mbaff = 0;
3786     h->mb_aff_frame = 0;
3787     last_pic_structure = s0->picture_structure;
3788     if(h->sps.frame_mbs_only_flag){
3789         s->picture_structure= PICT_FRAME;
3790     }else{
3791         if(get_bits1(&s->gb)) { //field_pic_flag
3792             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3793         } else {
3794             s->picture_structure= PICT_FRAME;
3795             h->mb_aff_frame = h->sps.mb_aff;
3796         }
3797     }
3798     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3799
3800     if(h0->current_slice == 0){
3801         while(h->frame_num !=  h->prev_frame_num &&
3802               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3803             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3804             frame_start(h);
3805             h->prev_frame_num++;
3806             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3807             s->current_picture_ptr->frame_num= h->prev_frame_num;
3808             execute_ref_pic_marking(h, NULL, 0);
3809         }
3810
3811         /* See if we have a decoded first field looking for a pair... */
3812         if (s0->first_field) {
3813             assert(s0->current_picture_ptr);
3814             assert(s0->current_picture_ptr->data[0]);
3815             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3816
3817             /* figure out if we have a complementary field pair */
3818             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3819                 /*
3820                  * Previous field is unmatched. Don't display it, but let it
3821                  * remain for reference if marked as such.
3822                  */
3823                 s0->current_picture_ptr = NULL;
3824                 s0->first_field = FIELD_PICTURE;
3825
3826             } else {
3827                 if (h->nal_ref_idc &&
3828                         s0->current_picture_ptr->reference &&
3829                         s0->current_picture_ptr->frame_num != h->frame_num) {
3830                     /*
3831                      * This and previous field were reference, but had
3832                      * different frame_nums. Consider this field first in
3833                      * pair. Throw away previous field except for reference
3834                      * purposes.
3835                      */
3836                     s0->first_field = 1;
3837                     s0->current_picture_ptr = NULL;
3838
3839                 } else {
3840                     /* Second field in complementary pair */
3841                     s0->first_field = 0;
3842                 }
3843             }
3844
3845         } else {
3846             /* Frame or first field in a potentially complementary pair */
3847             assert(!s0->current_picture_ptr);
3848             s0->first_field = FIELD_PICTURE;
3849         }
3850
3851         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3852             s0->first_field = 0;
3853             return -1;
3854         }
3855     }
3856     if(h != h0)
3857         clone_slice(h, h0);
3858
3859     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3860
3861     assert(s->mb_num == s->mb_width * s->mb_height);
3862     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3863        first_mb_in_slice                    >= s->mb_num){
3864         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3865         return -1;
3866     }
3867     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3868     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3869     if (s->picture_structure == PICT_BOTTOM_FIELD)
3870         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3871     assert(s->mb_y < s->mb_height);
3872
3873     if(s->picture_structure==PICT_FRAME){
3874         h->curr_pic_num=   h->frame_num;
3875         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3876     }else{
3877         h->curr_pic_num= 2*h->frame_num + 1;
3878         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3879     }
3880
3881     if(h->nal_unit_type == NAL_IDR_SLICE){
3882         get_ue_golomb(&s->gb); /* idr_pic_id */
3883     }
3884
3885     if(h->sps.poc_type==0){
3886         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3887
3888         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3889             h->delta_poc_bottom= get_se_golomb(&s->gb);
3890         }
3891     }
3892
3893     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3894         h->delta_poc[0]= get_se_golomb(&s->gb);
3895
3896         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3897             h->delta_poc[1]= get_se_golomb(&s->gb);
3898     }
3899
3900     init_poc(h);
3901
3902     if(h->pps.redundant_pic_cnt_present){
3903         h->redundant_pic_count= get_ue_golomb(&s->gb);
3904     }
3905
3906     //set defaults, might be overridden a few lines later
3907     h->ref_count[0]= h->pps.ref_count[0];
3908     h->ref_count[1]= h->pps.ref_count[1];
3909
3910     if(h->slice_type_nos != FF_I_TYPE){
3911         if(h->slice_type_nos == FF_B_TYPE){
3912             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3913         }
3914         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3915
3916         if(num_ref_idx_active_override_flag){
3917             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3918             if(h->slice_type_nos==FF_B_TYPE)
3919                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3920
3921             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3922                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3923                 h->ref_count[0]= h->ref_count[1]= 1;
3924                 return -1;
3925             }
3926         }
3927         if(h->slice_type_nos == FF_B_TYPE)
3928             h->list_count= 2;
3929         else
3930             h->list_count= 1;
3931     }else
3932         h->list_count= 0;
3933
3934     if(!default_ref_list_done){
3935         fill_default_ref_list(h);
3936     }
3937
3938     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3939         return -1;
3940
3941     if(h->slice_type_nos!=FF_I_TYPE){
3942         s->last_picture_ptr= &h->ref_list[0][0];
3943         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3944     }
3945     if(h->slice_type_nos==FF_B_TYPE){
3946         s->next_picture_ptr= &h->ref_list[1][0];
3947         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3948     }
3949
3950     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3951        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3952         pred_weight_table(h);
3953     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3954         implicit_weight_table(h);
3955     else {
3956         h->use_weight = 0;
3957         for (i = 0; i < 2; i++) {
3958             h->luma_weight_flag[i]   = 0;
3959             h->chroma_weight_flag[i] = 0;
3960         }
3961     }
3962
3963     if(h->nal_ref_idc)
3964         decode_ref_pic_marking(h0, &s->gb);
3965
3966     if(FRAME_MBAFF)
3967         fill_mbaff_ref_list(h);
3968
3969     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3970         direct_dist_scale_factor(h);
3971     direct_ref_list_init(h);
3972
3973     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3974         tmp = get_ue_golomb_31(&s->gb);
3975         if(tmp > 2){
3976             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3977             return -1;
3978         }
3979         h->cabac_init_idc= tmp;
3980     }
3981
3982     h->last_qscale_diff = 0;
3983     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3984     if(tmp>51){
3985         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3986         return -1;
3987     }
3988     s->qscale= tmp;
3989     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3990     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3991     //FIXME qscale / qp ... stuff
3992     if(h->slice_type == FF_SP_TYPE){
3993         get_bits1(&s->gb); /* sp_for_switch_flag */
3994     }
3995     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3996         get_se_golomb(&s->gb); /* slice_qs_delta */
3997     }
3998
3999     h->deblocking_filter = 1;
4000     h->slice_alpha_c0_offset = 0;
4001     h->slice_beta_offset = 0;
4002     if( h->pps.deblocking_filter_parameters_present ) {
4003         tmp= get_ue_golomb_31(&s->gb);
4004         if(tmp > 2){
4005             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4006             return -1;
4007         }
4008         h->deblocking_filter= tmp;
4009         if(h->deblocking_filter < 2)
4010             h->deblocking_filter^= 1; // 1<->0
4011
4012         if( h->deblocking_filter ) {
4013             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4014             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4015         }
4016     }
4017
4018     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4019        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4020        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4021        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4022         h->deblocking_filter= 0;
4023
4024     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4025         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4026             /* Cheat slightly for speed:
4027                Do not bother to deblock across slices. */
4028             h->deblocking_filter = 2;
4029         } else {
4030             h0->max_contexts = 1;
4031             if(!h0->single_decode_warning) {
4032                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4033                 h0->single_decode_warning = 1;
4034             }
4035             if(h != h0)
4036                 return 1; // deblocking switched inside frame
4037         }
4038     }
4039
4040 #if 0 //FMO
4041     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4042         slice_group_change_cycle= get_bits(&s->gb, ?);
4043 #endif
4044
4045     h0->last_slice_type = slice_type;
4046     h->slice_num = ++h0->current_slice;
4047     if(h->slice_num >= MAX_SLICES){
4048         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4049     }
4050
4051     for(j=0; j<2; j++){
4052         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4053         ref2frm[0]=
4054         ref2frm[1]= -1;
4055         for(i=0; i<16; i++)
4056             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4057                           +(h->ref_list[j][i].reference&3);
4058         ref2frm[18+0]=
4059         ref2frm[18+1]= -1;
4060         for(i=16; i<48; i++)
4061             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4062                           +(h->ref_list[j][i].reference&3);
4063     }
4064
4065     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4066     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4067
4068     s->avctx->refs= h->sps.ref_frame_count;
4069
4070     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4071         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4072                h->slice_num,
4073                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4074                first_mb_in_slice,
4075                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4076                pps_id, h->frame_num,
4077                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4078                h->ref_count[0], h->ref_count[1],
4079                s->qscale,
4080                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4081                h->use_weight,
4082                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4083                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4084                );
4085     }
4086
4087     return 0;
4088 }
4089
4090 /**
4091  *
4092  */
4093 static inline int get_level_prefix(GetBitContext *gb){
4094     unsigned int buf;
4095     int log;
4096
4097     OPEN_READER(re, gb);
4098     UPDATE_CACHE(re, gb);
4099     buf=GET_CACHE(re, gb);
4100
4101     log= 32 - av_log2(buf);
4102 #ifdef TRACE
4103     print_bin(buf>>(32-log), log);
4104     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4105 #endif
4106
4107     LAST_SKIP_BITS(re, gb, log);
4108     CLOSE_READER(re, gb);
4109
4110     return log-1;
4111 }
4112
4113 static inline int get_dct8x8_allowed(H264Context *h){
4114     if(h->sps.direct_8x8_inference_flag)
4115         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4116     else
4117         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4118 }
4119
4120 /**
4121  * decodes a residual block.
4122  * @param n block index
4123  * @param scantable scantable
4124  * @param max_coeff number of coefficients in the block
4125  * @return <0 if an error occurred
4126  */
4127 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4128     MpegEncContext * const s = &h->s;
4129     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4130     int level[16];
4131     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4132
4133     //FIXME put trailing_onex into the context
4134
4135     if(n == CHROMA_DC_BLOCK_INDEX){
4136         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4137         total_coeff= coeff_token>>2;
4138     }else{
4139         if(n == LUMA_DC_BLOCK_INDEX){
4140             total_coeff= pred_non_zero_count(h, 0);
4141             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4142             total_coeff= coeff_token>>2;
4143         }else{
4144             total_coeff= pred_non_zero_count(h, n);
4145             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4146             total_coeff= coeff_token>>2;
4147             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4148         }
4149     }
4150
4151     //FIXME set last_non_zero?
4152
4153     if(total_coeff==0)
4154         return 0;
4155     if(total_coeff > (unsigned)max_coeff) {
4156         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4157         return -1;
4158     }
4159
4160     trailing_ones= coeff_token&3;
4161     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4162     assert(total_coeff<=16);
4163
4164     i = show_bits(gb, 3);
4165     skip_bits(gb, trailing_ones);
4166     level[0] = 1-((i&4)>>1);
4167     level[1] = 1-((i&2)   );
4168     level[2] = 1-((i&1)<<1);
4169
4170     if(trailing_ones<total_coeff) {
4171         int mask, prefix;
4172         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4173         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4174         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4175
4176         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4177         if(level_code >= 100){
4178             prefix= level_code - 100;
4179             if(prefix == LEVEL_TAB_BITS)
4180                 prefix += get_level_prefix(gb);
4181
4182             //first coefficient has suffix_length equal to 0 or 1
4183             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4184                 if(suffix_length)
4185                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4186                 else
4187                     level_code= (prefix<<suffix_length); //part
4188             }else if(prefix==14){
4189                 if(suffix_length)
4190                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4191                 else
4192                     level_code= prefix + get_bits(gb, 4); //part
4193             }else{
4194                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4195                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4196                 if(prefix>=16)
4197                     level_code += (1<<(prefix-3))-4096;
4198             }
4199
4200             if(trailing_ones < 3) level_code += 2;
4201
4202             suffix_length = 2;
4203             mask= -(level_code&1);
4204             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4205         }else{
4206             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4207
4208             suffix_length = 1;
4209             if(level_code + 3U > 6U)
4210                 suffix_length++;
4211             level[trailing_ones]= level_code;
4212         }
4213
4214         //remaining coefficients have suffix_length > 0
4215         for(i=trailing_ones+1;i<total_coeff;i++) {
4216             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4217             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4218             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4219
4220             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4221             if(level_code >= 100){
4222                 prefix= level_code - 100;
4223                 if(prefix == LEVEL_TAB_BITS){
4224                     prefix += get_level_prefix(gb);
4225                 }
4226                 if(prefix<15){
4227                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4228                 }else{
4229                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4230                     if(prefix>=16)
4231                         level_code += (1<<(prefix-3))-4096;
4232                 }
4233                 mask= -(level_code&1);
4234                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4235             }
4236             level[i]= level_code;
4237
4238             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4239                 suffix_length++;
4240         }
4241     }
4242
4243     if(total_coeff == max_coeff)
4244         zeros_left=0;
4245     else{
4246         if(n == CHROMA_DC_BLOCK_INDEX)
4247             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4248         else
4249             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4250     }
4251
4252     coeff_num = zeros_left + total_coeff - 1;
4253     j = scantable[coeff_num];
4254     if(n > 24){
4255         block[j] = level[0];
4256         for(i=1;i<total_coeff;i++) {
4257             if(zeros_left <= 0)
4258                 run_before = 0;
4259             else if(zeros_left < 7){
4260                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4261             }else{
4262                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4263             }
4264             zeros_left -= run_before;
4265             coeff_num -= 1 + run_before;
4266             j= scantable[ coeff_num ];
4267
4268             block[j]= level[i];
4269         }
4270     }else{
4271         block[j] = (level[0] * qmul[j] + 32)>>6;
4272         for(i=1;i<total_coeff;i++) {
4273             if(zeros_left <= 0)
4274                 run_before = 0;
4275             else if(zeros_left < 7){
4276                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4277             }else{
4278                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4279             }
4280             zeros_left -= run_before;
4281             coeff_num -= 1 + run_before;
4282             j= scantable[ coeff_num ];
4283
4284             block[j]= (level[i] * qmul[j] + 32)>>6;
4285         }
4286     }
4287
4288     if(zeros_left<0){
4289         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4290         return -1;
4291     }
4292
4293     return 0;
4294 }
4295
4296 static void predict_field_decoding_flag(H264Context *h){
4297     MpegEncContext * const s = &h->s;
4298     const int mb_xy= h->mb_xy;
4299     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4300                 ? s->current_picture.mb_type[mb_xy-1]
4301                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4302                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4303                 : 0;
4304     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4305 }
4306
4307 /**
4308  * decodes a P_SKIP or B_SKIP macroblock
4309  */
4310 static void decode_mb_skip(H264Context *h){
4311     MpegEncContext * const s = &h->s;
4312     const int mb_xy= h->mb_xy;
4313     int mb_type=0;
4314
4315     memset(h->non_zero_count[mb_xy], 0, 16);
4316     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4317
4318     if(MB_FIELD)
4319         mb_type|= MB_TYPE_INTERLACED;
4320
4321     if( h->slice_type_nos == FF_B_TYPE )
4322     {
4323         // just for fill_caches. pred_direct_motion will set the real mb_type
4324         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4325
4326         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4327         pred_direct_motion(h, &mb_type);
4328         mb_type|= MB_TYPE_SKIP;
4329     }
4330     else
4331     {
4332         int mx, my;
4333         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4334
4335         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4336         pred_pskip_motion(h, &mx, &my);
4337         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4338         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4339     }
4340
4341     write_back_motion(h, mb_type);
4342     s->current_picture.mb_type[mb_xy]= mb_type;
4343     s->current_picture.qscale_table[mb_xy]= s->qscale;
4344     h->slice_table[ mb_xy ]= h->slice_num;
4345     h->prev_mb_skipped= 1;
4346 }
4347
4348 /**
4349  * decodes a macroblock
4350  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4351  */
4352 static int decode_mb_cavlc(H264Context *h){
4353     MpegEncContext * const s = &h->s;
4354     int mb_xy;
4355     int partition_count;
4356     unsigned int mb_type, cbp;
4357     int dct8x8_allowed= h->pps.transform_8x8_mode;
4358
4359     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4360
4361     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4362     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4363                 down the code */
4364     if(h->slice_type_nos != FF_I_TYPE){
4365         if(s->mb_skip_run==-1)
4366             s->mb_skip_run= get_ue_golomb(&s->gb);
4367
4368         if (s->mb_skip_run--) {
4369             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4370                 if(s->mb_skip_run==0)
4371                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4372                 else
4373                     predict_field_decoding_flag(h);
4374             }
4375             decode_mb_skip(h);
4376             return 0;
4377         }
4378     }
4379     if(FRAME_MBAFF){
4380         if( (s->mb_y&1) == 0 )
4381             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4382     }
4383
4384     h->prev_mb_skipped= 0;
4385
4386     mb_type= get_ue_golomb(&s->gb);
4387     if(h->slice_type_nos == FF_B_TYPE){
4388         if(mb_type < 23){
4389             partition_count= b_mb_type_info[mb_type].partition_count;
4390             mb_type=         b_mb_type_info[mb_type].type;
4391         }else{
4392             mb_type -= 23;
4393             goto decode_intra_mb;
4394         }
4395     }else if(h->slice_type_nos == FF_P_TYPE){
4396         if(mb_type < 5){
4397             partition_count= p_mb_type_info[mb_type].partition_count;
4398             mb_type=         p_mb_type_info[mb_type].type;
4399         }else{
4400             mb_type -= 5;
4401             goto decode_intra_mb;
4402         }
4403     }else{
4404        assert(h->slice_type_nos == FF_I_TYPE);
4405         if(h->slice_type == FF_SI_TYPE && mb_type)
4406             mb_type--;
4407 decode_intra_mb:
4408         if(mb_type > 25){
4409             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4410             return -1;
4411         }
4412         partition_count=0;
4413         cbp= i_mb_type_info[mb_type].cbp;
4414         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4415         mb_type= i_mb_type_info[mb_type].type;
4416     }
4417
4418     if(MB_FIELD)
4419         mb_type |= MB_TYPE_INTERLACED;
4420
4421     h->slice_table[ mb_xy ]= h->slice_num;
4422
4423     if(IS_INTRA_PCM(mb_type)){
4424         unsigned int x;
4425
4426         // We assume these blocks are very rare so we do not optimize it.
4427         align_get_bits(&s->gb);
4428
4429         // The pixels are stored in the same order as levels in h->mb array.
4430         for(x=0; x < (CHROMA ? 384 : 256); x++){
4431             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4432         }
4433
4434         // In deblocking, the quantizer is 0
4435         s->current_picture.qscale_table[mb_xy]= 0;
4436         // All coeffs are present
4437         memset(h->non_zero_count[mb_xy], 16, 16);
4438
4439         s->current_picture.mb_type[mb_xy]= mb_type;
4440         return 0;
4441     }
4442
4443     if(MB_MBAFF){
4444         h->ref_count[0] <<= 1;
4445         h->ref_count[1] <<= 1;
4446     }
4447
4448     fill_caches(h, mb_type, 0);
4449
4450     //mb_pred
4451     if(IS_INTRA(mb_type)){
4452         int pred_mode;
4453 //            init_top_left_availability(h);
4454         if(IS_INTRA4x4(mb_type)){
4455             int i;
4456             int di = 1;
4457             if(dct8x8_allowed && get_bits1(&s->gb)){
4458                 mb_type |= MB_TYPE_8x8DCT;
4459                 di = 4;
4460             }
4461
4462 //                fill_intra4x4_pred_table(h);
4463             for(i=0; i<16; i+=di){
4464                 int mode= pred_intra_mode(h, i);
4465
4466                 if(!get_bits1(&s->gb)){
4467                     const int rem_mode= get_bits(&s->gb, 3);
4468                     mode = rem_mode + (rem_mode >= mode);
4469                 }
4470
4471                 if(di==4)
4472                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4473                 else
4474                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4475             }
4476             write_back_intra_pred_mode(h);
4477             if( check_intra4x4_pred_mode(h) < 0)
4478                 return -1;
4479         }else{
4480             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4481             if(h->intra16x16_pred_mode < 0)
4482                 return -1;
4483         }
4484         if(CHROMA){
4485             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4486             if(pred_mode < 0)
4487                 return -1;
4488             h->chroma_pred_mode= pred_mode;
4489         }
4490     }else if(partition_count==4){
4491         int i, j, sub_partition_count[4], list, ref[2][4];
4492
4493         if(h->slice_type_nos == FF_B_TYPE){
4494             for(i=0; i<4; i++){
4495                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4496                 if(h->sub_mb_type[i] >=13){
4497                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4498                     return -1;
4499                 }
4500                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4501                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4502             }
4503             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4504                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4505                 pred_direct_motion(h, &mb_type);
4506                 h->ref_cache[0][scan8[4]] =
4507                 h->ref_cache[1][scan8[4]] =
4508                 h->ref_cache[0][scan8[12]] =
4509                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4510             }
4511         }else{
4512             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4513             for(i=0; i<4; i++){
4514                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4515                 if(h->sub_mb_type[i] >=4){
4516                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4517                     return -1;
4518                 }
4519                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4520                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4521             }
4522         }
4523
4524         for(list=0; list<h->list_count; list++){
4525             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4526             for(i=0; i<4; i++){
4527                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4528                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4529                     unsigned int tmp;
4530                     if(ref_count == 1){
4531                         tmp= 0;
4532                     }else if(ref_count == 2){
4533                         tmp= get_bits1(&s->gb)^1;
4534                     }else{
4535                         tmp= get_ue_golomb_31(&s->gb);
4536                         if(tmp>=ref_count){
4537                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4538                             return -1;
4539                         }
4540                     }
4541                     ref[list][i]= tmp;
4542                 }else{
4543                  //FIXME
4544                     ref[list][i] = -1;
4545                 }
4546             }
4547         }
4548
4549         if(dct8x8_allowed)
4550             dct8x8_allowed = get_dct8x8_allowed(h);
4551
4552         for(list=0; list<h->list_count; list++){
4553             for(i=0; i<4; i++){
4554                 if(IS_DIRECT(h->sub_mb_type[i])) {
4555                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4556                     continue;
4557                 }
4558                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4559                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4560
4561                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4562                     const int sub_mb_type= h->sub_mb_type[i];
4563                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4564                     for(j=0; j<sub_partition_count[i]; j++){
4565                         int mx, my;
4566                         const int index= 4*i + block_width*j;
4567                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4568                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4569                         mx += get_se_golomb(&s->gb);
4570                         my += get_se_golomb(&s->gb);
4571                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4572
4573                         if(IS_SUB_8X8(sub_mb_type)){
4574                             mv_cache[ 1 ][0]=
4575                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4576                             mv_cache[ 1 ][1]=
4577                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4578                         }else if(IS_SUB_8X4(sub_mb_type)){
4579                             mv_cache[ 1 ][0]= mx;
4580                             mv_cache[ 1 ][1]= my;
4581                         }else if(IS_SUB_4X8(sub_mb_type)){
4582                             mv_cache[ 8 ][0]= mx;
4583                             mv_cache[ 8 ][1]= my;
4584                         }
4585                         mv_cache[ 0 ][0]= mx;
4586                         mv_cache[ 0 ][1]= my;
4587                     }
4588                 }else{
4589                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4590                     p[0] = p[1]=
4591                     p[8] = p[9]= 0;
4592                 }
4593             }
4594         }
4595     }else if(IS_DIRECT(mb_type)){
4596         pred_direct_motion(h, &mb_type);
4597         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4598     }else{
4599         int list, mx, my, i;
4600          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4601         if(IS_16X16(mb_type)){
4602             for(list=0; list<h->list_count; list++){
4603                     unsigned int val;
4604                     if(IS_DIR(mb_type, 0, list)){
4605                         if(h->ref_count[list]==1){
4606                             val= 0;
4607                         }else if(h->ref_count[list]==2){
4608                             val= get_bits1(&s->gb)^1;
4609                         }else{
4610                             val= get_ue_golomb_31(&s->gb);
4611                             if(val >= h->ref_count[list]){
4612                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4613                                 return -1;
4614                             }
4615                         }
4616                     }else
4617                         val= LIST_NOT_USED&0xFF;
4618                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4619             }
4620             for(list=0; list<h->list_count; list++){
4621                 unsigned int val;
4622                 if(IS_DIR(mb_type, 0, list)){
4623                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4624                     mx += get_se_golomb(&s->gb);
4625                     my += get_se_golomb(&s->gb);
4626                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4627
4628                     val= pack16to32(mx,my);
4629                 }else
4630                     val=0;
4631                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4632             }
4633         }
4634         else if(IS_16X8(mb_type)){
4635             for(list=0; list<h->list_count; list++){
4636                     for(i=0; i<2; i++){
4637                         unsigned int val;
4638                         if(IS_DIR(mb_type, i, list)){
4639                             if(h->ref_count[list] == 1){
4640                                 val= 0;
4641                             }else if(h->ref_count[list] == 2){
4642                                 val= get_bits1(&s->gb)^1;
4643                             }else{
4644                                 val= get_ue_golomb_31(&s->gb);
4645                                 if(val >= h->ref_count[list]){
4646                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4647                                     return -1;
4648                                 }
4649                             }
4650                         }else
4651                             val= LIST_NOT_USED&0xFF;
4652                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4653                     }
4654             }
4655             for(list=0; list<h->list_count; list++){
4656                 for(i=0; i<2; i++){
4657                     unsigned int val;
4658                     if(IS_DIR(mb_type, i, list)){
4659                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4660                         mx += get_se_golomb(&s->gb);
4661                         my += get_se_golomb(&s->gb);
4662                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4663
4664                         val= pack16to32(mx,my);
4665                     }else
4666                         val=0;
4667                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4668                 }
4669             }
4670         }else{
4671             assert(IS_8X16(mb_type));
4672             for(list=0; list<h->list_count; list++){
4673                     for(i=0; i<2; i++){
4674                         unsigned int val;
4675                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4676                             if(h->ref_count[list]==1){
4677                                 val= 0;
4678                             }else if(h->ref_count[list]==2){
4679                                 val= get_bits1(&s->gb)^1;
4680                             }else{
4681                                 val= get_ue_golomb_31(&s->gb);
4682                                 if(val >= h->ref_count[list]){
4683                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4684                                     return -1;
4685                                 }
4686                             }
4687                         }else
4688                             val= LIST_NOT_USED&0xFF;
4689                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4690                     }
4691             }
4692             for(list=0; list<h->list_count; list++){
4693                 for(i=0; i<2; i++){
4694                     unsigned int val;
4695                     if(IS_DIR(mb_type, i, list)){
4696                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4697                         mx += get_se_golomb(&s->gb);
4698                         my += get_se_golomb(&s->gb);
4699                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4700
4701                         val= pack16to32(mx,my);
4702                     }else
4703                         val=0;
4704                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4705                 }
4706             }
4707         }
4708     }
4709
4710     if(IS_INTER(mb_type))
4711         write_back_motion(h, mb_type);
4712
4713     if(!IS_INTRA16x16(mb_type)){
4714         cbp= get_ue_golomb(&s->gb);
4715         if(cbp > 47){
4716             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4717             return -1;
4718         }
4719
4720         if(CHROMA){
4721             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4722             else                     cbp= golomb_to_inter_cbp   [cbp];
4723         }else{
4724             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4725             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4726         }
4727     }
4728     h->cbp = cbp;
4729
4730     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4731         if(get_bits1(&s->gb)){
4732             mb_type |= MB_TYPE_8x8DCT;
4733             h->cbp_table[mb_xy]= cbp;
4734         }
4735     }
4736     s->current_picture.mb_type[mb_xy]= mb_type;
4737
4738     if(cbp || IS_INTRA16x16(mb_type)){
4739         int i8x8, i4x4, chroma_idx;
4740         int dquant;
4741         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4742         const uint8_t *scan, *scan8x8, *dc_scan;
4743
4744 //        fill_non_zero_count_cache(h);
4745
4746         if(IS_INTERLACED(mb_type)){
4747             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4748             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4749             dc_scan= luma_dc_field_scan;
4750         }else{
4751             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4752             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4753             dc_scan= luma_dc_zigzag_scan;
4754         }
4755
4756         dquant= get_se_golomb(&s->gb);
4757
4758         if( dquant > 25 || dquant < -26 ){
4759             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4760             return -1;
4761         }
4762
4763         s->qscale += dquant;
4764         if(((unsigned)s->qscale) > 51){
4765             if(s->qscale<0) s->qscale+= 52;
4766             else            s->qscale-= 52;
4767         }
4768
4769         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4770         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4771         if(IS_INTRA16x16(mb_type)){
4772             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4773                 return -1; //FIXME continue if partitioned and other return -1 too
4774             }
4775
4776             assert((cbp&15) == 0 || (cbp&15) == 15);
4777
4778             if(cbp&15){
4779                 for(i8x8=0; i8x8<4; i8x8++){
4780                     for(i4x4=0; i4x4<4; i4x4++){
4781                         const int index= i4x4 + 4*i8x8;
4782                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4783                             return -1;
4784                         }
4785                     }
4786                 }
4787             }else{
4788                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4789             }
4790         }else{
4791             for(i8x8=0; i8x8<4; i8x8++){
4792                 if(cbp & (1<<i8x8)){
4793                     if(IS_8x8DCT(mb_type)){
4794                         DCTELEM *buf = &h->mb[64*i8x8];
4795                         uint8_t *nnz;
4796                         for(i4x4=0; i4x4<4; i4x4++){
4797                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4798                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4799                                 return -1;
4800                         }
4801                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4802                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4803                     }else{
4804                         for(i4x4=0; i4x4<4; i4x4++){
4805                             const int index= i4x4 + 4*i8x8;
4806
4807                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4808                                 return -1;
4809                             }
4810                         }
4811                     }
4812                 }else{
4813                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4814                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4815                 }
4816             }
4817         }
4818
4819         if(cbp&0x30){
4820             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4821                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4822                     return -1;
4823                 }
4824         }
4825
4826         if(cbp&0x20){
4827             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4828                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4829                 for(i4x4=0; i4x4<4; i4x4++){
4830                     const int index= 16 + 4*chroma_idx + i4x4;
4831                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4832                         return -1;
4833                     }
4834                 }
4835             }
4836         }else{
4837             uint8_t * const nnz= &h->non_zero_count_cache[0];
4838             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4839             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4840         }
4841     }else{
4842         uint8_t * const nnz= &h->non_zero_count_cache[0];
4843         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4844         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4845         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4846     }
4847     s->current_picture.qscale_table[mb_xy]= s->qscale;
4848     write_back_non_zero_count(h);
4849
4850     if(MB_MBAFF){
4851         h->ref_count[0] >>= 1;
4852         h->ref_count[1] >>= 1;
4853     }
4854
4855     return 0;
4856 }
4857
4858 static int decode_cabac_field_decoding_flag(H264Context *h) {
4859     MpegEncContext * const s = &h->s;
4860     const int mb_x = s->mb_x;
4861     const int mb_y = s->mb_y & ~1;
4862     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4863     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4864
4865     unsigned int ctx = 0;
4866
4867     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4868         ctx += 1;
4869     }
4870     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4871         ctx += 1;
4872     }
4873
4874     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4875 }
4876
4877 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4878     uint8_t *state= &h->cabac_state[ctx_base];
4879     int mb_type;
4880
4881     if(intra_slice){
4882         MpegEncContext * const s = &h->s;
4883         const int mba_xy = h->left_mb_xy[0];
4884         const int mbb_xy = h->top_mb_xy;
4885         int ctx=0;
4886         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4887             ctx++;
4888         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4889             ctx++;
4890         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4891             return 0;   /* I4x4 */
4892         state += 2;
4893     }else{
4894         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4895             return 0;   /* I4x4 */
4896     }
4897
4898     if( get_cabac_terminate( &h->cabac ) )
4899         return 25;  /* PCM */
4900
4901     mb_type = 1; /* I16x16 */
4902     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4903     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4904         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4905     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4906     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4907     return mb_type;
4908 }
4909
4910 static int decode_cabac_mb_type_b( H264Context *h ) {
4911     MpegEncContext * const s = &h->s;
4912
4913         const int mba_xy = h->left_mb_xy[0];
4914         const int mbb_xy = h->top_mb_xy;
4915         int ctx = 0;
4916         int bits;
4917         assert(h->slice_type_nos == FF_B_TYPE);
4918
4919         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4920             ctx++;
4921         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4922             ctx++;
4923
4924         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4925             return 0; /* B_Direct_16x16 */
4926
4927         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4928             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4929         }
4930
4931         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4932         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4933         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4934         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4935         if( bits < 8 )
4936             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4937         else if( bits == 13 ) {
4938             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4939         } else if( bits == 14 )
4940             return 11; /* B_L1_L0_8x16 */
4941         else if( bits == 15 )
4942             return 22; /* B_8x8 */
4943
4944         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4945         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4946 }
4947
4948 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4949     MpegEncContext * const s = &h->s;
4950     int mba_xy, mbb_xy;
4951     int ctx = 0;
4952
4953     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4954         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4955         mba_xy = mb_xy - 1;
4956         if( (mb_y&1)
4957             && h->slice_table[mba_xy] == h->slice_num
4958             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4959             mba_xy += s->mb_stride;
4960         if( MB_FIELD ){
4961             mbb_xy = mb_xy - s->mb_stride;
4962             if( !(mb_y&1)
4963                 && h->slice_table[mbb_xy] == h->slice_num
4964                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4965                 mbb_xy -= s->mb_stride;
4966         }else
4967             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4968     }else{
4969         int mb_xy = h->mb_xy;
4970         mba_xy = mb_xy - 1;
4971         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4972     }
4973
4974     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4975         ctx++;
4976     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4977         ctx++;
4978
4979     if( h->slice_type_nos == FF_B_TYPE )
4980         ctx += 13;
4981     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4982 }
4983
4984 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4985     int mode = 0;
4986
4987     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4988         return pred_mode;
4989
4990     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4991     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4992     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4993
4994     if( mode >= pred_mode )
4995         return mode + 1;
4996     else
4997         return mode;
4998 }
4999
5000 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5001     const int mba_xy = h->left_mb_xy[0];
5002     const int mbb_xy = h->top_mb_xy;
5003
5004     int ctx = 0;
5005
5006     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5007     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5008         ctx++;
5009
5010     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5011         ctx++;
5012
5013     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5014         return 0;
5015
5016     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5017         return 1;
5018     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5019         return 2;
5020     else
5021         return 3;
5022 }
5023
5024 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5025     int cbp_b, cbp_a, ctx, cbp = 0;
5026
5027     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5028     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5029
5030     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5031     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5032     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5033     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5034     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5035     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5036     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5037     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5038     return cbp;
5039 }
5040 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5041     int ctx;
5042     int cbp_a, cbp_b;
5043
5044     cbp_a = (h->left_cbp>>4)&0x03;
5045     cbp_b = (h-> top_cbp>>4)&0x03;
5046
5047     ctx = 0;
5048     if( cbp_a > 0 ) ctx++;
5049     if( cbp_b > 0 ) ctx += 2;
5050     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5051         return 0;
5052
5053     ctx = 4;
5054     if( cbp_a == 2 ) ctx++;
5055     if( cbp_b == 2 ) ctx += 2;
5056     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5057 }
5058 static int decode_cabac_mb_dqp( H264Context *h) {
5059     int   ctx= h->last_qscale_diff != 0;
5060     int   val = 0;
5061
5062     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5063         ctx= 2+(ctx>>1);
5064         val++;
5065         if(val > 102) //prevent infinite loop
5066             return INT_MIN;
5067     }
5068
5069     if( val&0x01 )
5070         return   (val + 1)>>1 ;
5071     else
5072         return -((val + 1)>>1);
5073 }
5074 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5075     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5076         return 0;   /* 8x8 */
5077     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5078         return 1;   /* 8x4 */
5079     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5080         return 2;   /* 4x8 */
5081     return 3;       /* 4x4 */
5082 }
5083 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5084     int type;
5085     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5086         return 0;   /* B_Direct_8x8 */
5087     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5088         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5089     type = 3;
5090     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5091         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5092             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5093         type += 4;
5094     }
5095     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5096     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5097     return type;
5098 }
5099
5100 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5101     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5102 }
5103
5104 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5105     int refa = h->ref_cache[list][scan8[n] - 1];
5106     int refb = h->ref_cache[list][scan8[n] - 8];
5107     int ref  = 0;
5108     int ctx  = 0;
5109
5110     if( h->slice_type_nos == FF_B_TYPE) {
5111         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5112             ctx++;
5113         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5114             ctx += 2;
5115     } else {
5116         if( refa > 0 )
5117             ctx++;
5118         if( refb > 0 )
5119             ctx += 2;
5120     }
5121
5122     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5123         ref++;
5124         ctx = (ctx>>2)+4;
5125         if(ref >= 32 /*h->ref_list[list]*/){
5126             return -1;
5127         }
5128     }
5129     return ref;
5130 }
5131
5132 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5133     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5134                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5135     int ctxbase = (l == 0) ? 40 : 47;
5136     int mvd;
5137     int ctx = (amvd>2) + (amvd>32);
5138
5139     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5140         return 0;
5141
5142     mvd= 1;
5143     ctx= 3;
5144     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5145         mvd++;
5146         if( ctx < 6 )
5147             ctx++;
5148     }
5149
5150     if( mvd >= 9 ) {
5151         int k = 3;
5152         while( get_cabac_bypass( &h->cabac ) ) {
5153             mvd += 1 << k;
5154             k++;
5155             if(k>24){
5156                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5157                 return INT_MIN;
5158             }
5159         }
5160         while( k-- ) {
5161             if( get_cabac_bypass( &h->cabac ) )
5162                 mvd += 1 << k;
5163         }
5164     }
5165     return get_cabac_bypass_sign( &h->cabac, -mvd );
5166 }
5167
5168 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5169     int nza, nzb;
5170     int ctx = 0;
5171
5172     if( is_dc ) {
5173         if( cat == 0 ) {
5174             nza = h->left_cbp&0x100;
5175             nzb = h-> top_cbp&0x100;
5176         } else {
5177             nza = (h->left_cbp>>(6+idx))&0x01;
5178             nzb = (h-> top_cbp>>(6+idx))&0x01;
5179         }
5180     } else {
5181         assert(cat == 1 || cat == 2 || cat == 4);
5182         nza = h->non_zero_count_cache[scan8[idx] - 1];
5183         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5184     }
5185
5186     if( nza > 0 )
5187         ctx++;
5188
5189     if( nzb > 0 )
5190         ctx += 2;
5191
5192     return ctx + 4 * cat;
5193 }
5194
5195 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5196     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5197     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5198     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5199     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5200 };
5201
5202 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5203     static const int significant_coeff_flag_offset[2][6] = {
5204       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5205       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5206     };
5207     static const int last_coeff_flag_offset[2][6] = {
5208       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5209       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5210     };
5211     static const int coeff_abs_level_m1_offset[6] = {
5212         227+0, 227+10, 227+20, 227+30, 227+39, 426
5213     };
5214     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5215       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5216         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5217         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5218        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5219       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5220         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5221         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5222         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5223     };
5224     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5225      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5226      * map node ctx => cabac ctx for level=1 */
5227     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5228     /* map node ctx => cabac ctx for level>1 */
5229     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5230     static const uint8_t coeff_abs_level_transition[2][8] = {
5231     /* update node ctx after decoding a level=1 */
5232         { 1, 2, 3, 3, 4, 5, 6, 7 },
5233     /* update node ctx after decoding a level>1 */
5234         { 4, 4, 4, 4, 5, 6, 7, 7 }
5235     };
5236
5237     int index[64];
5238
5239     int av_unused last;
5240     int coeff_count = 0;
5241     int node_ctx = 0;
5242
5243     uint8_t *significant_coeff_ctx_base;
5244     uint8_t *last_coeff_ctx_base;
5245     uint8_t *abs_level_m1_ctx_base;
5246
5247 #if !ARCH_X86
5248 #define CABAC_ON_STACK
5249 #endif
5250 #ifdef CABAC_ON_STACK
5251 #define CC &cc
5252     CABACContext cc;
5253     cc.range     = h->cabac.range;
5254     cc.low       = h->cabac.low;
5255     cc.bytestream= h->cabac.bytestream;
5256 #else
5257 #define CC &h->cabac
5258 #endif
5259
5260
5261     /* cat: 0-> DC 16x16  n = 0
5262      *      1-> AC 16x16  n = luma4x4idx
5263      *      2-> Luma4x4   n = luma4x4idx
5264      *      3-> DC Chroma n = iCbCr
5265      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5266      *      5-> Luma8x8   n = 4 * luma8x8idx
5267      */
5268
5269     /* read coded block flag */
5270     if( is_dc || cat != 5 ) {
5271         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5272             if( !is_dc )
5273                 h->non_zero_count_cache[scan8[n]] = 0;
5274
5275 #ifdef CABAC_ON_STACK
5276             h->cabac.range     = cc.range     ;
5277             h->cabac.low       = cc.low       ;
5278             h->cabac.bytestream= cc.bytestream;
5279 #endif
5280             return;
5281         }
5282     }
5283
5284     significant_coeff_ctx_base = h->cabac_state
5285         + significant_coeff_flag_offset[MB_FIELD][cat];
5286     last_coeff_ctx_base = h->cabac_state
5287         + last_coeff_flag_offset[MB_FIELD][cat];
5288     abs_level_m1_ctx_base = h->cabac_state
5289         + coeff_abs_level_m1_offset[cat];
5290
5291     if( !is_dc && cat == 5 ) {
5292 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5293         for(last= 0; last < coefs; last++) { \
5294             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5295             if( get_cabac( CC, sig_ctx )) { \
5296                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5297                 index[coeff_count++] = last; \
5298                 if( get_cabac( CC, last_ctx ) ) { \
5299                     last= max_coeff; \
5300                     break; \
5301                 } \
5302             } \
5303         }\
5304         if( last == max_coeff -1 ) {\
5305             index[coeff_count++] = last;\
5306         }
5307         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5308 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5309         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5310     } else {
5311         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5312 #else
5313         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5314     } else {
5315         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5316 #endif
5317     }
5318     assert(coeff_count > 0);
5319
5320     if( is_dc ) {
5321         if( cat == 0 )
5322             h->cbp_table[h->mb_xy] |= 0x100;
5323         else
5324             h->cbp_table[h->mb_xy] |= 0x40 << n;
5325     } else {
5326         if( cat == 5 )
5327             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5328         else {
5329             assert( cat == 1 || cat == 2 || cat == 4 );
5330             h->non_zero_count_cache[scan8[n]] = coeff_count;
5331         }
5332     }
5333
5334     do {
5335         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5336
5337         int j= scantable[index[--coeff_count]];
5338
5339         if( get_cabac( CC, ctx ) == 0 ) {
5340             node_ctx = coeff_abs_level_transition[0][node_ctx];
5341             if( is_dc ) {
5342                 block[j] = get_cabac_bypass_sign( CC, -1);
5343             }else{
5344                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5345             }
5346         } else {
5347             int coeff_abs = 2;
5348             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5349             node_ctx = coeff_abs_level_transition[1][node_ctx];
5350
5351             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5352                 coeff_abs++;
5353             }
5354
5355             if( coeff_abs >= 15 ) {
5356                 int j = 0;
5357                 while( get_cabac_bypass( CC ) ) {
5358                     j++;
5359                 }
5360
5361                 coeff_abs=1;
5362                 while( j-- ) {
5363                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5364                 }
5365                 coeff_abs+= 14;
5366             }
5367
5368             if( is_dc ) {
5369                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5370             }else{
5371                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5372             }
5373         }
5374     } while( coeff_count );
5375 #ifdef CABAC_ON_STACK
5376             h->cabac.range     = cc.range     ;
5377             h->cabac.low       = cc.low       ;
5378             h->cabac.bytestream= cc.bytestream;
5379 #endif
5380
5381 }
5382
5383 #if !CONFIG_SMALL
5384 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5385     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5386 }
5387
5388 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5389     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5390 }
5391 #endif
5392
5393 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5394 #if CONFIG_SMALL
5395     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5396 #else
5397     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5398     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5399 #endif
5400 }
5401
5402 static inline void compute_mb_neighbors(H264Context *h)
5403 {
5404     MpegEncContext * const s = &h->s;
5405     const int mb_xy  = h->mb_xy;
5406     h->top_mb_xy     = mb_xy - s->mb_stride;
5407     h->left_mb_xy[0] = mb_xy - 1;
5408     if(FRAME_MBAFF){
5409         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5410         const int top_pair_xy      = pair_xy     - s->mb_stride;
5411         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5412         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5413         const int curr_mb_field_flag = MB_FIELD;
5414         const int bottom = (s->mb_y & 1);
5415
5416         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5417             h->top_mb_xy -= s->mb_stride;
5418         }
5419         if (!left_mb_field_flag == curr_mb_field_flag) {
5420             h->left_mb_xy[0] = pair_xy - 1;
5421         }
5422     } else if (FIELD_PICTURE) {
5423         h->top_mb_xy -= s->mb_stride;
5424     }
5425     return;
5426 }
5427
5428 /**
5429  * decodes a macroblock
5430  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5431  */
5432 static int decode_mb_cabac(H264Context *h) {
5433     MpegEncContext * const s = &h->s;
5434     int mb_xy;
5435     int mb_type, partition_count, cbp = 0;
5436     int dct8x8_allowed= h->pps.transform_8x8_mode;
5437
5438     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5439
5440     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5441     if( h->slice_type_nos != FF_I_TYPE ) {
5442         int skip;
5443         /* a skipped mb needs the aff flag from the following mb */
5444         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5445             predict_field_decoding_flag(h);
5446         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5447             skip = h->next_mb_skipped;
5448         else
5449             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5450         /* read skip flags */
5451         if( skip ) {
5452             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5453                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5454                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5455                 if(!h->next_mb_skipped)
5456                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5457             }
5458
5459             decode_mb_skip(h);
5460
5461             h->cbp_table[mb_xy] = 0;
5462             h->chroma_pred_mode_table[mb_xy] = 0;
5463             h->last_qscale_diff = 0;
5464
5465             return 0;
5466
5467         }
5468     }
5469     if(FRAME_MBAFF){
5470         if( (s->mb_y&1) == 0 )
5471             h->mb_mbaff =
5472             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5473     }
5474
5475     h->prev_mb_skipped = 0;
5476
5477     compute_mb_neighbors(h);
5478
5479     if( h->slice_type_nos == FF_B_TYPE ) {
5480         mb_type = decode_cabac_mb_type_b( h );
5481         if( mb_type < 23 ){
5482             partition_count= b_mb_type_info[mb_type].partition_count;
5483             mb_type=         b_mb_type_info[mb_type].type;
5484         }else{
5485             mb_type -= 23;
5486             goto decode_intra_mb;
5487         }
5488     } else if( h->slice_type_nos == FF_P_TYPE ) {
5489         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5490             /* P-type */
5491             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5492                 /* P_L0_D16x16, P_8x8 */
5493                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5494             } else {
5495                 /* P_L0_D8x16, P_L0_D16x8 */
5496                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5497             }
5498             partition_count= p_mb_type_info[mb_type].partition_count;
5499             mb_type=         p_mb_type_info[mb_type].type;
5500         } else {
5501             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5502             goto decode_intra_mb;
5503         }
5504     } else {
5505         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5506         if(h->slice_type == FF_SI_TYPE && mb_type)
5507             mb_type--;
5508         assert(h->slice_type_nos == FF_I_TYPE);
5509 decode_intra_mb:
5510         partition_count = 0;
5511         cbp= i_mb_type_info[mb_type].cbp;
5512         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5513         mb_type= i_mb_type_info[mb_type].type;
5514     }
5515     if(MB_FIELD)
5516         mb_type |= MB_TYPE_INTERLACED;
5517
5518     h->slice_table[ mb_xy ]= h->slice_num;
5519
5520     if(IS_INTRA_PCM(mb_type)) {
5521         const uint8_t *ptr;
5522
5523         // We assume these blocks are very rare so we do not optimize it.
5524         // FIXME The two following lines get the bitstream position in the cabac
5525         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5526         ptr= h->cabac.bytestream;
5527         if(h->cabac.low&0x1) ptr--;
5528         if(CABAC_BITS==16){
5529             if(h->cabac.low&0x1FF) ptr--;
5530         }
5531
5532         // The pixels are stored in the same order as levels in h->mb array.
5533         memcpy(h->mb, ptr, 256); ptr+=256;
5534         if(CHROMA){
5535             memcpy(h->mb+128, ptr, 128); ptr+=128;
5536         }
5537
5538         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5539
5540         // All blocks are present
5541         h->cbp_table[mb_xy] = 0x1ef;
5542         h->chroma_pred_mode_table[mb_xy] = 0;
5543         // In deblocking, the quantizer is 0
5544         s->current_picture.qscale_table[mb_xy]= 0;
5545         // All coeffs are present
5546         memset(h->non_zero_count[mb_xy], 16, 16);
5547         s->current_picture.mb_type[mb_xy]= mb_type;
5548         h->last_qscale_diff = 0;
5549         return 0;
5550     }
5551
5552     if(MB_MBAFF){
5553         h->ref_count[0] <<= 1;
5554         h->ref_count[1] <<= 1;
5555     }
5556
5557     fill_caches(h, mb_type, 0);
5558
5559     if( IS_INTRA( mb_type ) ) {
5560         int i, pred_mode;
5561         if( IS_INTRA4x4( mb_type ) ) {
5562             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5563                 mb_type |= MB_TYPE_8x8DCT;
5564                 for( i = 0; i < 16; i+=4 ) {
5565                     int pred = pred_intra_mode( h, i );
5566                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5567                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5568                 }
5569             } else {
5570                 for( i = 0; i < 16; i++ ) {
5571                     int pred = pred_intra_mode( h, i );
5572                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5573
5574                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5575                 }
5576             }
5577             write_back_intra_pred_mode(h);
5578             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5579         } else {
5580             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5581             if( h->intra16x16_pred_mode < 0 ) return -1;
5582         }
5583         if(CHROMA){
5584             h->chroma_pred_mode_table[mb_xy] =
5585             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5586
5587             pred_mode= check_intra_pred_mode( h, pred_mode );
5588             if( pred_mode < 0 ) return -1;
5589             h->chroma_pred_mode= pred_mode;
5590         }
5591     } else if( partition_count == 4 ) {
5592         int i, j, sub_partition_count[4], list, ref[2][4];
5593
5594         if( h->slice_type_nos == FF_B_TYPE ) {
5595             for( i = 0; i < 4; i++ ) {
5596                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5597                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5598                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5599             }
5600             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5601                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5602                 pred_direct_motion(h, &mb_type);
5603                 h->ref_cache[0][scan8[4]] =
5604                 h->ref_cache[1][scan8[4]] =
5605                 h->ref_cache[0][scan8[12]] =
5606                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5607                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5608                     for( i = 0; i < 4; i++ )
5609                         if( IS_DIRECT(h->sub_mb_type[i]) )
5610                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5611                 }
5612             }
5613         } else {
5614             for( i = 0; i < 4; i++ ) {
5615                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5616                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5617                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5618             }
5619         }
5620
5621         for( list = 0; list < h->list_count; list++ ) {
5622                 for( i = 0; i < 4; i++ ) {
5623                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5624                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5625                         if( h->ref_count[list] > 1 ){
5626                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5627                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5628                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5629                                 return -1;
5630                             }
5631                         }else
5632                             ref[list][i] = 0;
5633                     } else {
5634                         ref[list][i] = -1;
5635                     }
5636                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5637                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5638                 }
5639         }
5640
5641         if(dct8x8_allowed)
5642             dct8x8_allowed = get_dct8x8_allowed(h);
5643
5644         for(list=0; list<h->list_count; list++){
5645             for(i=0; i<4; i++){
5646                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5647                 if(IS_DIRECT(h->sub_mb_type[i])){
5648                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5649                     continue;
5650                 }
5651
5652                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5653                     const int sub_mb_type= h->sub_mb_type[i];
5654                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5655                     for(j=0; j<sub_partition_count[i]; j++){
5656                         int mpx, mpy;
5657                         int mx, my;
5658                         const int index= 4*i + block_width*j;
5659                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5660                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5661                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5662
5663                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5664                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5665                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5666
5667                         if(IS_SUB_8X8(sub_mb_type)){
5668                             mv_cache[ 1 ][0]=
5669                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5670                             mv_cache[ 1 ][1]=
5671                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5672
5673                             mvd_cache[ 1 ][0]=
5674                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5675                             mvd_cache[ 1 ][1]=
5676                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5677                         }else if(IS_SUB_8X4(sub_mb_type)){
5678                             mv_cache[ 1 ][0]= mx;
5679                             mv_cache[ 1 ][1]= my;
5680
5681                             mvd_cache[ 1 ][0]= mx - mpx;
5682                             mvd_cache[ 1 ][1]= my - mpy;
5683                         }else if(IS_SUB_4X8(sub_mb_type)){
5684                             mv_cache[ 8 ][0]= mx;
5685                             mv_cache[ 8 ][1]= my;
5686
5687                             mvd_cache[ 8 ][0]= mx - mpx;
5688                             mvd_cache[ 8 ][1]= my - mpy;
5689                         }
5690                         mv_cache[ 0 ][0]= mx;
5691                         mv_cache[ 0 ][1]= my;
5692
5693                         mvd_cache[ 0 ][0]= mx - mpx;
5694                         mvd_cache[ 0 ][1]= my - mpy;
5695                     }
5696                 }else{
5697                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5698                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5699                     p[0] = p[1] = p[8] = p[9] = 0;
5700                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5701                 }
5702             }
5703         }
5704     } else if( IS_DIRECT(mb_type) ) {
5705         pred_direct_motion(h, &mb_type);
5706         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5707         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5708         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5709     } else {
5710         int list, mx, my, i, mpx, mpy;
5711         if(IS_16X16(mb_type)){
5712             for(list=0; list<h->list_count; list++){
5713                 if(IS_DIR(mb_type, 0, list)){
5714                     int ref;
5715                     if(h->ref_count[list] > 1){
5716                         ref= decode_cabac_mb_ref(h, list, 0);
5717                         if(ref >= (unsigned)h->ref_count[list]){
5718                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5719                             return -1;
5720                         }
5721                     }else
5722                         ref=0;
5723                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5724                 }else
5725                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5726             }
5727             for(list=0; list<h->list_count; list++){
5728                 if(IS_DIR(mb_type, 0, list)){
5729                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5730
5731                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5732                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5733                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5734
5735                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5736                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5737                 }else
5738                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5739             }
5740         }
5741         else if(IS_16X8(mb_type)){
5742             for(list=0; list<h->list_count; list++){
5743                     for(i=0; i<2; i++){
5744                         if(IS_DIR(mb_type, i, list)){
5745                             int ref;
5746                             if(h->ref_count[list] > 1){
5747                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5748                                 if(ref >= (unsigned)h->ref_count[list]){
5749                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5750                                     return -1;
5751                                 }
5752                             }else
5753                                 ref=0;
5754                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5755                         }else
5756                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5757                     }
5758             }
5759             for(list=0; list<h->list_count; list++){
5760                 for(i=0; i<2; i++){
5761                     if(IS_DIR(mb_type, i, list)){
5762                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5763                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5764                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5765                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5766
5767                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5768                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5769                     }else{
5770                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5771                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5772                     }
5773                 }
5774             }
5775         }else{
5776             assert(IS_8X16(mb_type));
5777             for(list=0; list<h->list_count; list++){
5778                     for(i=0; i<2; i++){
5779                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5780                             int ref;
5781                             if(h->ref_count[list] > 1){
5782                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5783                                 if(ref >= (unsigned)h->ref_count[list]){
5784                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5785                                     return -1;
5786                                 }
5787                             }else
5788                                 ref=0;
5789                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5790                         }else
5791                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5792                     }
5793             }
5794             for(list=0; list<h->list_count; list++){
5795                 for(i=0; i<2; i++){
5796                     if(IS_DIR(mb_type, i, list)){
5797                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5798                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5799                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5800
5801                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5802                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5803                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5804                     }else{
5805                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5806                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5807                     }
5808                 }
5809             }
5810         }
5811     }
5812
5813    if( IS_INTER( mb_type ) ) {
5814         h->chroma_pred_mode_table[mb_xy] = 0;
5815         write_back_motion( h, mb_type );
5816    }
5817
5818     if( !IS_INTRA16x16( mb_type ) ) {
5819         cbp  = decode_cabac_mb_cbp_luma( h );
5820         if(CHROMA)
5821             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5822     }
5823
5824     h->cbp_table[mb_xy] = h->cbp = cbp;
5825
5826     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5827         if( decode_cabac_mb_transform_size( h ) )
5828             mb_type |= MB_TYPE_8x8DCT;
5829     }
5830     s->current_picture.mb_type[mb_xy]= mb_type;
5831
5832     if( cbp || IS_INTRA16x16( mb_type ) ) {
5833         const uint8_t *scan, *scan8x8, *dc_scan;
5834         const uint32_t *qmul;
5835         int dqp;
5836
5837         if(IS_INTERLACED(mb_type)){
5838             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5839             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5840             dc_scan= luma_dc_field_scan;
5841         }else{
5842             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5843             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5844             dc_scan= luma_dc_zigzag_scan;
5845         }
5846
5847         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5848         if( dqp == INT_MIN ){
5849             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5850             return -1;
5851         }
5852         s->qscale += dqp;
5853         if(((unsigned)s->qscale) > 51){
5854             if(s->qscale<0) s->qscale+= 52;
5855             else            s->qscale-= 52;
5856         }
5857         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5858         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5859
5860         if( IS_INTRA16x16( mb_type ) ) {
5861             int i;
5862             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5863             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5864
5865             if( cbp&15 ) {
5866                 qmul = h->dequant4_coeff[0][s->qscale];
5867                 for( i = 0; i < 16; i++ ) {
5868                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5869                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5870                 }
5871             } else {
5872                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5873             }
5874         } else {
5875             int i8x8, i4x4;
5876             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5877                 if( cbp & (1<<i8x8) ) {
5878                     if( IS_8x8DCT(mb_type) ) {
5879                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5880                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5881                     } else {
5882                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5883                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5884                             const int index = 4*i8x8 + i4x4;
5885                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5886 //START_TIMER
5887                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5888 //STOP_TIMER("decode_residual")
5889                         }
5890                     }
5891                 } else {
5892                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5893                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5894                 }
5895             }
5896         }
5897
5898         if( cbp&0x30 ){
5899             int c;
5900             for( c = 0; c < 2; c++ ) {
5901                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5902                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5903             }
5904         }
5905
5906         if( cbp&0x20 ) {
5907             int c, i;
5908             for( c = 0; c < 2; c++ ) {
5909                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5910                 for( i = 0; i < 4; i++ ) {
5911                     const int index = 16 + 4 * c + i;
5912                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5913                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5914                 }
5915             }
5916         } else {
5917             uint8_t * const nnz= &h->non_zero_count_cache[0];
5918             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5919             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5920         }
5921     } else {
5922         uint8_t * const nnz= &h->non_zero_count_cache[0];
5923         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5924         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5925         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5926         h->last_qscale_diff = 0;
5927     }
5928
5929     s->current_picture.qscale_table[mb_xy]= s->qscale;
5930     write_back_non_zero_count(h);
5931
5932     if(MB_MBAFF){
5933         h->ref_count[0] >>= 1;
5934         h->ref_count[1] >>= 1;
5935     }
5936
5937     return 0;
5938 }
5939
5940
/**
 * Filter one vertical luma edge through the DSP "h" loop-filter routines.
 *
 * alpha, beta and the tc0 clipping values are looked up in the file-level
 * tables using qp plus the slice-level offsets stored in h.
 *
 * @param h      decoder context (slice alpha/beta offsets, DSP function pointers)
 * @param pix    handed unchanged to the DSP routine as the edge position
 * @param stride handed unchanged to the DSP routine
 * @param bS     four strength values for the edge; bS[0] alone decides whether
 *               the normal (bS[0] < 4) or the intra routine is called
 * @param qp     quantizer used to index the alpha/beta/tc0 tables
 */
5941 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
         /* The tables are addressed relative to entry 52, so the offset-adjusted
          * index may be negative; presumably the tables are padded for that --
          * TODO confirm at the table definitions. */
5942     const int index_a = qp + h->slice_alpha_c0_offset;
5943     const int alpha = (alpha_table+52)[index_a];
5944     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5945
5946     if( bS[0] < 4 ) {
             /* Normal filter: one clipping value per strength entry. */
5947         int8_t tc[4];
5948         tc[0] = (tc0_table+52)[index_a][bS[0]];
5949         tc[1] = (tc0_table+52)[index_a][bS[1]];
5950         tc[2] = (tc0_table+52)[index_a][bS[2]];
5951         tc[3] = (tc0_table+52)[index_a][bS[3]];
5952         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5953     } else {
             /* bS[0] >= 4: intra filter, which takes no clipping table.
              * NOTE(review): bS[1..3] are not inspected here; presumably callers
              * only pass 4 for the whole edge -- confirm. */
5954         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5955     }
5956 }
/**
 * Filter one vertical chroma edge through the DSP "h" chroma loop-filter routines.
 *
 * Same table lookups as the luma variant, except that every clipping value
 * handed to the normal filter is the tc0 table entry plus one.
 *
 * @param h      decoder context (slice alpha/beta offsets, DSP function pointers)
 * @param pix    handed unchanged to the DSP routine as the edge position
 * @param stride handed unchanged to the DSP routine
 * @param bS     four strength values for the edge; bS[0] alone decides whether
 *               the normal (bS[0] < 4) or the intra routine is called
 * @param qp     quantizer used to index the alpha/beta/tc0 tables
 */
5957 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5958     const int index_a = qp + h->slice_alpha_c0_offset;
5959     const int alpha = (alpha_table+52)[index_a];
5960     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5961
5962     if( bS[0] < 4 ) {
             /* Normal filter: chroma clipping value is tc0 + 1. */
5963         int8_t tc[4];
5964         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5965         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5966         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5967         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5968         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5969     } else {
             /* bS[0] >= 4: intra filter, no clipping table needed. */
5970         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5971     }
5972 }
5973
/**
 * Filter a vertical luma edge over 16 rows in plain C, choosing the strength
 * and the quantizer separately for each row.
 *
 * Unlike filter_mb_edgev() this does not call the DSP routines; it works on
 * pix[-4..3] of every row, where pix[-1] is p0 and pix[0] is q0.
 *
 * @param h      decoder context (slice alpha/beta offsets)
 * @param pix    first row, pointing at the q0 sample of the edge
 * @param stride distance between consecutive rows
 * @param bS     eight strength values; the entry used for a row depends on MB_FIELD
 * @param qp     two quantizers; the entry used for a row depends on MB_FIELD
 */
5974 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5975     int i;
5976     for( i = 0; i < 16; i++, pix += stride) {
5977         int index_a;
5978         int alpha;
5979         int beta;
5980
5981         int qp_index;
             /* With MB_FIELD set, two consecutive rows share entry i>>1.
              * Otherwise even rows take the even and odd rows the odd entry
              * of each pair (rows 0,1,2,3 -> 0,1,0,1; rows 4..7 -> 2,3,2,3; ...). */
5982         int bS_index = (i >> 1);
5983         if (!MB_FIELD) {
5984             bS_index &= ~1;
5985             bS_index |= (i & 1);
5986         }
5987
             /* Strength 0 means this row is left untouched. */
5988         if( bS[bS_index] == 0 ) {
5989             continue;
5990         }
5991
             /* With MB_FIELD set, rows 0-7 use qp[0] and rows 8-15 use qp[1];
              * otherwise even rows use qp[0] and odd rows use qp[1]. */
5992         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5993         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5994         alpha = (alpha_table+52)[index_a];
5995         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5996
5997         if( bS[bS_index] < 4 ) {
                 /* Normal filter: corrections are clipped to +-tc. */
5998             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5999             const int p0 = pix[-1];
6000             const int p1 = pix[-2];
6001             const int p2 = pix[-3];
6002             const int q0 = pix[0];
6003             const int q1 = pix[1];
6004             const int q2 = pix[2];
6005
6006             if( FFABS( p0 - q0 ) < alpha &&
6007                 FFABS( p1 - p0 ) < beta &&
6008                 FFABS( q1 - q0 ) < beta ) {
6009                 int tc = tc0;
6010                 int i_delta;
6011
                     /* Each smooth side gets its second sample adjusted
                      * (clipped to +-tc0) and widens the p0/q0 clip range by one. */
6012                 if( FFABS( p2 - p0 ) < beta ) {
6013                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6014                     tc++;
6015                 }
6016                 if( FFABS( q2 - q0 ) < beta ) {
6017                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6018                     tc++;
6019                 }
6020
6021                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6022                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6023                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* NOTE(review): the "=>" part of this trace prints the saved
                      * p1/q1 copies, not pix[-2]/pix[1], which may have been
                      * rewritten above. */
6024                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6025             }
6026         }else{
                 /* Strength >= 4: strong filter, no tc clipping. */
6027             const int p0 = pix[-1];
6028             const int p1 = pix[-2];
6029             const int p2 = pix[-3];
6030
6031             const int q0 = pix[0];
6032             const int q1 = pix[1];
6033             const int q2 = pix[2];
6034
6035             if( FFABS( p0 - q0 ) < alpha &&
6036                 FFABS( p1 - p0 ) < beta &&
6037                 FFABS( q1 - q0 ) < beta ) {
6038
                     /* Small step across the edge: each side may rewrite up to
                      * three samples; otherwise only p0 and q0 are rewritten. */
6039                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6040                     if( FFABS( p2 - p0 ) < beta)
6041                     {
6042                         const int p3 = pix[-4];
6043                         /* p0', p1', p2' */
6044                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6045                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6046                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6047                     } else {
6048                         /* p0' */
6049                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6050                     }
6051                     if( FFABS( q2 - q0 ) < beta)
6052                     {
6053                         const int q3 = pix[3];
6054                         /* q0', q1', q2' */
6055                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6056                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6057                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6058                     } else {
6059                         /* q0' */
6060                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6061                     }
6062                 }else{
6063                     /* p0', q0' */
6064                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6065                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6066                 }
6067                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6068             }
6069         }
6070     }
6071 }
/**
 * Filter a vertical chroma edge over 8 rows in plain C, choosing the strength
 * and the quantizer separately for each row.
 *
 * Only p0 (pix[-1]) and q0 (pix[0]) are ever rewritten.
 *
 * @param h      decoder context (slice alpha/beta offsets)
 * @param pix    first row, pointing at the q0 sample of the edge
 * @param stride distance between consecutive rows
 * @param bS     eight strength values, one per row
 * @param qp     two quantizers; the entry used for a row depends on MB_FIELD
 */
6072 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6073     int i;
6074     for( i = 0; i < 8; i++, pix += stride) {
6075         int index_a;
6076         int alpha;
6077         int beta;
6078
6079         int qp_index;
6080         int bS_index = i;
6081
             /* Strength 0 means this row is left untouched. */
6082         if( bS[bS_index] == 0 ) {
6083             continue;
6084         }
6085
             /* With MB_FIELD set, rows 0-3 use qp[0] and rows 4-7 use qp[1];
              * otherwise even rows use qp[0] and odd rows use qp[1]. */
6086         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6087         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6088         alpha = (alpha_table+52)[index_a];
6089         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6090
6091         if( bS[bS_index] < 4 ) {
                 /* Normal filter: chroma clipping value is tc0 + 1. */
6092             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6093             const int p0 = pix[-1];
6094             const int p1 = pix[-2];
6095             const int q0 = pix[0];
6096             const int q1 = pix[1];
6097
6098             if( FFABS( p0 - q0 ) < alpha &&
6099                 FFABS( p1 - p0 ) < beta &&
6100                 FFABS( q1 - q0 ) < beta ) {
6101                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6102
6103                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6104                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* pix[-3] and pix[2] are read for this trace line only. */
6105                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6106             }
6107         }else{
                 /* Strength >= 4: strong filter, no tc clipping. */
6108             const int p0 = pix[-1];
6109             const int p1 = pix[-2];
6110             const int q0 = pix[0];
6111             const int q1 = pix[1];
6112
6113             if( FFABS( p0 - q0 ) < alpha &&
6114                 FFABS( p1 - p0 ) < beta &&
6115                 FFABS( q1 - q0 ) < beta ) {
6116
6117                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6118                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6119                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6120             }
6121         }
6122     }
6123 }
6124
/**
 * Filter one horizontal luma edge through the DSP "v" loop-filter routines.
 *
 * alpha, beta and the tc0 clipping values are looked up in the file-level
 * tables using qp plus the slice-level offsets stored in h.
 *
 * @param h      decoder context (slice alpha/beta offsets, DSP function pointers)
 * @param pix    handed unchanged to the DSP routine as the edge position
 * @param stride handed unchanged to the DSP routine
 * @param bS     four strength values for the edge; bS[0] alone decides whether
 *               the normal (bS[0] < 4) or the intra routine is called
 * @param qp     quantizer used to index the alpha/beta/tc0 tables
 */
6125 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6126     const int index_a = qp + h->slice_alpha_c0_offset;
6127     const int alpha = (alpha_table+52)[index_a];
6128     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6129
6130     if( bS[0] < 4 ) {
             /* Normal filter: one clipping value per strength entry. */
6131         int8_t tc[4];
6132         tc[0] = (tc0_table+52)[index_a][bS[0]];
6133         tc[1] = (tc0_table+52)[index_a][bS[1]];
6134         tc[2] = (tc0_table+52)[index_a][bS[2]];
6135         tc[3] = (tc0_table+52)[index_a][bS[3]];
6136         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6137     } else {
             /* bS[0] >= 4: intra filter, no clipping table needed. */
6138         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6139     }
6140 }
6141
/**
 * Filter one horizontal chroma edge through the DSP "v" chroma loop-filter routines.
 *
 * Same table lookups as the luma variant, except that every clipping value
 * handed to the normal filter is the tc0 table entry plus one.
 *
 * @param h      decoder context (slice alpha/beta offsets, DSP function pointers)
 * @param pix    handed unchanged to the DSP routine as the edge position
 * @param stride handed unchanged to the DSP routine
 * @param bS     four strength values for the edge; bS[0] alone decides whether
 *               the normal (bS[0] < 4) or the intra routine is called
 * @param qp     quantizer used to index the alpha/beta/tc0 tables
 */
6142 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6143     const int index_a = qp + h->slice_alpha_c0_offset;
6144     const int alpha = (alpha_table+52)[index_a];
6145     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6146
6147     if( bS[0] < 4 ) {
             /* Normal filter: chroma clipping value is tc0 + 1. */
6148         int8_t tc[4];
6149         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6150         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6151         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6152         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6153         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6154     } else {
             /* bS[0] >= 4: intra filter, no clipping table needed. */
6155         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6156     }
6157 }
6158
/**
 * Deblock the current macroblock through a shortcut path, delegating to
 * filter_mb() whenever one of the preconditions of the shortcut is not met.
 *
 * The shortcut uses one luma and one chroma quantizer per edge, fixed
 * strength patterns for intra macroblocks, and the DSP
 * h264_loop_filter_strength routine for inter macroblocks.
 *
 * @param h          decoder context; h->mb_xy identifies the macroblock
 * @param mb_x       macroblock column (0 forces the filter_mb() fallback)
 * @param mb_y       macroblock row (the first row forces the fallback)
 * @param img_y      luma plane position of the macroblock
 * @param img_cb     Cb plane position of the macroblock
 * @param img_cr     Cr plane position of the macroblock
 * @param linesize   luma stride
 * @param uvlinesize chroma stride
 */
6159 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6160     MpegEncContext * const s = &h->s;
         /* 1 when the picture structure is a bottom field, 0 otherwise. */
6161     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6162     int mb_xy, mb_type;
6163     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6164
6165     mb_xy = h->mb_xy;
6166
         /* Fall back to filter_mb() when: there is no left or top neighbour row/column,
          * the DSP strength routine is missing, pps.chroma_qp_diff is set,
          * CODEC_FLAG2_FAST is not set, or deblocking_filter == 2 and the top or
          * left macroblock belongs to a different slice. */
6167     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6168         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus it has to be gated by CODEC_FLAG2_FAST, but should not be
6169        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6170                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6171         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6172         return;
6173     }
6174     assert(!FRAME_MBAFF);
6175
         /* qp/qpc: this macroblock; qp0/qpc0: left neighbour; qp1/qpc1: top
          * neighbour. Chroma always uses table index 0 of get_chroma_qp();
          * the chroma_qp_diff fallback above presumably makes that valid for
          * both chroma planes -- TODO confirm. */
6176     mb_type = s->current_picture.mb_type[mb_xy];
6177     qp = s->current_picture.qscale_table[mb_xy];
6178     qp0 = s->current_picture.qscale_table[mb_xy-1];
6179     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6180     qpc = get_chroma_qp( h, 0, qp );
6181     qpc0 = get_chroma_qp( h, 0, qp0 );
6182     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* The macroblock-boundary edges use the rounded average of both sides. */
6183     qp0 = (qp + qp0 + 1) >> 1;
6184     qp1 = (qp + qp1 + 1) >> 1;
6185     qpc0 = (qpc + qpc0 + 1) >> 1;
6186     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Nothing is filtered when every quantizer is at or below the threshold;
          * presumably alpha is zero for such indices so filtering would be a
          * no-op -- TODO confirm against alpha_table. */
6187     qp_thresh = 15 - h->slice_alpha_c0_offset;
6188     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6189        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6190         return;
6191
6192     if( IS_INTRA(mb_type) ) {
             /* Intra macroblock: fixed strengths. The left edge uses 4, inner
              * edges use 3, and the top edge uses 3 in field pictures, 4 otherwise. */
6193         int16_t bS4[4] = {4,4,4,4};
6194         int16_t bS3[4] = {3,3,3,3};
6195         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6196         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered. */
6197             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6198             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6199             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6200             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6201         } else {
6202             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6203             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6204             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6205             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6206             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6207             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6208             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6209             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6210         }
             /* Chroma: two vertical and two horizontal edges per plane. */
6211         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6212         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6213         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6214         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6215         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6216         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6217         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6218         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6219         return;
6220     } else {
             /* bS[dir][edge][i]; bSv views the four 16-bit strengths of one edge
              * as a single 64-bit word so a whole edge can be set or tested at once.
              * NOTE(review): this accesses int16_t storage through a uint64_t
              * lvalue, which relies on the compiler tolerating the aliasing. */
6221         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6222         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6223         int edges;
6224         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* Edges 0 and 2 get strength 2 in both directions; edges 1 and 3
                  * stay unset, and the 8x8DCT branch below never filters them. */
6225             edges = 4;
6226             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6227         } else {
6228             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6229                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6230             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6231                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6232                              ? 3 : 0;
6233             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* A 16x16 macroblock with no coded luma needs only the outer edges. */
6234             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6235             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6236                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6237         }
             /* An intra neighbour overrides the computed strength of the shared edge. */
6238         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6239             bSv[0][0] = 0x0004000400040004ULL;
6240         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6241             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6242
             /* FILTER(hv,dir,edge): if any strength on the edge is non-zero, filter
              * the luma edge and, for even edges, both chroma planes. Edge 0 uses the
              * neighbour-averaged quantizer (qp0/qpc0 or qp1/qpc1), inner edges use
              * qp/qpc. */
6243 #define FILTER(hv,dir,edge)\
6244         if(bSv[dir][edge]) {\
6245             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6246             if(!(edge&1)) {\
6247                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6248                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6249             }\
6250         }
6251         if( edges == 1 ) {
6252             FILTER(v,0,0);
6253             FILTER(h,1,0);
6254         } else if( IS_8x8DCT(mb_type) ) {
6255             FILTER(v,0,0);
6256             FILTER(v,0,2);
6257             FILTER(h,1,0);
6258             FILTER(h,1,2);
6259         } else {
6260             FILTER(v,0,0);
6261             FILTER(v,0,1);
6262             FILTER(v,0,2);
6263             FILTER(v,0,3);
6264             FILTER(h,1,0);
6265             FILTER(h,1,1);
6266             FILTER(h,1,2);
6267             FILTER(h,1,3);
6268         }
6269 #undef FILTER
6270     }
6271 }
6272
6273
6274 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6275     MpegEncContext * const s = &h->s;
6276     int edge;
6277     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6278     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6279     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6280     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6281     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6282
6283     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6284                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6285     // how often to recheck mv-based bS when iterating between edges
6286     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6287                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6288     // how often to recheck mv-based bS when iterating along each edge
6289     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6290
6291     if (first_vertical_edge_done) {
6292         start = 1;
6293     }
6294
6295     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6296         start = 1;
6297
6298     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6299         && !IS_INTERLACED(mb_type)
6300         && IS_INTERLACED(mbm_type)
6301         ) {
6302         // This is a special case in the norm where the filtering must
6303         // be done twice (one each of the field) even if we are in a
6304         // frame macroblock.
6305         //
6306         static const int nnz_idx[4] = {4,5,6,3};
6307         unsigned int tmp_linesize   = 2 *   linesize;
6308         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6309         int mbn_xy = mb_xy - 2 * s->mb_stride;
6310         int qp;
6311         int i, j;
6312         int16_t bS[4];
6313
6314         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6315             if( IS_INTRA(mb_type) ||
6316                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6317                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6318             } else {
6319                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6320                 for( i = 0; i < 4; i++ ) {
6321                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6322                         mbn_nnz[nnz_idx[i]] != 0 )
6323                         bS[i] = 2;
6324                     else
6325                         bS[i] = 1;
6326                 }
6327             }
6328             // Do not use s->qscale as luma quantizer because it has not the same
6329             // value in IPCM macroblocks.
6330             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6331             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6332             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6333             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6334             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6335                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6336             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6337                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6338         }
6339
6340         start = 1;
6341     }
6342
6343     /* Calculate bS */
6344     for( edge = start; edge < edges; edge++ ) {
6345         /* mbn_xy: neighbor macroblock */
6346         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6347         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6348         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6349         int16_t bS[4];
6350         int qp;
6351
6352         if( (edge&1) && IS_8x8DCT(mb_type) )
6353             continue;
6354
6355         if( IS_INTRA(mb_type) ||
6356             IS_INTRA(mbn_type) ) {
6357             int value;
6358             if (edge == 0) {
6359                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6360                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6361                 ) {
6362                     value = 4;
6363                 } else {
6364                     value = 3;
6365                 }
6366             } else {
6367                 value = 3;
6368             }
6369             bS[0] = bS[1] = bS[2] = bS[3] = value;
6370         } else {
6371             int i, l;
6372             int mv_done;
6373
6374             if( edge & mask_edge ) {
6375                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6376                 mv_done = 1;
6377             }
6378             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6379                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6380                 mv_done = 1;
6381             }
6382             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6383                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6384                 int bn_idx= b_idx - (dir ? 8:1);
6385                 int v = 0;
6386
6387                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6388                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6389                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6390                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6391                 }
6392
6393                 if(h->slice_type_nos == FF_B_TYPE && v){
6394                     v=0;
6395                     for( l = 0; !v && l < 2; l++ ) {
6396                         int ln= 1-l;
6397                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6398                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6399                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6400                     }
6401                 }
6402
6403                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6404                 mv_done = 1;
6405             }
6406             else
6407                 mv_done = 0;
6408
6409             for( i = 0; i < 4; i++ ) {
6410                 int x = dir == 0 ? edge : i;
6411                 int y = dir == 0 ? i    : edge;
6412                 int b_idx= 8 + 4 + x + 8*y;
6413                 int bn_idx= b_idx - (dir ? 8:1);
6414
6415                 if( h->non_zero_count_cache[b_idx] |
6416                     h->non_zero_count_cache[bn_idx] ) {
6417                     bS[i] = 2;
6418                 }
6419                 else if(!mv_done)
6420                 {
6421                     bS[i] = 0;
6422                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6423                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6424                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6425                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6426                             bS[i] = 1;
6427                             break;
6428                         }
6429                     }
6430
6431                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6432                         bS[i] = 0;
6433                         for( l = 0; l < 2; l++ ) {
6434                             int ln= 1-l;
6435                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6436                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6437                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6438                                 bS[i] = 1;
6439                                 break;
6440                             }
6441                         }
6442                     }
6443                 }
6444             }
6445
6446             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6447                 continue;
6448         }
6449
6450         /* Filter edge */
6451         // Do not use s->qscale as luma quantizer because it has not the same
6452         // value in IPCM macroblocks.
6453         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6454         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6455         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6456         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6457         if( dir == 0 ) {
6458             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6459             if( (edge&1) == 0 ) {
6460                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6461                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6462                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6463                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6464             }
6465         } else {
6466             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6467             if( (edge&1) == 0 ) {
6468                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6469                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6470                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6471                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6472             }
6473         }
6474     }
6475 }
6476
6477 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6478     MpegEncContext * const s = &h->s;
6479     const int mb_xy= mb_x + mb_y*s->mb_stride;
6480     const int mb_type = s->current_picture.mb_type[mb_xy];
6481     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6482     int first_vertical_edge_done = 0;
6483     av_unused int dir;
6484
6485     //for sufficiently low qp, filtering wouldn't do anything
6486     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6487     if(!FRAME_MBAFF){
6488         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6489         int qp = s->current_picture.qscale_table[mb_xy];
6490         if(qp <= qp_thresh
6491            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6492            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6493             return;
6494         }
6495     }
6496
6497     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6498     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6499         int top_type, left_type[2];
6500         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6501         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6502         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6503
6504         if(IS_8x8DCT(top_type)){
6505             h->non_zero_count_cache[4+8*0]=
6506             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6507             h->non_zero_count_cache[6+8*0]=
6508             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6509         }
6510         if(IS_8x8DCT(left_type[0])){
6511             h->non_zero_count_cache[3+8*1]=
6512             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6513         }
6514         if(IS_8x8DCT(left_type[1])){
6515             h->non_zero_count_cache[3+8*3]=
6516             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6517         }
6518
6519         if(IS_8x8DCT(mb_type)){
6520             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6521             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6522
6523             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6524             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6525
6526             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6527             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6528
6529             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6530             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6531         }
6532     }
6533
6534     if (FRAME_MBAFF
6535             // left mb is in picture
6536             && h->slice_table[mb_xy-1] != 0xFFFF
6537             // and current and left pair do not have the same interlaced type
6538             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6539             // and left mb is in the same slice if deblocking_filter == 2
6540             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6541         /* First vertical edge is different in MBAFF frames
6542          * There are 8 different bS to compute and 2 different Qp
6543          */
6544         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6545         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6546         int16_t bS[8];
6547         int qp[2];
6548         int bqp[2];
6549         int rqp[2];
6550         int mb_qp, mbn0_qp, mbn1_qp;
6551         int i;
6552         first_vertical_edge_done = 1;
6553
6554         if( IS_INTRA(mb_type) )
6555             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6556         else {
6557             for( i = 0; i < 8; i++ ) {
6558                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6559
6560                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6561                     bS[i] = 4;
6562                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6563                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6564                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6565                                                                        :
6566                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6567                     bS[i] = 2;
6568                 else
6569                     bS[i] = 1;
6570             }
6571         }
6572
6573         mb_qp = s->current_picture.qscale_table[mb_xy];
6574         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6575         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6576         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6577         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6578                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6579         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6580                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6581         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6582         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6583                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6584         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6585                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6586
6587         /* Filter edge */
6588         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6589         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6590         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6591         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6592         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6593     }
6594
6595 #if CONFIG_SMALL
6596     for( dir = 0; dir < 2; dir++ )
6597         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6598 #else
6599     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6600     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6601 #endif
6602 }
6603
6604 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6605     H264Context *h = *(void**)arg;
6606     MpegEncContext * const s = &h->s;
6607     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6608
6609     s->mb_skip_run= -1;
6610
6611     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6612                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6613
6614     if( h->pps.cabac ) {
6615         int i;
6616
6617         /* realign */
6618         align_get_bits( &s->gb );
6619
6620         /* init cabac */
6621         ff_init_cabac_states( &h->cabac);
6622         ff_init_cabac_decoder( &h->cabac,
6623                                s->gb.buffer + get_bits_count(&s->gb)/8,
6624                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6625         /* calculate pre-state */
6626         for( i= 0; i < 460; i++ ) {
6627             int pre;
6628             if( h->slice_type_nos == FF_I_TYPE )
6629                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6630             else
6631                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6632
6633             if( pre <= 63 )
6634                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6635             else
6636                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6637         }
6638
6639         for(;;){
6640 //START_TIMER
6641             int ret = decode_mb_cabac(h);
6642             int eos;
6643 //STOP_TIMER("decode_mb_cabac")
6644
6645             if(ret>=0) hl_decode_mb(h);
6646
6647             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6648                 s->mb_y++;
6649
6650                 ret = decode_mb_cabac(h);
6651
6652                 if(ret>=0) hl_decode_mb(h);
6653                 s->mb_y--;
6654             }
6655             eos = get_cabac_terminate( &h->cabac );
6656
6657             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6658                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6659                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6660                 return -1;
6661             }
6662
6663             if( ++s->mb_x >= s->mb_width ) {
6664                 s->mb_x = 0;
6665                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6666                 ++s->mb_y;
6667                 if(FIELD_OR_MBAFF_PICTURE) {
6668                     ++s->mb_y;
6669                 }
6670             }
6671
6672             if( eos || s->mb_y >= s->mb_height ) {
6673                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6674                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6675                 return 0;
6676             }
6677         }
6678
6679     } else {
6680         for(;;){
6681             int ret = decode_mb_cavlc(h);
6682
6683             if(ret>=0) hl_decode_mb(h);
6684
6685             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6686                 s->mb_y++;
6687                 ret = decode_mb_cavlc(h);
6688
6689                 if(ret>=0) hl_decode_mb(h);
6690                 s->mb_y--;
6691             }
6692
6693             if(ret<0){
6694                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6695                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6696
6697                 return -1;
6698             }
6699
6700             if(++s->mb_x >= s->mb_width){
6701                 s->mb_x=0;
6702                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6703                 ++s->mb_y;
6704                 if(FIELD_OR_MBAFF_PICTURE) {
6705                     ++s->mb_y;
6706                 }
6707                 if(s->mb_y >= s->mb_height){
6708                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6709
6710                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6711                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6712
6713                         return 0;
6714                     }else{
6715                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6716
6717                         return -1;
6718                     }
6719                 }
6720             }
6721
6722             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6723                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6724                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6725                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6726
6727                     return 0;
6728                 }else{
6729                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6730
6731                     return -1;
6732                 }
6733             }
6734         }
6735     }
6736
6737 #if 0
6738     for(;s->mb_y < s->mb_height; s->mb_y++){
6739         for(;s->mb_x < s->mb_width; s->mb_x++){
6740             int ret= decode_mb(h);
6741
6742             hl_decode_mb(h);
6743
6744             if(ret<0){
6745                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6746                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6747
6748                 return -1;
6749             }
6750
6751             if(++s->mb_x >= s->mb_width){
6752                 s->mb_x=0;
6753                 if(++s->mb_y >= s->mb_height){
6754                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6755                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6756
6757                         return 0;
6758                     }else{
6759                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6760
6761                         return -1;
6762                     }
6763                 }
6764             }
6765
6766             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6767                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6768                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6769
6770                     return 0;
6771                 }else{
6772                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6773
6774                     return -1;
6775                 }
6776             }
6777         }
6778         s->mb_x=0;
6779         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6780     }
6781 #endif
6782     return -1; //not reached
6783 }
6784
6785 static int decode_picture_timing(H264Context *h){
6786     MpegEncContext * const s = &h->s;
6787     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6788         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6789         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6790     }
6791     if(h->sps.pic_struct_present_flag){
6792         unsigned int i, num_clock_ts;
6793         h->sei_pic_struct = get_bits(&s->gb, 4);
6794
6795         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6796             return -1;
6797
6798         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6799
6800         for (i = 0 ; i < num_clock_ts ; i++){
6801             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6802                 unsigned int full_timestamp_flag;
6803                 skip_bits(&s->gb, 2);                 /* ct_type */
6804                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6805                 skip_bits(&s->gb, 5);                 /* counting_type */
6806                 full_timestamp_flag = get_bits(&s->gb, 1);
6807                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6808                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6809                 skip_bits(&s->gb, 8);                 /* n_frames */
6810                 if(full_timestamp_flag){
6811                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6812                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6813                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6814                 }else{
6815                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6816                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6817                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6818                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6819                             if(get_bits(&s->gb, 1))   /* hours_flag */
6820                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6821                         }
6822                     }
6823                 }
6824                 if(h->sps.time_offset_length > 0)
6825                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6826             }
6827         }
6828     }
6829     return 0;
6830 }
6831
6832 static int decode_unregistered_user_data(H264Context *h, int size){
6833     MpegEncContext * const s = &h->s;
6834     uint8_t user_data[16+256];
6835     int e, build, i;
6836
6837     if(size<16)
6838         return -1;
6839
6840     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6841         user_data[i]= get_bits(&s->gb, 8);
6842     }
6843
6844     user_data[i]= 0;
6845     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6846     if(e==1 && build>=0)
6847         h->x264_build= build;
6848
6849     if(s->avctx->debug & FF_DEBUG_BUGS)
6850         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6851
6852     for(; i<size; i++)
6853         skip_bits(&s->gb, 8);
6854
6855     return 0;
6856 }
6857
6858 static int decode_recovery_point(H264Context *h){
6859     MpegEncContext * const s = &h->s;
6860
6861     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6862     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6863
6864     return 0;
6865 }
6866
6867 static int decode_sei(H264Context *h){
6868     MpegEncContext * const s = &h->s;
6869
6870     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6871         int size, type;
6872
6873         type=0;
6874         do{
6875             type+= show_bits(&s->gb, 8);
6876         }while(get_bits(&s->gb, 8) == 255);
6877
6878         size=0;
6879         do{
6880             size+= show_bits(&s->gb, 8);
6881         }while(get_bits(&s->gb, 8) == 255);
6882
6883         switch(type){
6884         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6885             if(decode_picture_timing(h) < 0)
6886                 return -1;
6887             break;
6888         case SEI_TYPE_USER_DATA_UNREGISTERED:
6889             if(decode_unregistered_user_data(h, size) < 0)
6890                 return -1;
6891             break;
6892         case SEI_TYPE_RECOVERY_POINT:
6893             if(decode_recovery_point(h) < 0)
6894                 return -1;
6895             break;
6896         default:
6897             skip_bits(&s->gb, 8*size);
6898         }
6899
6900         //FIXME check bits here
6901         align_get_bits(&s->gb);
6902     }
6903
6904     return 0;
6905 }
6906
6907 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6908     MpegEncContext * const s = &h->s;
6909     int cpb_count, i;
6910     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6911
6912     if(cpb_count > 32U){
6913         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6914         return -1;
6915     }
6916
6917     get_bits(&s->gb, 4); /* bit_rate_scale */
6918     get_bits(&s->gb, 4); /* cpb_size_scale */
6919     for(i=0; i<cpb_count; i++){
6920         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6921         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6922         get_bits1(&s->gb);     /* cbr_flag */
6923     }
6924     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6925     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6926     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6927     sps->time_offset_length = get_bits(&s->gb, 5);
6928     return 0;
6929 }
6930
6931 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6932     MpegEncContext * const s = &h->s;
6933     int aspect_ratio_info_present_flag;
6934     unsigned int aspect_ratio_idc;
6935
6936     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6937
6938     if( aspect_ratio_info_present_flag ) {
6939         aspect_ratio_idc= get_bits(&s->gb, 8);
6940         if( aspect_ratio_idc == EXTENDED_SAR ) {
6941             sps->sar.num= get_bits(&s->gb, 16);
6942             sps->sar.den= get_bits(&s->gb, 16);
6943         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6944             sps->sar=  pixel_aspect[aspect_ratio_idc];
6945         }else{
6946             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6947             return -1;
6948         }
6949     }else{
6950         sps->sar.num=
6951         sps->sar.den= 0;
6952     }
6953 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6954
6955     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6956         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6957     }
6958
6959     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6960         get_bits(&s->gb, 3);    /* video_format */
6961         get_bits1(&s->gb);      /* video_full_range_flag */
6962         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6963             get_bits(&s->gb, 8); /* colour_primaries */
6964             get_bits(&s->gb, 8); /* transfer_characteristics */
6965             get_bits(&s->gb, 8); /* matrix_coefficients */
6966         }
6967     }
6968
6969     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6970         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6971         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6972     }
6973
6974     sps->timing_info_present_flag = get_bits1(&s->gb);
6975     if(sps->timing_info_present_flag){
6976         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6977         sps->time_scale = get_bits_long(&s->gb, 32);
6978         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6979     }
6980
6981     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6982     if(sps->nal_hrd_parameters_present_flag)
6983         if(decode_hrd_parameters(h, sps) < 0)
6984             return -1;
6985     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6986     if(sps->vcl_hrd_parameters_present_flag)
6987         if(decode_hrd_parameters(h, sps) < 0)
6988             return -1;
6989     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6990         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6991     sps->pic_struct_present_flag = get_bits1(&s->gb);
6992
6993     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6994     if(sps->bitstream_restriction_flag){
6995         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6996         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6997         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6998         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6999         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7000         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7001         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7002
7003         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7004             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7005             return -1;
7006         }
7007     }
7008
7009     return 0;
7010 }
7011
7012 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7013                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7014     MpegEncContext * const s = &h->s;
7015     int i, last = 8, next = 8;
7016     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7017     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7018         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7019     else
7020     for(i=0;i<size;i++){
7021         if(next)
7022             next = (last + get_se_golomb(&s->gb)) & 0xff;
7023         if(!i && !next){ /* matrix not written, we use the preset one */
7024             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7025             break;
7026         }
7027         last = factors[scan[i]] = next ? next : last;
7028     }
7029 }
7030
7031 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7032                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7033     MpegEncContext * const s = &h->s;
7034     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7035     const uint8_t *fallback[4] = {
7036         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7037         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7038         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7039         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7040     };
7041     if(get_bits1(&s->gb)){
7042         sps->scaling_matrix_present |= is_sps;
7043         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7044         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7045         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7046         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7047         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7048         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7049         if(is_sps || pps->transform_8x8_mode){
7050             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7051             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7052         }
7053     }
7054 }
7055
7056 static inline int decode_seq_parameter_set(H264Context *h){
7057     MpegEncContext * const s = &h->s;
7058     int profile_idc, level_idc;
7059     unsigned int sps_id;
7060     int i;
7061     SPS *sps;
7062
7063     profile_idc= get_bits(&s->gb, 8);
7064     get_bits1(&s->gb);   //constraint_set0_flag
7065     get_bits1(&s->gb);   //constraint_set1_flag
7066     get_bits1(&s->gb);   //constraint_set2_flag
7067     get_bits1(&s->gb);   //constraint_set3_flag
7068     get_bits(&s->gb, 4); // reserved
7069     level_idc= get_bits(&s->gb, 8);
7070     sps_id= get_ue_golomb_31(&s->gb);
7071
7072     if(sps_id >= MAX_SPS_COUNT) {
7073         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7074         return -1;
7075     }
7076     sps= av_mallocz(sizeof(SPS));
7077     if(sps == NULL)
7078         return -1;
7079
7080     sps->profile_idc= profile_idc;
7081     sps->level_idc= level_idc;
7082
7083     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7084     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7085     sps->scaling_matrix_present = 0;
7086
7087     if(sps->profile_idc >= 100){ //high profile
7088         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7089         if(sps->chroma_format_idc == 3)
7090             sps->residual_color_transform_flag = get_bits1(&s->gb);
7091         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7092         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7093         sps->transform_bypass = get_bits1(&s->gb);
7094         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7095     }else{
7096         sps->chroma_format_idc= 1;
7097     }
7098
7099     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7100     sps->poc_type= get_ue_golomb_31(&s->gb);
7101
7102     if(sps->poc_type == 0){ //FIXME #define
7103         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7104     } else if(sps->poc_type == 1){//FIXME #define
7105         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7106         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7107         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7108         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7109
7110         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7111             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7112             goto fail;
7113         }
7114
7115         for(i=0; i<sps->poc_cycle_length; i++)
7116             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7117     }else if(sps->poc_type != 2){
7118         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7119         goto fail;
7120     }
7121
7122     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7123     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7124         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7125         goto fail;
7126     }
7127     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7128     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7129     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7130     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7131        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7132         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7133         goto fail;
7134     }
7135
7136     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7137     if(!sps->frame_mbs_only_flag)
7138         sps->mb_aff= get_bits1(&s->gb);
7139     else
7140         sps->mb_aff= 0;
7141
7142     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7143
7144 #ifndef ALLOW_INTERLACE
7145     if(sps->mb_aff)
7146         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7147 #endif
7148     sps->crop= get_bits1(&s->gb);
7149     if(sps->crop){
7150         sps->crop_left  = get_ue_golomb(&s->gb);
7151         sps->crop_right = get_ue_golomb(&s->gb);
7152         sps->crop_top   = get_ue_golomb(&s->gb);
7153         sps->crop_bottom= get_ue_golomb(&s->gb);
7154         if(sps->crop_left || sps->crop_top){
7155             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7156         }
7157         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7158             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7159         }
7160     }else{
7161         sps->crop_left  =
7162         sps->crop_right =
7163         sps->crop_top   =
7164         sps->crop_bottom= 0;
7165     }
7166
7167     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7168     if( sps->vui_parameters_present_flag )
7169         decode_vui_parameters(h, sps);
7170
7171     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7172         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7173                sps_id, sps->profile_idc, sps->level_idc,
7174                sps->poc_type,
7175                sps->ref_frame_count,
7176                sps->mb_width, sps->mb_height,
7177                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7178                sps->direct_8x8_inference_flag ? "8B8" : "",
7179                sps->crop_left, sps->crop_right,
7180                sps->crop_top, sps->crop_bottom,
7181                sps->vui_parameters_present_flag ? "VUI" : "",
7182                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7183                );
7184     }
7185
7186     av_free(h->sps_buffers[sps_id]);
7187     h->sps_buffers[sps_id]= sps;
7188     return 0;
7189 fail:
7190     av_free(sps);
7191     return -1;
7192 }
7193
7194 static void
7195 build_qp_table(PPS *pps, int t, int index)
7196 {
7197     int i;
7198     for(i = 0; i < 52; i++)
7199         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7200 }
7201
7202 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7203     MpegEncContext * const s = &h->s;
7204     unsigned int pps_id= get_ue_golomb(&s->gb);
7205     PPS *pps;
7206
7207     if(pps_id >= MAX_PPS_COUNT) {
7208         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7209         return -1;
7210     }
7211
7212     pps= av_mallocz(sizeof(PPS));
7213     if(pps == NULL)
7214         return -1;
7215     pps->sps_id= get_ue_golomb_31(&s->gb);
7216     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7217         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7218         goto fail;
7219     }
7220
7221     pps->cabac= get_bits1(&s->gb);
7222     pps->pic_order_present= get_bits1(&s->gb);
7223     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7224     if(pps->slice_group_count > 1 ){
7225         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7226         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7227         switch(pps->mb_slice_group_map_type){
7228         case 0:
7229 #if 0
7230 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7231 |    run_length[ i ]                                |1  |ue(v)   |
7232 #endif
7233             break;
7234         case 2:
7235 #if 0
7236 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7237 |{                                                  |   |        |
7238 |    top_left_mb[ i ]                               |1  |ue(v)   |
7239 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7240 |   }                                               |   |        |
7241 #endif
7242             break;
7243         case 3:
7244         case 4:
7245         case 5:
7246 #if 0
7247 |   slice_group_change_direction_flag               |1  |u(1)    |
7248 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7249 #endif
7250             break;
7251         case 6:
7252 #if 0
7253 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7254 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7255 |)                                                  |   |        |
7256 |    slice_group_id[ i ]                            |1  |u(v)    |
7257 #endif
7258             break;
7259         }
7260     }
7261     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7262     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7263     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7264         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7265         goto fail;
7266     }
7267
7268     pps->weighted_pred= get_bits1(&s->gb);
7269     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7270     pps->init_qp= get_se_golomb(&s->gb) + 26;
7271     pps->init_qs= get_se_golomb(&s->gb) + 26;
7272     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7273     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7274     pps->constrained_intra_pred= get_bits1(&s->gb);
7275     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7276
7277     pps->transform_8x8_mode= 0;
7278     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7279     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7280     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7281
7282     if(get_bits_count(&s->gb) < bit_length){
7283         pps->transform_8x8_mode= get_bits1(&s->gb);
7284         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7285         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7286     } else {
7287         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7288     }
7289
7290     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7291     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7292     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7293         h->pps.chroma_qp_diff= 1;
7294
7295     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7296         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7297                pps_id, pps->sps_id,
7298                pps->cabac ? "CABAC" : "CAVLC",
7299                pps->slice_group_count,
7300                pps->ref_count[0], pps->ref_count[1],
7301                pps->weighted_pred ? "weighted" : "",
7302                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7303                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7304                pps->constrained_intra_pred ? "CONSTR" : "",
7305                pps->redundant_pic_cnt_present ? "REDU" : "",
7306                pps->transform_8x8_mode ? "8x8DCT" : ""
7307                );
7308     }
7309
7310     av_free(h->pps_buffers[pps_id]);
7311     h->pps_buffers[pps_id]= pps;
7312     return 0;
7313 fail:
7314     av_free(pps);
7315     return -1;
7316 }
7317
7318 /**
7319  * Call decode_slice() for each context.
7320  *
7321  * @param h h264 master context
7322  * @param context_count number of contexts to execute
7323  */
7324 static void execute_decode_slices(H264Context *h, int context_count){
7325     MpegEncContext * const s = &h->s;
7326     AVCodecContext * const avctx= s->avctx;
7327     H264Context *hx;
7328     int i;
7329
7330     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7331         return;
7332     if(context_count == 1) {
7333         decode_slice(avctx, &h);
7334     } else {
7335         for(i = 1; i < context_count; i++) {
7336             hx = h->thread_context[i];
7337             hx->s.error_recognition = avctx->error_recognition;
7338             hx->s.error_count = 0;
7339         }
7340
7341         avctx->execute(avctx, (void *)decode_slice,
7342                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7343
7344         /* pull back stuff from slices to master context */
7345         hx = h->thread_context[context_count - 1];
7346         s->mb_x = hx->s.mb_x;
7347         s->mb_y = hx->s.mb_y;
7348         s->dropable = hx->s.dropable;
7349         s->picture_structure = hx->s.picture_structure;
7350         for(i = 1; i < context_count; i++)
7351             h->s.error_count += h->thread_context[i]->s.error_count;
7352     }
7353 }
7354
7355
7356 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7357     MpegEncContext * const s = &h->s;
7358     AVCodecContext * const avctx= s->avctx;
7359     int buf_index=0;
7360     H264Context *hx; ///< thread context
7361     int context_count = 0;
7362
7363     h->max_contexts = avctx->thread_count;
7364 #if 0
7365     int i;
7366     for(i=0; i<50; i++){
7367         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7368     }
7369 #endif
7370     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7371         h->current_slice = 0;
7372         if (!s->first_field)
7373             s->current_picture_ptr= NULL;
7374     }
7375
7376     for(;;){
7377         int consumed;
7378         int dst_length;
7379         int bit_length;
7380         const uint8_t *ptr;
7381         int i, nalsize = 0;
7382         int err;
7383
7384         if(h->is_avc) {
7385             if(buf_index >= buf_size) break;
7386             nalsize = 0;
7387             for(i = 0; i < h->nal_length_size; i++)
7388                 nalsize = (nalsize << 8) | buf[buf_index++];
7389             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7390                 if(nalsize == 1){
7391                     buf_index++;
7392                     continue;
7393                 }else{
7394                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7395                     break;
7396                 }
7397             }
7398         } else {
7399             // start code prefix search
7400             for(; buf_index + 3 < buf_size; buf_index++){
7401                 // This should always succeed in the first iteration.
7402                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7403                     break;
7404             }
7405
7406             if(buf_index+3 >= buf_size) break;
7407
7408             buf_index+=3;
7409         }
7410
7411         hx = h->thread_context[context_count];
7412
7413         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7414         if (ptr==NULL || dst_length < 0){
7415             return -1;
7416         }
7417         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7418             dst_length--;
7419         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7420
7421         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7422             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7423         }
7424
7425         if (h->is_avc && (nalsize != consumed)){
7426             int i, debug_level = AV_LOG_DEBUG;
7427             for (i = consumed; i < nalsize; i++)
7428                 if (buf[buf_index+i])
7429                     debug_level = AV_LOG_ERROR;
7430             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7431             consumed= nalsize;
7432         }
7433
7434         buf_index += consumed;
7435
7436         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7437            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7438             continue;
7439
7440       again:
7441         err = 0;
7442         switch(hx->nal_unit_type){
7443         case NAL_IDR_SLICE:
7444             if (h->nal_unit_type != NAL_IDR_SLICE) {
7445                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7446                 return -1;
7447             }
7448             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7449         case NAL_SLICE:
7450             init_get_bits(&hx->s.gb, ptr, bit_length);
7451             hx->intra_gb_ptr=
7452             hx->inter_gb_ptr= &hx->s.gb;
7453             hx->s.data_partitioning = 0;
7454
7455             if((err = decode_slice_header(hx, h)))
7456                break;
7457
7458             s->current_picture_ptr->key_frame |=
7459                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7460                     (h->sei_recovery_frame_cnt >= 0);
7461             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7462                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7463                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7464                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7465                && avctx->skip_frame < AVDISCARD_ALL){
7466                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7467                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7468                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7469                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7470                 }else
7471                     context_count++;
7472             }
7473             break;
7474         case NAL_DPA:
7475             init_get_bits(&hx->s.gb, ptr, bit_length);
7476             hx->intra_gb_ptr=
7477             hx->inter_gb_ptr= NULL;
7478             hx->s.data_partitioning = 1;
7479
7480             err = decode_slice_header(hx, h);
7481             break;
7482         case NAL_DPB:
7483             init_get_bits(&hx->intra_gb, ptr, bit_length);
7484             hx->intra_gb_ptr= &hx->intra_gb;
7485             break;
7486         case NAL_DPC:
7487             init_get_bits(&hx->inter_gb, ptr, bit_length);
7488             hx->inter_gb_ptr= &hx->inter_gb;
7489
7490             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7491                && s->context_initialized
7492                && s->hurry_up < 5
7493                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7494                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7495                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7496                && avctx->skip_frame < AVDISCARD_ALL)
7497                 context_count++;
7498             break;
7499         case NAL_SEI:
7500             init_get_bits(&s->gb, ptr, bit_length);
7501             decode_sei(h);
7502             break;
7503         case NAL_SPS:
7504             init_get_bits(&s->gb, ptr, bit_length);
7505             decode_seq_parameter_set(h);
7506
7507             if(s->flags& CODEC_FLAG_LOW_DELAY)
7508                 s->low_delay=1;
7509
7510             if(avctx->has_b_frames < 2)
7511                 avctx->has_b_frames= !s->low_delay;
7512             break;
7513         case NAL_PPS:
7514             init_get_bits(&s->gb, ptr, bit_length);
7515
7516             decode_picture_parameter_set(h, bit_length);
7517
7518             break;
7519         case NAL_AUD:
7520         case NAL_END_SEQUENCE:
7521         case NAL_END_STREAM:
7522         case NAL_FILLER_DATA:
7523         case NAL_SPS_EXT:
7524         case NAL_AUXILIARY_SLICE:
7525             break;
7526         default:
7527             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7528         }
7529
7530         if(context_count == h->max_contexts) {
7531             execute_decode_slices(h, context_count);
7532             context_count = 0;
7533         }
7534
7535         if (err < 0)
7536             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7537         else if(err == 1) {
7538             /* Slice could not be decoded in parallel mode, copy down
7539              * NAL unit stuff to context 0 and restart. Note that
7540              * rbsp_buffer is not transferred, but since we no longer
7541              * run in parallel mode this should not be an issue. */
7542             h->nal_unit_type = hx->nal_unit_type;
7543             h->nal_ref_idc   = hx->nal_ref_idc;
7544             hx = h;
7545             goto again;
7546         }
7547     }
7548     if(context_count)
7549         execute_decode_slices(h, context_count);
7550     return buf_index;
7551 }
7552
7553 /**
7554  * returns the number of bytes consumed for building the current frame
7555  */
7556 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7557         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7558         if(pos+10>buf_size) pos=buf_size; // oops ;)
7559
7560         return pos;
7561 }
7562
7563 static int decode_frame(AVCodecContext *avctx,
7564                              void *data, int *data_size,
7565                              const uint8_t *buf, int buf_size)
7566 {
7567     H264Context *h = avctx->priv_data;
7568     MpegEncContext *s = &h->s;
7569     AVFrame *pict = data;
7570     int buf_index;
7571
7572     s->flags= avctx->flags;
7573     s->flags2= avctx->flags2;
7574
7575    /* end of stream, output what is still in the buffers */
7576     if (buf_size == 0) {
7577         Picture *out;
7578         int i, out_idx;
7579
7580 //FIXME factorize this with the output code below
7581         out = h->delayed_pic[0];
7582         out_idx = 0;
7583         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7584             if(h->delayed_pic[i]->poc < out->poc){
7585                 out = h->delayed_pic[i];
7586                 out_idx = i;
7587             }
7588
7589         for(i=out_idx; h->delayed_pic[i]; i++)
7590             h->delayed_pic[i] = h->delayed_pic[i+1];
7591
7592         if(out){
7593             *data_size = sizeof(AVFrame);
7594             *pict= *(AVFrame*)out;
7595         }
7596
7597         return 0;
7598     }
7599
7600     if(h->is_avc && !h->got_avcC) {
7601         int i, cnt, nalsize;
7602         unsigned char *p = avctx->extradata;
7603         if(avctx->extradata_size < 7) {
7604             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7605             return -1;
7606         }
7607         if(*p != 1) {
7608             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7609             return -1;
7610         }
7611         /* sps and pps in the avcC always have length coded with 2 bytes,
7612            so put a fake nal_length_size = 2 while parsing them */
7613         h->nal_length_size = 2;
7614         // Decode sps from avcC
7615         cnt = *(p+5) & 0x1f; // Number of sps
7616         p += 6;
7617         for (i = 0; i < cnt; i++) {
7618             nalsize = AV_RB16(p) + 2;
7619             if(decode_nal_units(h, p, nalsize) < 0) {
7620                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7621                 return -1;
7622             }
7623             p += nalsize;
7624         }
7625         // Decode pps from avcC
7626         cnt = *(p++); // Number of pps
7627         for (i = 0; i < cnt; i++) {
7628             nalsize = AV_RB16(p) + 2;
7629             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7630                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7631                 return -1;
7632             }
7633             p += nalsize;
7634         }
7635         // Now store right nal length size, that will be use to parse all other nals
7636         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7637         // Do not reparse avcC
7638         h->got_avcC = 1;
7639     }
7640
7641     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7642         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7643             return -1;
7644         h->got_avcC = 1;
7645     }
7646
7647     buf_index=decode_nal_units(h, buf, buf_size);
7648     if(buf_index < 0)
7649         return -1;
7650
7651     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7652         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7653         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7654         return -1;
7655     }
7656
7657     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7658         Picture *out = s->current_picture_ptr;
7659         Picture *cur = s->current_picture_ptr;
7660         int i, pics, cross_idr, out_of_order, out_idx;
7661
7662         s->mb_y= 0;
7663
7664         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7665         s->current_picture_ptr->pict_type= s->pict_type;
7666
7667         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7668             ff_vdpau_h264_set_reference_frames(s);
7669
7670         if(!s->dropable) {
7671             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7672             h->prev_poc_msb= h->poc_msb;
7673             h->prev_poc_lsb= h->poc_lsb;
7674         }
7675         h->prev_frame_num_offset= h->frame_num_offset;
7676         h->prev_frame_num= h->frame_num;
7677
7678         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7679             ff_vdpau_h264_picture_complete(s);
7680
7681         /*
7682          * FIXME: Error handling code does not seem to support interlaced
7683          * when slices span multiple rows
7684          * The ff_er_add_slice calls don't work right for bottom
7685          * fields; they cause massive erroneous error concealing
7686          * Error marking covers both fields (top and bottom).
7687          * This causes a mismatched s->error_count
7688          * and a bad error table. Further, the error count goes to
7689          * INT_MAX when called for bottom field, because mb_y is
7690          * past end by one (callers fault) and resync_mb_y != 0
7691          * causes problems for the first MB line, too.
7692          */
7693         if (!FIELD_PICTURE)
7694             ff_er_frame_end(s);
7695
7696         MPV_frame_end(s);
7697         h->sei_recovery_frame_cnt = -1;
7698         h->sei_dpb_output_delay = 0;
7699         h->sei_cpb_removal_delay = -1;
7700
7701         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7702             /* Wait for second field. */
7703             *data_size = 0;
7704
7705         } else {
7706             cur->repeat_pict = 0;
7707
7708             /* Signal interlacing information externally. */
7709             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7710             if(h->sps.pic_struct_present_flag){
7711                 switch (h->sei_pic_struct)
7712                 {
7713                 case SEI_PIC_STRUCT_FRAME:
7714                     cur->interlaced_frame = 0;
7715                     break;
7716                 case SEI_PIC_STRUCT_TOP_FIELD:
7717                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7718                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7719                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7720                     cur->interlaced_frame = 1;
7721                     break;
7722                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7723                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7724                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7725                     // From these hints, let the applications decide if they apply deinterlacing.
7726                     cur->repeat_pict = 1;
7727                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7728                     break;
7729                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7730                     // Force progressive here, as doubling interlaced frame is a bad idea.
7731                     cur->interlaced_frame = 0;
7732                     cur->repeat_pict = 2;
7733                     break;
7734                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7735                     cur->interlaced_frame = 0;
7736                     cur->repeat_pict = 4;
7737                     break;
7738                 }
7739             }else{
7740                 /* Derive interlacing flag from used decoding process. */
7741                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7742             }
7743
7744             if (cur->field_poc[0] != cur->field_poc[1]){
7745                 /* Derive top_field_first from field pocs. */
7746                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7747             }else{
7748                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7749                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7750                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7751                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7752                         cur->top_field_first = 1;
7753                     else
7754                         cur->top_field_first = 0;
7755                 }else{
7756                     /* Most likely progressive */
7757                     cur->top_field_first = 0;
7758                 }
7759             }
7760
7761         //FIXME do something with unavailable reference frames
7762
7763             /* Sort B-frames into display order */
7764
7765             if(h->sps.bitstream_restriction_flag
7766                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7767                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7768                 s->low_delay = 0;
7769             }
7770
7771             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7772                && !h->sps.bitstream_restriction_flag){
7773                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7774                 s->low_delay= 0;
7775             }
7776
7777             pics = 0;
7778             while(h->delayed_pic[pics]) pics++;
7779
7780             assert(pics <= MAX_DELAYED_PIC_COUNT);
7781
7782             h->delayed_pic[pics++] = cur;
7783             if(cur->reference == 0)
7784                 cur->reference = DELAYED_PIC_REF;
7785
7786             out = h->delayed_pic[0];
7787             out_idx = 0;
7788             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7789                 if(h->delayed_pic[i]->poc < out->poc){
7790                     out = h->delayed_pic[i];
7791                     out_idx = i;
7792                 }
7793             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7794
7795             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7796
7797             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7798                 { }
7799             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7800                || (s->low_delay &&
7801                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7802                  || cur->pict_type == FF_B_TYPE)))
7803             {
7804                 s->low_delay = 0;
7805                 s->avctx->has_b_frames++;
7806             }
7807
7808             if(out_of_order || pics > s->avctx->has_b_frames){
7809                 out->reference &= ~DELAYED_PIC_REF;
7810                 for(i=out_idx; h->delayed_pic[i]; i++)
7811                     h->delayed_pic[i] = h->delayed_pic[i+1];
7812             }
7813             if(!out_of_order && pics > s->avctx->has_b_frames){
7814                 *data_size = sizeof(AVFrame);
7815
7816                 h->outputed_poc = out->poc;
7817                 *pict= *(AVFrame*)out;
7818             }else{
7819                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7820             }
7821         }
7822     }
7823
7824     assert(pict->data[0] || !*data_size);
7825     ff_print_debug_info(s, pict);
7826 //printf("out %d\n", (int)pict->data[0]);
7827 #if 0 //?
7828
7829     /* Return the Picture timestamp as the frame number */
7830     /* we subtract 1 because it is added on utils.c     */
7831     avctx->frame_number = s->picture_number - 1;
7832 #endif
7833     return get_consumed_bytes(s, buf_index, buf_size);
7834 }
#if 0
/**
 * Disabled code: fills h->mb_avail[] from h->slice_table[].
 *
 * Entries 0..2 cover the three positions in the row above (left, centre,
 * right) and entry 3 the position to the left; each is set when that
 * position exists and its slice_table entry equals h->slice_num.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int above_xy= mb_xy - s->mb_stride;

    /* the row above only exists when mb_y is non-zero */
    h->mb_avail[0]= s->mb_y && s->mb_x                 && h->slice_table[above_xy - 1] == h->slice_num;
    h->mb_avail[1]= s->mb_y                            && h->slice_table[above_xy    ] == h->slice_num;
    h->mb_avail[2]= s->mb_y && s->mb_x+1 < s->mb_width && h->slice_table[above_xy + 1] == h->slice_num;

    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7854
7855 #ifdef TEST
7856 #undef printf
7857 #undef random
7858 #define COUNT 8000
7859 #define SIZE (COUNT*40)
7860 int main(void){
7861     int i;
7862     uint8_t temp[SIZE];
7863     PutBitContext pb;
7864     GetBitContext gb;
7865 //    int int_temp[10000];
7866     DSPContext dsp;
7867     AVCodecContext avctx;
7868
7869     dsputil_init(&dsp, &avctx);
7870
7871     init_put_bits(&pb, temp, SIZE);
7872     printf("testing unsigned exp golomb\n");
7873     for(i=0; i<COUNT; i++){
7874         START_TIMER
7875         set_ue_golomb(&pb, i);
7876         STOP_TIMER("set_ue_golomb");
7877     }
7878     flush_put_bits(&pb);
7879
7880     init_get_bits(&gb, temp, 8*SIZE);
7881     for(i=0; i<COUNT; i++){
7882         int j, s;
7883
7884         s= show_bits(&gb, 24);
7885
7886         START_TIMER
7887         j= get_ue_golomb(&gb);
7888         if(j != i){
7889             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7890 //            return -1;
7891         }
7892         STOP_TIMER("get_ue_golomb");
7893     }
7894
7895
7896     init_put_bits(&pb, temp, SIZE);
7897     printf("testing signed exp golomb\n");
7898     for(i=0; i<COUNT; i++){
7899         START_TIMER
7900         set_se_golomb(&pb, i - COUNT/2);
7901         STOP_TIMER("set_se_golomb");
7902     }
7903     flush_put_bits(&pb);
7904
7905     init_get_bits(&gb, temp, 8*SIZE);
7906     for(i=0; i<COUNT; i++){
7907         int j, s;
7908
7909         s= show_bits(&gb, 24);
7910
7911         START_TIMER
7912         j= get_se_golomb(&gb);
7913         if(j != i - COUNT/2){
7914             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7915 //            return -1;
7916         }
7917         STOP_TIMER("get_se_golomb");
7918     }
7919
7920 #if 0
7921     printf("testing 4x4 (I)DCT\n");
7922
7923     DCTELEM block[16];
7924     uint8_t src[16], ref[16];
7925     uint64_t error= 0, max_error=0;
7926
7927     for(i=0; i<COUNT; i++){
7928         int j;
7929 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7930         for(j=0; j<16; j++){
7931             ref[j]= random()%255;
7932             src[j]= random()%255;
7933         }
7934
7935         h264_diff_dct_c(block, src, ref, 4);
7936
7937         //normalize
7938         for(j=0; j<16; j++){
7939 //            printf("%d ", block[j]);
7940             block[j]= block[j]*4;
7941             if(j&1) block[j]= (block[j]*4 + 2)/5;
7942             if(j&4) block[j]= (block[j]*4 + 2)/5;
7943         }
7944 //        printf("\n");
7945
7946         s->dsp.h264_idct_add(ref, block, 4);
7947 /*        for(j=0; j<16; j++){
7948             printf("%d ", ref[j]);
7949         }
7950         printf("\n");*/
7951
7952         for(j=0; j<16; j++){
7953             int diff= FFABS(src[j] - ref[j]);
7954
7955             error+= diff*diff;
7956             max_error= FFMAX(max_error, diff);
7957         }
7958     }
7959     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7960     printf("testing quantizer\n");
7961     for(qp=0; qp<52; qp++){
7962         for(i=0; i<16; i++)
7963             src1_block[i]= src2_block[i]= random()%255;
7964
7965     }
7966     printf("Testing NAL layer\n");
7967
7968     uint8_t bitstream[COUNT];
7969     uint8_t nal[COUNT*2];
7970     H264Context h;
7971     memset(&h, 0, sizeof(H264Context));
7972
7973     for(i=0; i<COUNT; i++){
7974         int zeros= i;
7975         int nal_length;
7976         int consumed;
7977         int out_length;
7978         uint8_t *out;
7979         int j;
7980
7981         for(j=0; j<COUNT; j++){
7982             bitstream[j]= (random() % 255) + 1;
7983         }
7984
7985         for(j=0; j<zeros; j++){
7986             int pos= random() % COUNT;
7987             while(bitstream[pos] == 0){
7988                 pos++;
7989                 pos %= COUNT;
7990             }
7991             bitstream[pos]=0;
7992         }
7993
7994         START_TIMER
7995
7996         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7997         if(nal_length<0){
7998             printf("encoding failed\n");
7999             return -1;
8000         }
8001
8002         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8003
8004         STOP_TIMER("NAL")
8005
8006         if(out_length != COUNT){
8007             printf("incorrect length %d %d\n", out_length, COUNT);
8008             return -1;
8009         }
8010
8011         if(consumed != nal_length){
8012             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8013             return -1;
8014         }
8015
8016         if(memcmp(bitstream, out, COUNT)){
8017             printf("mismatch\n");
8018             return -1;
8019         }
8020     }
8021 #endif
8022
8023     printf("Testing RBSP\n");
8024
8025
8026     return 0;
8027 }
8028 #endif /* TEST */
8029
8030
8031 static av_cold int decode_end(AVCodecContext *avctx)
8032 {
8033     H264Context *h = avctx->priv_data;
8034     MpegEncContext *s = &h->s;
8035     int i;
8036
8037     av_freep(&h->rbsp_buffer[0]);
8038     av_freep(&h->rbsp_buffer[1]);
8039     free_tables(h); //FIXME cleanup init stuff perhaps
8040
8041     for(i = 0; i < MAX_SPS_COUNT; i++)
8042         av_freep(h->sps_buffers + i);
8043
8044     for(i = 0; i < MAX_PPS_COUNT; i++)
8045         av_freep(h->pps_buffers + i);
8046
8047     MPV_common_end(s);
8048
8049 //    memset(h, 0, sizeof(H264Context));
8050
8051     return 0;
8052 }
8053
8054
8055 AVCodec h264_decoder = {
8056     "h264",
8057     CODEC_TYPE_VIDEO,
8058     CODEC_ID_H264,
8059     sizeof(H264Context),
8060     decode_init,
8061     NULL,
8062     decode_end,
8063     decode_frame,
8064     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8065     .flush= flush_dpb,
8066     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8067 };
8068
8069 #if CONFIG_H264_VDPAU_DECODER
8070 AVCodec h264_vdpau_decoder = {
8071     "h264_vdpau",
8072     CODEC_TYPE_VIDEO,
8073     CODEC_ID_H264,
8074     sizeof(H264Context),
8075     decode_init,
8076     NULL,
8077     decode_end,
8078     decode_frame,
8079     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8080     .flush= flush_dpb,
8081     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8082 };
8083 #endif
8084
8085 #if CONFIG_SVQ3_DECODER
8086 #include "svq3.c"
8087 #endif