git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /*
      * File-scope VLC objects. Each one is declared together with a statically
      * sized VLC_TYPE array and a constant that repeats that array's size.
      * NOTE(review): presumably the arrays are the backing storage handed to the
      * VLC initialisation code -- that code is not visible here, confirm there.
      */

     /* The first dimension of coeff_token_vlc_tables (520+332+280+256) equals the
      * sum of the four entries of coeff_token_vlc_tables_size. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
     /* 15 VLC objects, one 512-entry table row each. */
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
     /* 3 VLC objects, one 8-entry table row each. */
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
     /* 6 VLC objects, one 8-entry table row each; run7_vlc below is declared
      * separately with its own 96-entry table. */
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
     /* Forward declarations of file-local (static) functions; their definitions
      * are not part of the code shown up to this point. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  /* rem6[i] == i % 6 for i = 0..51, one period per row. */
  static const uint8_t rem6[52] = {
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };

  /* div6[i] == i / 6 for i = 0..51, one quotient value per row. */
  static const uint8_t div6[52] = {
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  96
  /*
   * Index rows used by fill_caches() to address data of the left neighbour(s).
   * Row 0 is the default; rows 1..3 are selected in MBAFF frames when the left
   * macroblock pair and the current macroblock differ in interlacing.
   */
  static const int left_block_options[4][8] = {
      { 0, 1, 2, 3, 7, 10, 8, 11 },  /* default                                */
      { 2, 2, 3, 3, 8, 11, 8, 11 },  /* current is a frame MB, bottom of pair  */
      { 0, 0, 1, 1, 7, 10, 7, 10 },  /* current is a frame MB, top of pair     */
      { 0, 2, 0, 2, 7, 10, 7, 10 },  /* current is a field MB                  */
  };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
         /*
          * Loads information about the neighbours of the current macroblock
          * (h->mb_xy) into the cache members of h.
          *
          * h            decoder context
          * mb_type      type flags of the current macroblock
          * for_deblock  non-zero when called on behalf of the deblocking filter:
          *              the function may then return early (see below), the
          *              topleft/topright neighbours are treated as absent and the
          *              intra prediction setup is skipped
          *
          * On every call that gets past the early return it stores h->top_mb_xy
          * and h->left_mb_xy[], the neighbour entries of non_zero_count_cache and
          * h->neighbor_transform_size. Depending on mb_type, for_deblock and
          * h->pps.cabac it additionally fills the *_samples_available masks,
          * intra4x4_pred_mode_cache, top_cbp/left_cbp, mv_cache, ref_cache,
          * mvd_cache and direct_cache.
          */
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
         // Early exit for deblocking outside MBAFF frames: nothing is loaded when
         // slice_num is 1 or when the top neighbour has the same slice_table entry
         // as the current macroblock. Note that top_mb_xy/left_mb_xy are not
         // stored in that case.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
         // Default neighbour positions; the FRAME_MBAFF block below adjusts them.
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
             // pair_xy uses (mb_y & ~1), i.e. the even row of the current pair of
             // rows. Each *_mb_frame_flag is 1 when the respective macroblock type
             // is not interlaced.
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             // Left pair and current macroblock differ in interlacing: address the
             // left neighbour(s) relative to pair_xy and switch to another row of
             // left_block_options.
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
             // Deblocking: topleft/topright are never used; top/left count as
             // present whenever their slice_table entry is below 0xFFFF.
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values were changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
             // Decoding: a neighbour is used only when its slice_table entry equals
             // the current slice_num; otherwise its type is treated as 0.
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
             // type_mask selects which neighbour types count as usable: all bits
             // (-1) normally, only IS_INTRA(-1) with constrained_intra_pred.
             // NOTE(review): presumably IS_INTRA(-1) yields the set of intra type
             // bits -- confirm against the macro definition.
             // The *_samples_available masks start from "everything available"
             // and get bits cleared for each unusable neighbour.
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
                 // Neighbouring intra4x4 modes: copied from an intra4x4 neighbour,
                 // otherwise -1 when the neighbour is unusable under type_mask and
                 // 2 when it is usable but not intra4x4.
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // Non-zero counts of the top neighbour. When it is absent the entries are
         // set to 0 for CABAC non-intra macroblocks and to 64 otherwise.
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
         // Same for the two left neighbours; left_block selects which of their
         // entries are read.
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
         // CABAC only: coded block pattern of the top and left neighbours. An
         // absent neighbour yields 0x1C0 for intra macroblocks and 0 otherwise.
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
         // Motion vectors and reference indices of the neighbours, one pass per
         // reference list.
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
                 // Top neighbour: its mvs are read at row offset 3*b_stride and its
                 // ref indices at row offset b8_stride. Without list usage the mvs
                 // are zeroed and the refs become LIST_NOT_USED (neighbour present)
                 // or PART_NOT_AVAILABLE (neighbour absent).
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
                 // Left neighbours: column offset 3 (mvs) / 1 (refs) of each left
                 // macroblock, rows chosen through left_block.
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
                 // The remaining neighbours are skipped for deblocking, and outside
                 // MBAFF frames also for direct macroblocks when
                 // direct_spatial_mv_pred is not set.
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
                 // topleft_partition is -1 (all bits set) by default, so the masked
                 // terms below add 2*b_stride / b8_stride; it is 0 in the one MBAFF
                 // case set up above, which drops those terms.
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
                 // Fixed cache positions that are always marked unavailable with a
                 // zero motion vector.
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
                     // B slices: direct_cache is cleared for the current macroblock
                     // and then loaded with the direct flags of the top and left
                     // neighbours.
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
                 // MBAFF: rescale the cached neighbour entries whose interlacing
                 // differs from the current macroblock. For a field macroblock
                 // (MB_FIELD) non-interlaced neighbours get their ref index
                 // doubled and component [1] of mv/mvd halved; otherwise
                 // interlaced neighbours get the ref index halved and component
                 // [1] doubled. Entries with a negative ref index are left alone.
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
         // Number (0..2) of top/left[0] neighbours for which IS_8x8DCT() is true.
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; relies on the stride macro defined by
 * h264_luma_dc_dequant_idct_c().
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies along one direction, results go to temp[] */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04 +sum15;
        out[1]= diff04+diff15;
        out[2]= diff04-diff15;
        out[3]= sum04 -sum15;
    }

    /* second pass: other direction, results are halved */
    for(i=0; i<4; i++){
        const int *in= temp + i;
        DCTELEM *out= block + x_offset[i];
        const int sum02 = in[4*0] + in[4*2];
        const int diff02= in[4*0] - in[4*2];
        const int diff13= in[4*1] - in[4*3];
        const int sum13 = in[4*1] + in[4*3];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values are located at block[0], block[16], block[32] and block[48].
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + stride;
    const int sum0 = row0[0] + row0[xStride];
    const int diff0= row0[0] - row0[xStride];
    const int sum1 = row1[0] + row1[xStride];
    const int diff1= row1[0] - row1[xStride];

    row0[0      ]= (sum0  + sum1 );
    row0[xStride]= (diff0 + diff1);
    row1[0      ]= (sum0  - sum1 );
    row1[xStride]= (diff0 - diff1);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1591                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1592                            int src_x_offset, int src_y_offset,
1593                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1594     MpegEncContext * const s = &h->s;
1595     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1596     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1597     const int luma_xy= (mx&3) + ((my&3)<<2);
1598     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1599     uint8_t * src_cb, * src_cr;
1600     int extra_width= h->emu_edge_width;
1601     int extra_height= h->emu_edge_height;
1602     int emu=0;
1603     const int full_mx= mx>>2;
1604     const int full_my= my>>2;
1605     const int pic_width  = 16*s->mb_width;
1606     const int pic_height = 16*s->mb_height >> MB_FIELD;
1607
1608     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1609         return;
1610
1611     if(mx&7) extra_width -= 3;
1612     if(my&7) extra_height -= 3;
1613
1614     if(   full_mx < 0-extra_width
1615        || full_my < 0-extra_height
1616        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1617        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1618         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1619             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1620         emu=1;
1621     }
1622
1623     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1624     if(!square){
1625         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1626     }
1627
1628     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1629
1630     if(MB_FIELD){
1631         // chroma offset when predicting from a field of opposite parity
1632         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1633         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1634     }
1635     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1637
1638     if(emu){
1639         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1640             src_cb= s->edge_emu_buffer;
1641     }
1642     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1643
1644     if(emu){
1645         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1646             src_cr= s->edge_emu_buffer;
1647     }
1648     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1649 }
1650
1651 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1652                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1653                            int x_offset, int y_offset,
1654                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1655                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1656                            int list0, int list1){
1657     MpegEncContext * const s = &h->s;
1658     qpel_mc_func *qpix_op=  qpix_put;
1659     h264_chroma_mc_func chroma_op= chroma_put;
1660
1661     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1662     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1664     x_offset += 8*s->mb_x;
1665     y_offset += 8*(s->mb_y >> MB_FIELD);
1666
1667     if(list0){
1668         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1669         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1670                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1671                            qpix_op, chroma_op);
1672
1673         qpix_op=  qpix_avg;
1674         chroma_op= chroma_avg;
1675     }
1676
1677     if(list1){
1678         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1679         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1680                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1681                            qpix_op, chroma_op);
1682     }
1683 }
1684
1685 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1686                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1687                            int x_offset, int y_offset,
1688                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1689                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1690                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1691                            int list0, int list1){
1692     MpegEncContext * const s = &h->s;
1693
1694     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1695     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1697     x_offset += 8*s->mb_x;
1698     y_offset += 8*(s->mb_y >> MB_FIELD);
1699
1700     if(list0 && list1){
1701         /* don't optimize for luma-only case, since B-frames usually
1702          * use implicit weights => chroma too. */
1703         uint8_t *tmp_cb = s->obmc_scratchpad;
1704         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1705         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1706         int refn0 = h->ref_cache[0][ scan8[n] ];
1707         int refn1 = h->ref_cache[1][ scan8[n] ];
1708
1709         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1710                     dest_y, dest_cb, dest_cr,
1711                     x_offset, y_offset, qpix_put, chroma_put);
1712         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1713                     tmp_y, tmp_cb, tmp_cr,
1714                     x_offset, y_offset, qpix_put, chroma_put);
1715
1716         if(h->use_weight == 2){
1717             int weight0 = h->implicit_weight[refn0][refn1];
1718             int weight1 = 64 - weight0;
1719             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1722         }else{
1723             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1724                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1725                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1726             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1727                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1728                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1729             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1730                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1731                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1732         }
1733     }else{
1734         int list = list1 ? 1 : 0;
1735         int refn = h->ref_cache[list][ scan8[n] ];
1736         Picture *ref= &h->ref_list[list][refn];
1737         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1738                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1739                     qpix_put, chroma_put);
1740
1741         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1742                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1743         if(h->use_weight_chroma){
1744             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1745                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1746             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1748         }
1749     }
1750 }
1751
1752 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1753                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1754                            int x_offset, int y_offset,
1755                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1756                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1757                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1758                            int list0, int list1){
1759     if((h->use_weight==2 && list0 && list1
1760         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1761        || h->use_weight==1)
1762         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1763                          x_offset, y_offset, qpix_put, chroma_put,
1764                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1765     else
1766         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1767                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1768 }
1769
1770 static inline void prefetch_motion(H264Context *h, int list){
1771     /* fetch pixels for estimated mv 4 macroblocks ahead
1772      * optimized for 64byte cache lines */
1773     MpegEncContext * const s = &h->s;
1774     const int refn = h->ref_cache[list][scan8[0]];
1775     if(refn >= 0){
1776         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1777         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1778         uint8_t **src= h->ref_list[list][refn].data;
1779         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1780         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1781         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1782         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1783     }
1784 }
1785
1786 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1788                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1789                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1790     MpegEncContext * const s = &h->s;
1791     const int mb_xy= h->mb_xy;
1792     const int mb_type= s->current_picture.mb_type[mb_xy];
1793
1794     assert(IS_INTER(mb_type));
1795
1796     prefetch_motion(h, 0);
1797
1798     if(IS_16X16(mb_type)){
1799         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1800                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1801                 &weight_op[0], &weight_avg[0],
1802                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1803     }else if(IS_16X8(mb_type)){
1804         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1805                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1806                 &weight_op[1], &weight_avg[1],
1807                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1808         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1809                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1810                 &weight_op[1], &weight_avg[1],
1811                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1812     }else if(IS_8X16(mb_type)){
1813         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1815                 &weight_op[2], &weight_avg[2],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1818                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1819                 &weight_op[2], &weight_avg[2],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else{
1822         int i;
1823
1824         assert(IS_8X8(mb_type));
1825
1826         for(i=0; i<4; i++){
1827             const int sub_mb_type= h->sub_mb_type[i];
1828             const int n= 4*i;
1829             int x_offset= (i&1)<<2;
1830             int y_offset= (i&2)<<1;
1831
1832             if(IS_SUB_8X8(sub_mb_type)){
1833                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1834                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1835                     &weight_op[3], &weight_avg[3],
1836                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1837             }else if(IS_SUB_8X4(sub_mb_type)){
1838                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1840                     &weight_op[4], &weight_avg[4],
1841                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1842                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1843                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1844                     &weight_op[4], &weight_avg[4],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_4X8(sub_mb_type)){
1847                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1849                     &weight_op[5], &weight_avg[5],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1852                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1853                     &weight_op[5], &weight_avg[5],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else{
1856                 int j;
1857                 assert(IS_SUB_4X4(sub_mb_type));
1858                 for(j=0; j<4; j++){
1859                     int sub_x_offset= x_offset + 2*(j&1);
1860                     int sub_y_offset= y_offset +   (j&2);
1861                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1862                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1863                         &weight_op[6], &weight_avg[6],
1864                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1865                 }
1866             }
1867         }
1868     }
1869
1870     prefetch_motion(h, 1);
1871 }
1872
1873 static av_cold void decode_init_vlc(void){
1874     static int done = 0;
1875
1876     if (!done) {
1877         int i;
1878         int offset;
1879         done = 1;
1880
1881         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1882         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1883         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1884                  &chroma_dc_coeff_token_len [0], 1, 1,
1885                  &chroma_dc_coeff_token_bits[0], 1, 1,
1886                  INIT_VLC_USE_NEW_STATIC);
1887
1888         offset = 0;
1889         for(i=0; i<4; i++){
1890             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1891             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1892             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1893                      &coeff_token_len [i][0], 1, 1,
1894                      &coeff_token_bits[i][0], 1, 1,
1895                      INIT_VLC_USE_NEW_STATIC);
1896             offset += coeff_token_vlc_tables_size[i];
1897         }
1898         /*
1899          * This is a one time safety check to make sure that
1900          * the packed static coeff_token_vlc table sizes
1901          * were initialized correctly.
1902          */
1903         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1904
1905         for(i=0; i<3; i++){
1906             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1907             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1908             init_vlc(&chroma_dc_total_zeros_vlc[i],
1909                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1910                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1911                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1912                      INIT_VLC_USE_NEW_STATIC);
1913         }
1914         for(i=0; i<15; i++){
1915             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1916             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1917             init_vlc(&total_zeros_vlc[i],
1918                      TOTAL_ZEROS_VLC_BITS, 16,
1919                      &total_zeros_len [i][0], 1, 1,
1920                      &total_zeros_bits[i][0], 1, 1,
1921                      INIT_VLC_USE_NEW_STATIC);
1922         }
1923
1924         for(i=0; i<6; i++){
1925             run_vlc[i].table = run_vlc_tables[i];
1926             run_vlc[i].table_allocated = run_vlc_tables_size;
1927             init_vlc(&run_vlc[i],
1928                      RUN_VLC_BITS, 7,
1929                      &run_len [i][0], 1, 1,
1930                      &run_bits[i][0], 1, 1,
1931                      INIT_VLC_USE_NEW_STATIC);
1932         }
1933         run7_vlc.table = run7_vlc_table,
1934         run7_vlc.table_allocated = run7_vlc_table_size;
1935         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1936                  &run_len [6][0], 1, 1,
1937                  &run_bits[6][0], 1, 1,
1938                  INIT_VLC_USE_NEW_STATIC);
1939     }
1940 }
1941
1942 static void free_tables(H264Context *h){
1943     int i;
1944     H264Context *hx;
1945     av_freep(&h->intra4x4_pred_mode);
1946     av_freep(&h->chroma_pred_mode_table);
1947     av_freep(&h->cbp_table);
1948     av_freep(&h->mvd_table[0]);
1949     av_freep(&h->mvd_table[1]);
1950     av_freep(&h->direct_table);
1951     av_freep(&h->non_zero_count);
1952     av_freep(&h->slice_table_base);
1953     h->slice_table= NULL;
1954
1955     av_freep(&h->mb2b_xy);
1956     av_freep(&h->mb2b8_xy);
1957
1958     for(i = 0; i < h->s.avctx->thread_count; i++) {
1959         hx = h->thread_context[i];
1960         if(!hx) continue;
1961         av_freep(&hx->top_borders[1]);
1962         av_freep(&hx->top_borders[0]);
1963         av_freep(&hx->s.obmc_scratchpad);
1964     }
1965 }
1966
1967 static void init_dequant8_coeff_table(H264Context *h){
1968     int i,q,x;
1969     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1970     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1971     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1972
1973     for(i=0; i<2; i++ ){
1974         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1975             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1976             break;
1977         }
1978
1979         for(q=0; q<52; q++){
1980             int shift = div6[q];
1981             int idx = rem6[q];
1982             for(x=0; x<64; x++)
1983                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1984                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1985                     h->pps.scaling_matrix8[i][x]) << shift;
1986         }
1987     }
1988 }
1989
1990 static void init_dequant4_coeff_table(H264Context *h){
1991     int i,j,q,x;
1992     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1993     for(i=0; i<6; i++ ){
1994         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1995         for(j=0; j<i; j++){
1996             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1997                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1998                 break;
1999             }
2000         }
2001         if(j<i)
2002             continue;
2003
2004         for(q=0; q<52; q++){
2005             int shift = div6[q] + 2;
2006             int idx = rem6[q];
2007             for(x=0; x<16; x++)
2008                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2009                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2010                     h->pps.scaling_matrix4[i][x]) << shift;
2011         }
2012     }
2013 }
2014
2015 static void init_dequant_tables(H264Context *h){
2016     int i,x;
2017     init_dequant4_coeff_table(h);
2018     if(h->pps.transform_8x8_mode)
2019         init_dequant8_coeff_table(h);
2020     if(h->sps.transform_bypass){
2021         for(i=0; i<6; i++)
2022             for(x=0; x<16; x++)
2023                 h->dequant4_coeff[i][0][x] = 1<<6;
2024         if(h->pps.transform_8x8_mode)
2025             for(i=0; i<2; i++)
2026                 for(x=0; x<64; x++)
2027                     h->dequant8_coeff[i][0][x] = 1<<6;
2028     }
2029 }
2030
2031
2032 /**
2033  * allocates tables.
2034  * needs width/height
2035  */
2036 static int alloc_tables(H264Context *h){
2037     MpegEncContext * const s = &h->s;
2038     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2039     int x,y;
2040
2041     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2042
2043     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2044     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2045     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2046
2047     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2048     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2050     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2051
2052     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2053     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2054
2055     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2056     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2057     for(y=0; y<s->mb_height; y++){
2058         for(x=0; x<s->mb_width; x++){
2059             const int mb_xy= x + y*s->mb_stride;
2060             const int b_xy = 4*x + 4*y*h->b_stride;
2061             const int b8_xy= 2*x + 2*y*h->b8_stride;
2062
2063             h->mb2b_xy [mb_xy]= b_xy;
2064             h->mb2b8_xy[mb_xy]= b8_xy;
2065         }
2066     }
2067
2068     s->obmc_scratchpad = NULL;
2069
2070     if(!h->dequant4_coeff[0])
2071         init_dequant_tables(h);
2072
2073     return 0;
2074 fail:
2075     free_tables(h);
2076     return -1;
2077 }
2078
2079 /**
2080  * Mimic alloc_tables(), but for every context thread.
2081  */
2082 static void clone_tables(H264Context *dst, H264Context *src){
2083     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2084     dst->non_zero_count           = src->non_zero_count;
2085     dst->slice_table              = src->slice_table;
2086     dst->cbp_table                = src->cbp_table;
2087     dst->mb2b_xy                  = src->mb2b_xy;
2088     dst->mb2b8_xy                 = src->mb2b8_xy;
2089     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2090     dst->mvd_table[0]             = src->mvd_table[0];
2091     dst->mvd_table[1]             = src->mvd_table[1];
2092     dst->direct_table             = src->direct_table;
2093
2094     dst->s.obmc_scratchpad = NULL;
2095     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2096 }
2097
2098 /**
2099  * Init context
2100  * Allocate buffers which are not shared amongst multiple threads.
2101  */
2102 static int context_init(H264Context *h){
2103     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2105
2106     return 0;
2107 fail:
2108     return -1; // free_tables will clean up for us
2109 }
2110
2111 static av_cold void common_init(H264Context *h){
2112     MpegEncContext * const s = &h->s;
2113
2114     s->width = s->avctx->width;
2115     s->height = s->avctx->height;
2116     s->codec_id= s->avctx->codec->id;
2117
2118     ff_h264_pred_init(&h->hpc, s->codec_id);
2119
2120     h->dequant_coeff_pps= -1;
2121     s->unrestricted_mv=1;
2122     s->decode=1; //FIXME
2123
2124     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2125     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2126 }
2127
2128 static av_cold int decode_init(AVCodecContext *avctx){
2129     H264Context *h= avctx->priv_data;
2130     MpegEncContext * const s = &h->s;
2131
2132     MPV_decode_defaults(s);
2133
2134     s->avctx = avctx;
2135     common_init(h);
2136
2137     s->out_format = FMT_H264;
2138     s->workaround_bugs= avctx->workaround_bugs;
2139
2140     // set defaults
2141 //    s->decode_mb= ff_h263_decode_mb;
2142     s->quarter_sample = 1;
2143     s->low_delay= 1;
2144
2145     if(avctx->codec_id == CODEC_ID_SVQ3)
2146         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2147     else
2148         avctx->pix_fmt= PIX_FMT_YUV420P;
2149
2150     decode_init_vlc();
2151
2152     if(avctx->extradata_size > 0 && avctx->extradata &&
2153        *(char *)avctx->extradata == 1){
2154         h->is_avc = 1;
2155         h->got_avcC = 0;
2156     } else {
2157         h->is_avc = 0;
2158     }
2159
2160     h->thread_context[0] = h;
2161     h->outputed_poc = INT_MIN;
2162     h->prev_poc_msb= 1<<16;
2163     return 0;
2164 }
2165
2166 static int frame_start(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168     int i;
2169
2170     if(MPV_frame_start(s, s->avctx) < 0)
2171         return -1;
2172     ff_er_frame_start(s);
2173     /*
2174      * MPV_frame_start uses pict_type to derive key_frame.
2175      * This is incorrect for H.264; IDR markings must be used.
2176      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2177      * See decode_nal_units().
2178      */
2179     s->current_picture_ptr->key_frame= 0;
2180
2181     assert(s->linesize && s->uvlinesize);
2182
2183     for(i=0; i<16; i++){
2184         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2185         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2186     }
2187     for(i=0; i<4; i++){
2188         h->block_offset[16+i]=
2189         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2190         h->block_offset[24+16+i]=
2191         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2192     }
2193
2194     /* can't be in alloc_tables because linesize isn't known there.
2195      * FIXME: redo bipred weight to not require extra buffer? */
2196     for(i = 0; i < s->avctx->thread_count; i++)
2197         if(!h->thread_context[i]->s.obmc_scratchpad)
2198             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2199
2200     /* some macroblocks will be accessed before they're available */
2201     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2202         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2203
2204 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2205
2206     // We mark the current picture as non-reference after allocating it, so
2207     // that if we break out due to an error it can be released automatically
2208     // in the next MPV_frame_start().
2209     // SVQ3 as well as most other codecs have only last/next/current and thus
2210     // get released even with set reference, besides SVQ3 and others do not
2211     // mark frames as reference later "naturally".
2212     if(s->codec_id != CODEC_ID_SVQ3)
2213         s->current_picture_ptr->reference= 0;
2214
2215     s->current_picture_ptr->field_poc[0]=
2216     s->current_picture_ptr->field_poc[1]= INT_MAX;
2217     assert(s->current_picture_ptr->long_ref==0);
2218
2219     return 0;
2220 }
2221
/**
 * Save the border samples of the macroblock that was just reconstructed.
 * The last sample row of each plane is copied into h->top_borders[] for
 * column s->mb_x (16 luma bytes, then 8 Cb and 8 Cr bytes), and the last
 * sample column of each plane is copied into h->left_border[]. The previous
 * content of the top border supplies the corner entry of the left border
 * before it is overwritten.
 * The caller in this file invokes this before the loop filter runs.
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize  line sizes to step from one row to the next
 * @param simple nonzero skips the MBAFF handling and always stores chroma
 */
2222 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2223     MpegEncContext * const s = &h->s;
2224     int i;
2225     int step    = 1;
2226     int offset  = 1;
2227     int uvoffset= 1;
2228     int top_idx = 1;
2229     int skiplast= 0;
2230
         // step back one row, so that row index i below addresses macroblock row i-1
2231     src_y  -=   linesize;
2232     src_cb -= uvlinesize;
2233     src_cr -= uvlinesize;
2234
2235     if(!simple && FRAME_MBAFF){
2236         if(s->mb_y&1){
2237             offset  = MB_MBAFF ? 1 : 17;
2238             uvoffset= MB_MBAFF ? 1 : 9;
2239             if(!MB_MBAFF){
                     // also store the second to last row in top_borders[0]
2240                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2242                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2243                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2245                 }
2246             }
2247         }else{
2248             if(!MB_MBAFF){
2249                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2250                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2251                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2252                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2253                 }
                     // the loops below then leave out the last row of the macroblock
2254                 skiplast= 1;
2255             }
2256             offset  =
2257             uvoffset=
2258             top_idx = MB_MBAFF ? 0 : 1;
2259         }
2260         step= MB_MBAFF ? 2 : 1;
2261     }
2262
2263     // There are two lines saved, the line above the top macroblock of a pair,
2264     // and the line above the bottom macroblock
         // corner entry: last luma sample of the old top border, read before it is overwritten
2265     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2266     for(i=1; i<17 - skiplast; i++){
2267         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2268     }
2269
         // last luma row of the macroblock becomes the new top border
2270     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2272
2273     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
             // same procedure for Cb (left_border from 34) and Cr (left_border from 34+18)
2274         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2275         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2276         for(i=1; i<9 - skiplast; i++){
2277             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2278             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2279         }
2280         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2282     }
2283 }
2284
/**
 * Exchange the samples left of and above the current macroblock in the
 * picture with the copies kept in h->left_border[] / h->top_borders[].
 * The left column is only touched if deblock_left is set, the top row only
 * if deblock_top is set; with h->deblocking_filter == 2 these depend on the
 * neighbour belonging to the same slice, otherwise on the macroblock position.
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param xchg if nonzero the picture samples are also stored back into the
 *             border buffers (a real swap); if zero the border buffers are
 *             left unchanged for the entries exchanged with the xchg argument.
 *             Entries exchanged with a literal 1 are always swapped.
 * @param simple nonzero skips the MBAFF handling and always processes chroma
 */
2285 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2286     MpegEncContext * const s = &h->s;
2287     int temp8, i;
2288     uint64_t temp64;
2289     int deblock_left;
2290     int deblock_top;
2291     int mb_xy;
2292     int step    = 1;
2293     int offset  = 1;
2294     int uvoffset= 1;
2295     int top_idx = 1;
2296
         // same offset/step/top_idx selection as in backup_mb_border()
2297     if(!simple && FRAME_MBAFF){
2298         if(s->mb_y&1){
2299             offset  = MB_MBAFF ? 1 : 17;
2300             uvoffset= MB_MBAFF ? 1 : 9;
2301         }else{
2302             offset  =
2303             uvoffset=
2304             top_idx = MB_MBAFF ? 0 : 1;
2305         }
2306         step= MB_MBAFF ? 2 : 1;
2307     }
2308
2309     if(h->deblocking_filter == 2) {
2310         mb_xy = h->mb_xy;
2311         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2312         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2313     } else {
2314         deblock_left = (s->mb_x > 0);
2315         deblock_top =  (s->mb_y > !!MB_FIELD);
2316     }
2317
         // point at the sample above and to the left of the macroblock
2318     src_y  -=   linesize + 1;
2319     src_cb -= uvlinesize + 1;
2320     src_cr -= uvlinesize + 1;
2321
     // b always receives the old value of a; a receives the old value of b only if xchg is nonzero
2322 #define XCHG(a,b,t,xchg)\
2323 t= a;\
2324 if(xchg)\
2325     a= b;\
2326 b= t;
2327
2328     if(deblock_left){
             // row 0 is the corner sample, it is skipped when the top is not exchanged
2329         for(i = !deblock_top; i<16; i++){
2330             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2331         }
2332         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2333     }
2334
2335     if(deblock_top){
2336         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
             // first 8 samples above the macroblock to the right, if there is one
2338         if(s->mb_x+1 < s->mb_width){
2339             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2340         }
2341     }
2342
2343     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2344         if(deblock_left){
2345             for(i = !deblock_top; i<8; i++){
2346                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2347                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2348             }
2349             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2350             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2351         }
2352         if(deblock_top){
2353             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2355         }
2356     }
2357 }
2358
/**
 * Reconstruct the current macroblock into the current picture.
 * Runs intra prediction or motion compensation, adds the residual (IDCT,
 * plain pixel addition under transform bypass, or the SVQ3 routines for
 * non-H.264 codec ids) and, if h->deblocking_filter is set, saves the
 * macroblock borders and runs the loop filter on this macroblock.
 * @param simple nonzero selects the fast path: the branches guarded by
 *               !simple (field macroblocks / MBAFF, intra PCM, transform
 *               bypass) are never taken, the codec is treated as H.264 and
 *               chroma is always processed regardless of CODEC_FLAG_GRAY
 */
2359 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2360     MpegEncContext * const s = &h->s;
2361     const int mb_x= s->mb_x;
2362     const int mb_y= s->mb_y;
2363     const int mb_xy= h->mb_xy;
2364     const int mb_type= s->current_picture.mb_type[mb_xy];
2365     uint8_t  *dest_y, *dest_cb, *dest_cr;
2366     int linesize, uvlinesize /*dct_offset*/;
2367     int i;
2368     int *block_offset = &h->block_offset[0];
2369     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2370     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2371     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2372     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2373
         // top-left sample of this macroblock in the luma and the two chroma planes
2374     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2375     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2376     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2377
2378     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2379     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2380
2381     if (!simple && MB_FIELD) {
             // field macroblock: doubled line sizes and the second half of block_offset[]
2382         linesize   = h->mb_linesize   = s->linesize * 2;
2383         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2384         block_offset = &h->block_offset[24];
2385         if(mb_y&1){ //FIXME move out of this function?
2386             dest_y -= s->linesize*15;
2387             dest_cb-= s->uvlinesize*7;
2388             dest_cr-= s->uvlinesize*7;
2389         }
2390         if(FRAME_MBAFF) {
2391             int list;
                 // rewrite the cached reference indices as (16+ref)^(mb_y&1) for every used list
2392             for(list=0; list<h->list_count; list++){
2393                 if(!USES_LIST(mb_type, list))
2394                     continue;
2395                 if(IS_16X16(mb_type)){
2396                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2397                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2398                 }else{
2399                     for(i=0; i<16; i+=4){
2400                         int ref = h->ref_cache[list][scan8[i]];
2401                         if(ref >= 0)
2402                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2403                     }
2404                 }
2405             }
2406         }
2407     } else {
2408         linesize   = h->mb_linesize   = s->linesize;
2409         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2410 //        dct_offset = s->linesize * 16;
2411     }
2412
         // choose the functions that add the luma residual to the prediction
2413     if(transform_bypass){
2414         idct_dc_add =
2415         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2416     }else if(IS_8x8DCT(mb_type)){
2417         idct_dc_add = s->dsp.h264_idct8_dc_add;
2418         idct_add = s->dsp.h264_idct8_add;
2419     }else{
2420         idct_dc_add = s->dsp.h264_idct_dc_add;
2421         idct_add = s->dsp.h264_idct_add;
2422     }
2423
2424     if (!simple && IS_INTRA_PCM(mb_type)) {
             // PCM: the samples are copied from h->mb straight into the picture
2425         for (i=0; i<16; i++) {
2426             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2427         }
2428         for (i=0; i<8; i++) {
2429             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2430             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2431         }
2432     } else {
2433         if(IS_INTRA(mb_type)){
                 // swap the saved border samples into the picture for the duration of intra prediction
2434             if(h->deblocking_filter)
2435                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2436
2437             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2438                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2439                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2440             }
2441
2442             if(IS_INTRA4x4(mb_type)){
2443                 if(simple || !s->encoding){
2444                     if(IS_8x8DCT(mb_type)){
                             // four 8x8 blocks: predict, then add the residual
2445                         for(i=0; i<16; i+=4){
2446                             uint8_t * const ptr= dest_y + block_offset[i];
2447                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2448                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2449                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2450                             }else{
2451                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2452                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2453                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2454                                 if(nnz){
2455                                     if(nnz == 1 && h->mb[i*16])
2456                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2457                                     else
2458                                         idct_add   (ptr, h->mb + i*16, linesize);
2459                                 }
2460                             }
2461                         }
2462                     }else
                         // sixteen 4x4 blocks: predict, then add the residual
2463                     for(i=0; i<16; i++){
2464                         uint8_t * const ptr= dest_y + block_offset[i];
2465                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2466
2467                         if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2468                             h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2469                         }else{
2470                             uint8_t *topright;
2471                             int nnz, tr;
2472                             if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2473                                 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2474                                 assert(mb_y || linesize <= block_offset[i]);
2475                                 if(!topright_avail){
                                         // no top-right block: replicate the last sample of the row above
2476                                     tr= ptr[3 - linesize]*0x01010101;
2477                                     topright= (uint8_t*) &tr;
2478                                 }else
2479                                     topright= ptr + 4 - linesize;
2480                             }else
2481                                 topright= NULL;
2482
2483                             h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2484                             nnz = h->non_zero_count_cache[ scan8[i] ];
2485                             if(nnz){
2486                                 if(is_h264){
2487                                     if(nnz == 1 && h->mb[i*16])
2488                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2489                                     else
2490                                         idct_add   (ptr, h->mb + i*16, linesize);
2491                                 }else
2492                                     svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2493                             }
2494                         }
2495                     }
2496                 }
2497             }else{
                     // intra 16x16: predict the whole luma block, then transform the luma DC coefficients
2498                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2499                 if(is_h264){
2500                     if(!transform_bypass)
2501                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2502                 }else
2503                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2504             }
                 // second call with xchg=0, matching the call made before prediction
2505             if(h->deblocking_filter)
2506                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2507         }else if(is_h264){
2508             hl_motion(h, dest_y, dest_cb, dest_cr,
2509                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2510                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2511                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2512         }
2513
2514
             // luma residual for everything except intra 4x4, which was handled block by block above
2515         if(!IS_INTRA4x4(mb_type)){
2516             if(is_h264){
2517                 if(IS_INTRA16x16(mb_type)){
2518                     if(transform_bypass){
2519                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2520                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2521                         }else{
2522                             for(i=0; i<16; i++){
2523                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2524                                     idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2525                             }
2526                         }
2527                     }else{
2528                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2529                     }
2530                 }else if(h->cbp&15){
2531                     if(transform_bypass){
2532                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2533                         for(i=0; i<16; i+=di){
2534                             int nnz = h->non_zero_count_cache[ scan8[i] ];
2535                             if(nnz){
2536                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2537                             }
2538                         }
2539                     }else{
2540                         if(IS_8x8DCT(mb_type)){
2541                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2542                         }else{
2543                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2544                         }
2545                     }
2546                 }
2547             }else{
2548                 for(i=0; i<16; i++){
2549                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2550                         uint8_t * const ptr= dest_y + block_offset[i];
2551                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2552                     }
2553                 }
2554             }
2555         }
2556
             // chroma residual, only if the cbp bits 0x30 signal chroma coefficients
2557         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2558             uint8_t *dest[2] = {dest_cb, dest_cr};
2559             if(transform_bypass){
2560                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2561             }else{
2562                 idct_add = s->dsp.h264_idct_add;
2563                 idct_dc_add = s->dsp.h264_idct_dc_add;
2564                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2565                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2566             }
2567             if(is_h264){
2568                 if(transform_bypass && IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2569                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2570                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2571                 }else{
                         // blocks 16..19 go to Cb, 20..23 to Cr: (i&4)>>2 selects the plane
2572                     for(i=16; i<16+8; i++){
2573                         if(h->non_zero_count_cache[ scan8[i] ])
2574                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2575                         else if(h->mb[i*16])
2576                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2577                     }
2578                 }
2579             }else{
2580                 for(i=16; i<16+8; i++){
2581                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2582                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2583                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2584                     }
2585                 }
2586             }
2587         }
2588     }
2589     if(h->deblocking_filter) {
             // save the borders first, so that the saved samples are the ones before loop filtering
2590         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2591         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2592         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2593         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2594         if (!simple && FRAME_MBAFF) {
2595             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2596         } else {
2597             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2598         }
2599     }
2600 }
2601
2602 /**
2603  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2604  */
2605 static void hl_decode_mb_simple(H264Context *h){
2606     hl_decode_mb_internal(h, 1);
2607 }
2608
2609 /**
2610  * Process a macroblock; this handles edge cases, such as interlacing.
2611  */
2612 static void av_noinline hl_decode_mb_complex(H264Context *h){
2613     hl_decode_mb_internal(h, 0);
2614 }
2615
2616 static void hl_decode_mb(H264Context *h){
2617     MpegEncContext * const s = &h->s;
2618     const int mb_xy= h->mb_xy;
2619     const int mb_type= s->current_picture.mb_type[mb_xy];
2620     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2621
2622     if(ENABLE_H264_ENCODER && !s->decode)
2623         return;
2624
2625     if (is_complex)
2626         hl_decode_mb_complex(h);
2627     else hl_decode_mb_simple(h);
2628 }
2629
2630 static void pic_as_field(Picture *pic, const int parity){
2631     int i;
2632     for (i = 0; i < 4; ++i) {
2633         if (parity == PICT_BOTTOM_FIELD)
2634             pic->data[i] += pic->linesize[i];
2635         pic->reference = parity;
2636         pic->linesize[i] *= 2;
2637     }
2638     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2639 }
2640
2641 static int split_field_copy(Picture *dest, Picture *src,
2642                             int parity, int id_add){
2643     int match = !!(src->reference & parity);
2644
2645     if (match) {
2646         *dest = *src;
2647         if(parity != PICT_FRAME){
2648             pic_as_field(dest, parity);
2649             dest->pic_id *= 2;
2650             dest->pic_id += id_add;
2651         }
2652     }
2653
2654     return match;
2655 }
2656
2657 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2658     int i[2]={0};
2659     int index=0;
2660
2661     while(i[0]<len || i[1]<len){
2662         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2663             i[0]++;
2664         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2665             i[1]++;
2666         if(i[0] < len){
2667             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2668             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2669         }
2670         if(i[1] < len){
2671             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2672             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2673         }
2674     }
2675
2676     return index;
2677 }
2678
2679 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2680     int i, best_poc;
2681     int out_i= 0;
2682
2683     for(;;){
2684         best_poc= dir ? INT_MIN : INT_MAX;
2685
2686         for(i=0; i<len; i++){
2687             const int poc= src[i]->poc;
2688             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2689                 best_poc= poc;
2690                 sorted[out_i]= src[i];
2691             }
2692         }
2693         if(best_poc == (dir ? INT_MIN : INT_MAX))
2694             break;
2695         limit= sorted[out_i++]->poc - dir;
2696     }
2697     return out_i;
2698 }
2699
2700 /**
2701  * fills the default_ref_list.
2702  */
2703 static int fill_default_ref_list(H264Context *h){
2704     MpegEncContext * const s = &h->s;
2705     int i, len;
2706
2707     if(h->slice_type_nos==FF_B_TYPE){
2708         Picture *sorted[32];
2709         int cur_poc, list;
2710         int lens[2];
2711
2712         if(FIELD_PICTURE)
2713             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2714         else
2715             cur_poc= s->current_picture_ptr->poc;
2716
2717         for(list= 0; list<2; list++){
2718             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2719             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2720             assert(len<=32);
2721             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2722             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2723             assert(len<=32);
2724
2725             if(len < h->ref_count[list])
2726                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2727             lens[list]= len;
2728         }
2729
2730         if(lens[0] == lens[1] && lens[1] > 1){
2731             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2732             if(i == lens[0])
2733                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2734         }
2735     }else{
2736         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2737         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2738         assert(len <= 32);
2739         if(len < h->ref_count[0])
2740             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2741     }
2742 #ifdef TRACE
2743     for (i=0; i<h->ref_count[0]; i++) {
2744         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2745     }
2746     if(h->slice_type_nos==FF_B_TYPE){
2747         for (i=0; i<h->ref_count[1]; i++) {
2748             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2749         }
2750     }
2751 #endif
2752     return 0;
2753 }
2754
/* Forward declarations: both debug dump helpers are defined further down in
 * this file but are already called by decode_ref_pic_list_reordering(). */
2755 static void print_short_term(H264Context *h);
2756 static void print_long_term(H264Context *h);
2757
2758 /**
2759  * Extract structure information about the picture described by pic_num in
2760  * the current decoding context (frame or field). Note that pic_num is
2761  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2762  * @param pic_num picture number for which to extract structure information
2763  * @param structure one of PICT_XXX describing structure of picture
2764  *                      with pic_num
2765  * @return frame number (short term) or long term index of picture
2766  *         described by pic_num
2767  */
2768 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2769     MpegEncContext * const s = &h->s;
2770
2771     *structure = s->picture_structure;
2772     if(FIELD_PICTURE){
2773         if (!(pic_num & 1))
2774             /* opposite field */
2775             *structure ^= PICT_FRAME;
2776         pic_num >>= 1;
2777     }
2778
2779     return pic_num;
2780 }
2781
2782 static int decode_ref_pic_list_reordering(H264Context *h){
2783     MpegEncContext * const s = &h->s;
2784     int list, index, pic_structure;
2785
2786     print_short_term(h);
2787     print_long_term(h);
2788
2789     for(list=0; list<h->list_count; list++){
2790         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2791
2792         if(get_bits1(&s->gb)){
2793             int pred= h->curr_pic_num;
2794
2795             for(index=0; ; index++){
2796                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2797                 unsigned int pic_id;
2798                 int i;
2799                 Picture *ref = NULL;
2800
2801                 if(reordering_of_pic_nums_idc==3)
2802                     break;
2803
2804                 if(index >= h->ref_count[list]){
2805                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2806                     return -1;
2807                 }
2808
2809                 if(reordering_of_pic_nums_idc<3){
2810                     if(reordering_of_pic_nums_idc<2){
2811                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2812                         int frame_num;
2813
2814                         if(abs_diff_pic_num > h->max_pic_num){
2815                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2816                             return -1;
2817                         }
2818
2819                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2820                         else                                pred+= abs_diff_pic_num;
2821                         pred &= h->max_pic_num - 1;
2822
2823                         frame_num = pic_num_extract(h, pred, &pic_structure);
2824
2825                         for(i= h->short_ref_count-1; i>=0; i--){
2826                             ref = h->short_ref[i];
2827                             assert(ref->reference);
2828                             assert(!ref->long_ref);
2829                             if(
2830                                    ref->frame_num == frame_num &&
2831                                    (ref->reference & pic_structure)
2832                               )
2833                                 break;
2834                         }
2835                         if(i>=0)
2836                             ref->pic_id= pred;
2837                     }else{
2838                         int long_idx;
2839                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2840
2841                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2842
2843                         if(long_idx>31){
2844                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2845                             return -1;
2846                         }
2847                         ref = h->long_ref[long_idx];
2848                         assert(!(ref && !ref->reference));
2849                         if(ref && (ref->reference & pic_structure)){
2850                             ref->pic_id= pic_id;
2851                             assert(ref->long_ref);
2852                             i=0;
2853                         }else{
2854                             i=-1;
2855                         }
2856                     }
2857
2858                     if (i < 0) {
2859                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2860                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2861                     } else {
2862                         for(i=index; i+1<h->ref_count[list]; i++){
2863                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2864                                 break;
2865                         }
2866                         for(; i > index; i--){
2867                             h->ref_list[list][i]= h->ref_list[list][i-1];
2868                         }
2869                         h->ref_list[list][index]= *ref;
2870                         if (FIELD_PICTURE){
2871                             pic_as_field(&h->ref_list[list][index], pic_structure);
2872                         }
2873                     }
2874                 }else{
2875                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2876                     return -1;
2877                 }
2878             }
2879         }
2880     }
2881     for(list=0; list<h->list_count; list++){
2882         for(index= 0; index < h->ref_count[list]; index++){
2883             if(!h->ref_list[list][index].data[0]){
2884                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2885                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2886             }
2887         }
2888     }
2889
2890     return 0;
2891 }
2892
2893 static void fill_mbaff_ref_list(H264Context *h){
2894     int list, i, j;
2895     for(list=0; list<2; list++){ //FIXME try list_count
2896         for(i=0; i<h->ref_count[list]; i++){
2897             Picture *frame = &h->ref_list[list][i];
2898             Picture *field = &h->ref_list[list][16+2*i];
2899             field[0] = *frame;
2900             for(j=0; j<3; j++)
2901                 field[0].linesize[j] <<= 1;
2902             field[0].reference = PICT_TOP_FIELD;
2903             field[0].poc= field[0].field_poc[0];
2904             field[1] = field[0];
2905             for(j=0; j<3; j++)
2906                 field[1].data[j] += frame->linesize[j];
2907             field[1].reference = PICT_BOTTOM_FIELD;
2908             field[1].poc= field[1].field_poc[1];
2909
2910             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2911             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2912             for(j=0; j<2; j++){
2913                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2914                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2915             }
2916         }
2917     }
2918     for(j=0; j<h->ref_count[1]; j++){
2919         for(i=0; i<h->ref_count[0]; i++)
2920             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2921         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2922         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2923     }
2924 }
2925
2926 static int pred_weight_table(H264Context *h){
2927     MpegEncContext * const s = &h->s;
2928     int list, i;
2929     int luma_def, chroma_def;
2930
2931     h->use_weight= 0;
2932     h->use_weight_chroma= 0;
2933     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2934     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2935     luma_def = 1<<h->luma_log2_weight_denom;
2936     chroma_def = 1<<h->chroma_log2_weight_denom;
2937
2938     for(list=0; list<2; list++){
2939         for(i=0; i<h->ref_count[list]; i++){
2940             int luma_weight_flag, chroma_weight_flag;
2941
2942             luma_weight_flag= get_bits1(&s->gb);
2943             if(luma_weight_flag){
2944                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2945                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2946                 if(   h->luma_weight[list][i] != luma_def
2947                    || h->luma_offset[list][i] != 0)
2948                     h->use_weight= 1;
2949             }else{
2950                 h->luma_weight[list][i]= luma_def;
2951                 h->luma_offset[list][i]= 0;
2952             }
2953
2954             if(CHROMA){
2955                 chroma_weight_flag= get_bits1(&s->gb);
2956                 if(chroma_weight_flag){
2957                     int j;
2958                     for(j=0; j<2; j++){
2959                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2960                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2961                         if(   h->chroma_weight[list][i][j] != chroma_def
2962                         || h->chroma_offset[list][i][j] != 0)
2963                             h->use_weight_chroma= 1;
2964                     }
2965                 }else{
2966                     int j;
2967                     for(j=0; j<2; j++){
2968                         h->chroma_weight[list][i][j]= chroma_def;
2969                         h->chroma_offset[list][i][j]= 0;
2970                     }
2971                 }
2972             }
2973         }
2974         if(h->slice_type_nos != FF_B_TYPE) break;
2975     }
2976     h->use_weight= h->use_weight || h->use_weight_chroma;
2977     return 0;
2978 }
2979
2980 static void implicit_weight_table(H264Context *h){
2981     MpegEncContext * const s = &h->s;
2982     int ref0, ref1;
2983     int cur_poc = s->current_picture_ptr->poc;
2984
2985     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2986        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2987         h->use_weight= 0;
2988         h->use_weight_chroma= 0;
2989         return;
2990     }
2991
2992     h->use_weight= 2;
2993     h->use_weight_chroma= 2;
2994     h->luma_log2_weight_denom= 5;
2995     h->chroma_log2_weight_denom= 5;
2996
2997     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
2998         int poc0 = h->ref_list[0][ref0].poc;
2999         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3000             int poc1 = h->ref_list[1][ref1].poc;
3001             int td = av_clip(poc1 - poc0, -128, 127);
3002             if(td){
3003                 int tb = av_clip(cur_poc - poc0, -128, 127);
3004                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3005                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3006                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3007                     h->implicit_weight[ref0][ref1] = 32;
3008                 else
3009                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3010             }else
3011                 h->implicit_weight[ref0][ref1] = 32;
3012         }
3013     }
3014 }
3015
3016 /**
3017  * Mark a picture as no longer needed for reference. The refmask
3018  * argument allows unreferencing of individual fields or the whole frame.
3019  * If the picture becomes entirely unreferenced, but is being held for
3020  * display purposes, it is marked as such.
3021  * @param refmask mask of fields to unreference; the mask is bitwise
3022  *                anded with the reference marking of pic
3023  * @return non-zero if pic becomes entirely unreferenced (except possibly
3024  *         for display purposes) zero if one of the fields remains in
3025  *         reference
3026  */
3027 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3028     int i;
3029     if (pic->reference &= refmask) {
3030         return 0;
3031     } else {
3032         for(i = 0; h->delayed_pic[i]; i++)
3033             if(pic == h->delayed_pic[i]){
3034                 pic->reference=DELAYED_PIC_REF;
3035                 break;
3036             }
3037         return 1;
3038     }
3039 }
3040
3041 /**
3042  * instantaneous decoder refresh.
3043  */
3044 static void idr(H264Context *h){
3045     int i;
3046
3047     for(i=0; i<16; i++){
3048         remove_long(h, i, 0);
3049     }
3050     assert(h->long_ref_count==0);
3051
3052     for(i=0; i<h->short_ref_count; i++){
3053         unreference_pic(h, h->short_ref[i], 0);
3054         h->short_ref[i]= NULL;
3055     }
3056     h->short_ref_count=0;
3057     h->prev_frame_num= 0;
3058     h->prev_frame_num_offset= 0;
3059     h->prev_poc_msb=
3060     h->prev_poc_lsb= 0;
3061 }
3062
3063 /* forget old pics after a seek */
3064 static void flush_dpb(AVCodecContext *avctx){
3065     H264Context *h= avctx->priv_data;
3066     int i;
3067     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3068         if(h->delayed_pic[i])
3069             h->delayed_pic[i]->reference= 0;
3070         h->delayed_pic[i]= NULL;
3071     }
3072     h->outputed_poc= INT_MIN;
3073     idr(h);
3074     if(h->s.current_picture_ptr)
3075         h->s.current_picture_ptr->reference= 0;
3076     h->s.first_field= 0;
3077     ff_mpeg_flush(avctx);
3078 }
3079
3080 /**
3081  * Find a Picture in the short term reference list by frame number.
3082  * @param frame_num frame number to search for
3083  * @param idx the index into h->short_ref where returned picture is found
3084  *            undefined if no picture found.
3085  * @return pointer to the found picture, or NULL if no pic with the provided
3086  *                 frame number is found
3087  */
3088 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3089     MpegEncContext * const s = &h->s;
3090     int i;
3091
3092     for(i=0; i<h->short_ref_count; i++){
3093         Picture *pic= h->short_ref[i];
3094         if(s->avctx->debug&FF_DEBUG_MMCO)
3095             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3096         if(pic->frame_num == frame_num) {
3097             *idx = i;
3098             return pic;
3099         }
3100     }
3101     return NULL;
3102 }
3103
3104 /**
3105  * Remove a picture from the short term reference list by its index in
3106  * that list.  This does no checking on the provided index; it is assumed
3107  * to be valid. Other list entries are shifted down.
3108  * @param i index into h->short_ref of picture to remove.
3109  */
3110 static void remove_short_at_index(H264Context *h, int i){
3111     assert(i >= 0 && i < h->short_ref_count);
3112     h->short_ref[i]= NULL;
3113     if (--h->short_ref_count)
3114         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3115 }
3116
3117 /**
3118  *
3119  * @return the removed picture or NULL if an error occurs
3120  */
3121 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3122     MpegEncContext * const s = &h->s;
3123     Picture *pic;
3124     int i;
3125
3126     if(s->avctx->debug&FF_DEBUG_MMCO)
3127         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3128
3129     pic = find_short(h, frame_num, &i);
3130     if (pic){
3131         if(unreference_pic(h, pic, ref_mask))
3132         remove_short_at_index(h, i);
3133     }
3134
3135     return pic;
3136 }
3137
3138 /**
3139  * Remove a picture from the long term reference list by its index in
3140  * that list.
3141  * @return the removed picture or NULL if an error occurs
3142  */
3143 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3144     Picture *pic;
3145
3146     pic= h->long_ref[i];
3147     if (pic){
3148         if(unreference_pic(h, pic, ref_mask)){
3149             assert(h->long_ref[i]->long_ref == 1);
3150             h->long_ref[i]->long_ref= 0;
3151             h->long_ref[i]= NULL;
3152             h->long_ref_count--;
3153         }
3154     }
3155
3156     return pic;
3157 }
3158
3159 /**
3160  * print short term list
3161  */
3162 static void print_short_term(H264Context *h) {
3163     uint32_t i;
3164     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3165         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3166         for(i=0; i<h->short_ref_count; i++){
3167             Picture *pic= h->short_ref[i];
3168             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3169         }
3170     }
3171 }
3172
3173 /**
3174  * print long term list
3175  */
3176 static void print_long_term(H264Context *h) {
3177     uint32_t i;
3178     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3179         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3180         for(i = 0; i < 16; i++){
3181             Picture *pic= h->long_ref[i];
3182             if (pic) {
3183                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3184             }
3185         }
3186     }
3187 }
3188
3189 /**
3190  * Executes the reference picture marking (memory management control operations).
3191  */
3192 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3193     MpegEncContext * const s = &h->s;
3194     int i, j;
3195     int current_ref_assigned=0;
3196     Picture *pic;
3197
3198     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3199         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3200
3201     for(i=0; i<mmco_count; i++){
3202         int structure, frame_num;
3203         if(s->avctx->debug&FF_DEBUG_MMCO)
3204             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3205
3206         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3207            || mmco[i].opcode == MMCO_SHORT2LONG){
3208             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3209             pic = find_short(h, frame_num, &j);
3210             if(!pic){
3211                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3212                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3213                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3214                 continue;
3215             }
3216         }
3217
3218         switch(mmco[i].opcode){
3219         case MMCO_SHORT2UNUSED:
3220             if(s->avctx->debug&FF_DEBUG_MMCO)
3221                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3222             remove_short(h, frame_num, structure ^ PICT_FRAME);
3223             break;
3224         case MMCO_SHORT2LONG:
3225                 if (h->long_ref[mmco[i].long_arg] != pic)
3226                     remove_long(h, mmco[i].long_arg, 0);
3227
3228                 remove_short_at_index(h, j);
3229                 h->long_ref[ mmco[i].long_arg ]= pic;
3230                 if (h->long_ref[ mmco[i].long_arg ]){
3231                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3232                     h->long_ref_count++;
3233                 }
3234             break;
3235         case MMCO_LONG2UNUSED:
3236             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3237             pic = h->long_ref[j];
3238             if (pic) {
3239                 remove_long(h, j, structure ^ PICT_FRAME);
3240             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3241                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3242             break;
3243         case MMCO_LONG:
3244                     // Comment below left from previous code as it is an interresting note.
3245                     /* First field in pair is in short term list or
3246                      * at a different long term index.
3247                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3248                      * Report the problem and keep the pair where it is,
3249                      * and mark this field valid.
3250                      */
3251
3252             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3253                 remove_long(h, mmco[i].long_arg, 0);
3254
3255                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3256                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3257                 h->long_ref_count++;
3258             }
3259
3260             s->current_picture_ptr->reference |= s->picture_structure;
3261             current_ref_assigned=1;
3262             break;
3263         case MMCO_SET_MAX_LONG:
3264             assert(mmco[i].long_arg <= 16);
3265             // just remove the long term which index is greater than new max
3266             for(j = mmco[i].long_arg; j<16; j++){
3267                 remove_long(h, j, 0);
3268             }
3269             break;
3270         case MMCO_RESET:
3271             while(h->short_ref_count){
3272                 remove_short(h, h->short_ref[0]->frame_num, 0);
3273             }
3274             for(j = 0; j < 16; j++) {
3275                 remove_long(h, j, 0);
3276             }
3277             s->current_picture_ptr->poc=
3278             s->current_picture_ptr->field_poc[0]=
3279             s->current_picture_ptr->field_poc[1]=
3280             h->poc_lsb=
3281             h->poc_msb=
3282             h->frame_num=
3283             s->current_picture_ptr->frame_num= 0;
3284             break;
3285         default: assert(0);
3286         }
3287     }
3288
3289     if (!current_ref_assigned) {
3290         /* Second field of complementary field pair; the first field of
3291          * which is already referenced. If short referenced, it
3292          * should be first entry in short_ref. If not, it must exist
3293          * in long_ref; trying to put it on the short list here is an
3294          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3295          */
3296         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3297             /* Just mark the second field valid */
3298             s->current_picture_ptr->reference = PICT_FRAME;
3299         } else if (s->current_picture_ptr->long_ref) {
3300             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3301                                              "assignment for second field "
3302                                              "in complementary field pair "
3303                                              "(first field is long term)\n");
3304         } else {
3305             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3306             if(pic){
3307                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3308             }
3309
3310             if(h->short_ref_count)
3311                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3312
3313             h->short_ref[0]= s->current_picture_ptr;
3314             h->short_ref_count++;
3315             s->current_picture_ptr->reference |= s->picture_structure;
3316         }
3317     }
3318
3319     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3320
3321         /* We have too many reference frames, probably due to corrupted
3322          * stream. Need to discard one frame. Prevents overrun of the
3323          * short_ref and long_ref buffers.
3324          */
3325         av_log(h->s.avctx, AV_LOG_ERROR,
3326                "number of reference frames exceeds max (probably "
3327                "corrupt input), discarding one\n");
3328
3329         if (h->long_ref_count && !h->short_ref_count) {
3330             for (i = 0; i < 16; ++i)
3331                 if (h->long_ref[i])
3332                     break;
3333
3334             assert(i < 16);
3335             remove_long(h, i, 0);
3336         } else {
3337             pic = h->short_ref[h->short_ref_count - 1];
3338             remove_short(h, pic->frame_num, 0);
3339         }
3340     }
3341
3342     print_short_term(h);
3343     print_long_term(h);
3344     return 0;
3345 }
3346
3347 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3348     MpegEncContext * const s = &h->s;
3349     int i;
3350
3351     h->mmco_index= 0;
3352     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3353         s->broken_link= get_bits1(gb) -1;
3354         if(get_bits1(gb)){
3355             h->mmco[0].opcode= MMCO_LONG;
3356             h->mmco[0].long_arg= 0;
3357             h->mmco_index= 1;
3358         }
3359     }else{
3360         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3361             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3362                 MMCOOpcode opcode= get_ue_golomb(gb);
3363
3364                 h->mmco[i].opcode= opcode;
3365                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3366                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3367 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3368                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3369                         return -1;
3370                     }*/
3371                 }
3372                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3373                     unsigned int long_arg= get_ue_golomb(gb);
3374                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3375                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3376                         return -1;
3377                     }
3378                     h->mmco[i].long_arg= long_arg;
3379                 }
3380
3381                 if(opcode > (unsigned)MMCO_LONG){
3382                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3383                     return -1;
3384                 }
3385                 if(opcode == MMCO_END)
3386                     break;
3387             }
3388             h->mmco_index= i;
3389         }else{
3390             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3391
3392             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3393                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3394                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3395                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3396                 h->mmco_index= 1;
3397                 if (FIELD_PICTURE) {
3398                     h->mmco[0].short_pic_num *= 2;
3399                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3400                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3401                     h->mmco_index= 2;
3402                 }
3403             }
3404         }
3405     }
3406
3407     return 0;
3408 }
3409
3410 static int init_poc(H264Context *h){
3411     MpegEncContext * const s = &h->s;
3412     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3413     int field_poc[2];
3414     Picture *cur = s->current_picture_ptr;
3415
3416     h->frame_num_offset= h->prev_frame_num_offset;
3417     if(h->frame_num < h->prev_frame_num)
3418         h->frame_num_offset += max_frame_num;
3419
3420     if(h->sps.poc_type==0){
3421         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3422
3423         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3424             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3425         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3426             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3427         else
3428             h->poc_msb = h->prev_poc_msb;
3429 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3430         field_poc[0] =
3431         field_poc[1] = h->poc_msb + h->poc_lsb;
3432         if(s->picture_structure == PICT_FRAME)
3433             field_poc[1] += h->delta_poc_bottom;
3434     }else if(h->sps.poc_type==1){
3435         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3436         int i;
3437
3438         if(h->sps.poc_cycle_length != 0)
3439             abs_frame_num = h->frame_num_offset + h->frame_num;
3440         else
3441             abs_frame_num = 0;
3442
3443         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3444             abs_frame_num--;
3445
3446         expected_delta_per_poc_cycle = 0;
3447         for(i=0; i < h->sps.poc_cycle_length; i++)
3448             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3449
3450         if(abs_frame_num > 0){
3451             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3452             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3453
3454             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3455             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3456                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3457         } else
3458             expectedpoc = 0;
3459
3460         if(h->nal_ref_idc == 0)
3461             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3462
3463         field_poc[0] = expectedpoc + h->delta_poc[0];
3464         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3465
3466         if(s->picture_structure == PICT_FRAME)
3467             field_poc[1] += h->delta_poc[1];
3468     }else{
3469         int poc= 2*(h->frame_num_offset + h->frame_num);
3470
3471         if(!h->nal_ref_idc)
3472             poc--;
3473
3474         field_poc[0]= poc;
3475         field_poc[1]= poc;
3476     }
3477
3478     if(s->picture_structure != PICT_BOTTOM_FIELD)
3479         s->current_picture_ptr->field_poc[0]= field_poc[0];
3480     if(s->picture_structure != PICT_TOP_FIELD)
3481         s->current_picture_ptr->field_poc[1]= field_poc[1];
3482     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3483
3484     return 0;
3485 }
3486
3487
3488 /**
3489  * initialize scan tables
3490  */
3491 static void init_scan_tables(H264Context *h){
3492     MpegEncContext * const s = &h->s;
3493     int i;
3494     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3495         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3496         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3497     }else{
3498         for(i=0; i<16; i++){
3499 #define T(x) (x>>2) | ((x<<2) & 0xF)
3500             h->zigzag_scan[i] = T(zigzag_scan[i]);
3501             h-> field_scan[i] = T( field_scan[i]);
3502 #undef T
3503         }
3504     }
3505     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3506         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3507         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3508         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3509         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3510     }else{
3511         for(i=0; i<64; i++){
3512 #define T(x) (x>>3) | ((x&7)<<3)
3513             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3514             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3515             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3516             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3517 #undef T
3518         }
3519     }
3520     if(h->sps.transform_bypass){ //FIXME same ugly
3521         h->zigzag_scan_q0          = zigzag_scan;
3522         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3523         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3524         h->field_scan_q0           = field_scan;
3525         h->field_scan8x8_q0        = field_scan8x8;
3526         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3527     }else{
3528         h->zigzag_scan_q0          = h->zigzag_scan;
3529         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3530         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3531         h->field_scan_q0           = h->field_scan;
3532         h->field_scan8x8_q0        = h->field_scan8x8;
3533         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3534     }
3535 }
3536
3537 /**
3538  * Replicates H264 "master" context to thread contexts.
3539  */
3540 static void clone_slice(H264Context *dst, H264Context *src)
3541 {
3542     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3543     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3544     dst->s.current_picture      = src->s.current_picture;
3545     dst->s.linesize             = src->s.linesize;
3546     dst->s.uvlinesize           = src->s.uvlinesize;
3547     dst->s.first_field          = src->s.first_field;
3548
3549     dst->prev_poc_msb           = src->prev_poc_msb;
3550     dst->prev_poc_lsb           = src->prev_poc_lsb;
3551     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3552     dst->prev_frame_num         = src->prev_frame_num;
3553     dst->short_ref_count        = src->short_ref_count;
3554
3555     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3556     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3557     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3558     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3559
3560     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3561     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3562 }
3563
3564 /**
3565  * decodes a slice header.
3566  * This will also call MPV_common_init() and frame_start() as needed.
3567  *
3568  * @param h h264context
3569  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3570  *
3571  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3572  */
3573 static int decode_slice_header(H264Context *h, H264Context *h0){
3574     MpegEncContext * const s = &h->s;
3575     MpegEncContext * const s0 = &h0->s;
3576     unsigned int first_mb_in_slice;
3577     unsigned int pps_id;
3578     int num_ref_idx_active_override_flag;
3579     unsigned int slice_type, tmp, i, j;
3580     int default_ref_list_done = 0;
3581     int last_pic_structure;
3582
3583     s->dropable= h->nal_ref_idc == 0;
3584
3585     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3586         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3587         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3588     }else{
3589         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3590         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3591     }
3592
3593     first_mb_in_slice= get_ue_golomb(&s->gb);
3594
3595     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3596         h0->current_slice = 0;
3597         if (!s0->first_field)
3598             s->current_picture_ptr= NULL;
3599     }
3600
3601     slice_type= get_ue_golomb(&s->gb);
3602     if(slice_type > 9){
3603         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3604         return -1;
3605     }
3606     if(slice_type > 4){
3607         slice_type -= 5;
3608         h->slice_type_fixed=1;
3609     }else
3610         h->slice_type_fixed=0;
3611
3612     slice_type= golomb_to_pict_type[ slice_type ];
3613     if (slice_type == FF_I_TYPE
3614         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3615         default_ref_list_done = 1;
3616     }
3617     h->slice_type= slice_type;
3618     h->slice_type_nos= slice_type & 3;
3619
3620     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3621     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3622         av_log(h->s.avctx, AV_LOG_ERROR,
3623                "B picture before any references, skipping\n");
3624         return -1;
3625     }
3626
3627     pps_id= get_ue_golomb(&s->gb);
3628     if(pps_id>=MAX_PPS_COUNT){
3629         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3630         return -1;
3631     }
3632     if(!h0->pps_buffers[pps_id]) {
3633         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3634         return -1;
3635     }
3636     h->pps= *h0->pps_buffers[pps_id];
3637
3638     if(!h0->sps_buffers[h->pps.sps_id]) {
3639         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3640         return -1;
3641     }
3642     h->sps = *h0->sps_buffers[h->pps.sps_id];
3643
3644     if(h == h0 && h->dequant_coeff_pps != pps_id){
3645         h->dequant_coeff_pps = pps_id;
3646         init_dequant_tables(h);
3647     }
3648
3649     s->mb_width= h->sps.mb_width;
3650     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3651
3652     h->b_stride=  s->mb_width*4;
3653     h->b8_stride= s->mb_width*2;
3654
3655     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3656     if(h->sps.frame_mbs_only_flag)
3657         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3658     else
3659         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3660
3661     if (s->context_initialized
3662         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3663         if(h != h0)
3664             return -1;   // width / height changed during parallelized decoding
3665         free_tables(h);
3666         flush_dpb(s->avctx);
3667         MPV_common_end(s);
3668     }
3669     if (!s->context_initialized) {
3670         if(h != h0)
3671             return -1;  // we cant (re-)initialize context during parallel decoding
3672         if (MPV_common_init(s) < 0)
3673             return -1;
3674         s->first_field = 0;
3675
3676         init_scan_tables(h);
3677         alloc_tables(h);
3678
3679         for(i = 1; i < s->avctx->thread_count; i++) {
3680             H264Context *c;
3681             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3682             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3683             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3684             c->sps = h->sps;
3685             c->pps = h->pps;
3686             init_scan_tables(c);
3687             clone_tables(c, h);
3688         }
3689
3690         for(i = 0; i < s->avctx->thread_count; i++)
3691             if(context_init(h->thread_context[i]) < 0)
3692                 return -1;
3693
3694         s->avctx->width = s->width;
3695         s->avctx->height = s->height;
3696         s->avctx->sample_aspect_ratio= h->sps.sar;
3697         if(!s->avctx->sample_aspect_ratio.den)
3698             s->avctx->sample_aspect_ratio.den = 1;
3699
3700         if(h->sps.timing_info_present_flag){
3701             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3702             if(h->x264_build > 0 && h->x264_build < 44)
3703                 s->avctx->time_base.den *= 2;
3704             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3705                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3706         }
3707     }
3708
3709     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3710
3711     h->mb_mbaff = 0;
3712     h->mb_aff_frame = 0;
3713     last_pic_structure = s0->picture_structure;
3714     if(h->sps.frame_mbs_only_flag){
3715         s->picture_structure= PICT_FRAME;
3716     }else{
3717         if(get_bits1(&s->gb)) { //field_pic_flag
3718             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3719         } else {
3720             s->picture_structure= PICT_FRAME;
3721             h->mb_aff_frame = h->sps.mb_aff;
3722         }
3723     }
3724     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3725
3726     if(h0->current_slice == 0){
3727         while(h->frame_num !=  h->prev_frame_num &&
3728               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3729             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3730             frame_start(h);
3731             h->prev_frame_num++;
3732             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3733             s->current_picture_ptr->frame_num= h->prev_frame_num;
3734             execute_ref_pic_marking(h, NULL, 0);
3735         }
3736
3737         /* See if we have a decoded first field looking for a pair... */
3738         if (s0->first_field) {
3739             assert(s0->current_picture_ptr);
3740             assert(s0->current_picture_ptr->data[0]);
3741             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3742
3743             /* figure out if we have a complementary field pair */
3744             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3745                 /*
3746                  * Previous field is unmatched. Don't display it, but let it
3747                  * remain for reference if marked as such.
3748                  */
3749                 s0->current_picture_ptr = NULL;
3750                 s0->first_field = FIELD_PICTURE;
3751
3752             } else {
3753                 if (h->nal_ref_idc &&
3754                         s0->current_picture_ptr->reference &&
3755                         s0->current_picture_ptr->frame_num != h->frame_num) {
3756                     /*
3757                      * This and previous field were reference, but had
3758                      * different frame_nums. Consider this field first in
3759                      * pair. Throw away previous field except for reference
3760                      * purposes.
3761                      */
3762                     s0->first_field = 1;
3763                     s0->current_picture_ptr = NULL;
3764
3765                 } else {
3766                     /* Second field in complementary pair */
3767                     s0->first_field = 0;
3768                 }
3769             }
3770
3771         } else {
3772             /* Frame or first field in a potentially complementary pair */
3773             assert(!s0->current_picture_ptr);
3774             s0->first_field = FIELD_PICTURE;
3775         }
3776
3777         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3778             s0->first_field = 0;
3779             return -1;
3780         }
3781     }
3782     if(h != h0)
3783         clone_slice(h, h0);
3784
3785     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3786
3787     assert(s->mb_num == s->mb_width * s->mb_height);
3788     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3789        first_mb_in_slice                    >= s->mb_num){
3790         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3791         return -1;
3792     }
3793     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3794     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3795     if (s->picture_structure == PICT_BOTTOM_FIELD)
3796         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3797     assert(s->mb_y < s->mb_height);
3798
3799     if(s->picture_structure==PICT_FRAME){
3800         h->curr_pic_num=   h->frame_num;
3801         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3802     }else{
3803         h->curr_pic_num= 2*h->frame_num + 1;
3804         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3805     }
3806
3807     if(h->nal_unit_type == NAL_IDR_SLICE){
3808         get_ue_golomb(&s->gb); /* idr_pic_id */
3809     }
3810
3811     if(h->sps.poc_type==0){
3812         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3813
3814         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3815             h->delta_poc_bottom= get_se_golomb(&s->gb);
3816         }
3817     }
3818
3819     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3820         h->delta_poc[0]= get_se_golomb(&s->gb);
3821
3822         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3823             h->delta_poc[1]= get_se_golomb(&s->gb);
3824     }
3825
3826     init_poc(h);
3827
3828     if(h->pps.redundant_pic_cnt_present){
3829         h->redundant_pic_count= get_ue_golomb(&s->gb);
3830     }
3831
3832     //set defaults, might be overridden a few lines later
3833     h->ref_count[0]= h->pps.ref_count[0];
3834     h->ref_count[1]= h->pps.ref_count[1];
3835
3836     if(h->slice_type_nos != FF_I_TYPE){
3837         if(h->slice_type_nos == FF_B_TYPE){
3838             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3839         }
3840         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3841
3842         if(num_ref_idx_active_override_flag){
3843             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3844             if(h->slice_type_nos==FF_B_TYPE)
3845                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3846
3847             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3848                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3849                 h->ref_count[0]= h->ref_count[1]= 1;
3850                 return -1;
3851             }
3852         }
3853         if(h->slice_type_nos == FF_B_TYPE)
3854             h->list_count= 2;
3855         else
3856             h->list_count= 1;
3857     }else
3858         h->list_count= 0;
3859
3860     if(!default_ref_list_done){
3861         fill_default_ref_list(h);
3862     }
3863
3864     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3865         return -1;
3866
3867     if(h->slice_type_nos!=FF_I_TYPE){
3868         s->last_picture_ptr= &h->ref_list[0][0];
3869         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3870     }
3871     if(h->slice_type_nos==FF_B_TYPE){
3872         s->next_picture_ptr= &h->ref_list[1][0];
3873         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3874     }
3875
3876     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3877        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3878         pred_weight_table(h);
3879     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3880         implicit_weight_table(h);
3881     else
3882         h->use_weight = 0;
3883
3884     if(h->nal_ref_idc)
3885         decode_ref_pic_marking(h0, &s->gb);
3886
3887     if(FRAME_MBAFF)
3888         fill_mbaff_ref_list(h);
3889
3890     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3891         direct_dist_scale_factor(h);
3892     direct_ref_list_init(h);
3893
3894     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3895         tmp = get_ue_golomb(&s->gb);
3896         if(tmp > 2){
3897             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3898             return -1;
3899         }
3900         h->cabac_init_idc= tmp;
3901     }
3902
3903     h->last_qscale_diff = 0;
3904     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3905     if(tmp>51){
3906         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3907         return -1;
3908     }
3909     s->qscale= tmp;
3910     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3911     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3912     //FIXME qscale / qp ... stuff
3913     if(h->slice_type == FF_SP_TYPE){
3914         get_bits1(&s->gb); /* sp_for_switch_flag */
3915     }
3916     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3917         get_se_golomb(&s->gb); /* slice_qs_delta */
3918     }
3919
3920     h->deblocking_filter = 1;
3921     h->slice_alpha_c0_offset = 0;
3922     h->slice_beta_offset = 0;
3923     if( h->pps.deblocking_filter_parameters_present ) {
3924         tmp= get_ue_golomb(&s->gb);
3925         if(tmp > 2){
3926             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3927             return -1;
3928         }
3929         h->deblocking_filter= tmp;
3930         if(h->deblocking_filter < 2)
3931             h->deblocking_filter^= 1; // 1<->0
3932
3933         if( h->deblocking_filter ) {
3934             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3935             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3936         }
3937     }
3938
3939     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3940        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3941        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3942        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3943         h->deblocking_filter= 0;
3944
3945     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3946         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3947             /* Cheat slightly for speed:
3948                Do not bother to deblock across slices. */
3949             h->deblocking_filter = 2;
3950         } else {
3951             h0->max_contexts = 1;
3952             if(!h0->single_decode_warning) {
3953                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3954                 h0->single_decode_warning = 1;
3955             }
3956             if(h != h0)
3957                 return 1; // deblocking switched inside frame
3958         }
3959     }
3960
3961 #if 0 //FMO
3962     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3963         slice_group_change_cycle= get_bits(&s->gb, ?);
3964 #endif
3965
3966     h0->last_slice_type = slice_type;
3967     h->slice_num = ++h0->current_slice;
3968     if(h->slice_num >= MAX_SLICES){
3969         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3970     }
3971
3972     for(j=0; j<2; j++){
3973         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3974         ref2frm[0]=
3975         ref2frm[1]= -1;
3976         for(i=0; i<16; i++)
3977             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3978                           +(h->ref_list[j][i].reference&3);
3979         ref2frm[18+0]=
3980         ref2frm[18+1]= -1;
3981         for(i=16; i<48; i++)
3982             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3983                           +(h->ref_list[j][i].reference&3);
3984     }
3985
3986     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3987     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3988
3989     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3990         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3991                h->slice_num,
3992                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3993                first_mb_in_slice,
3994                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
3995                pps_id, h->frame_num,
3996                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3997                h->ref_count[0], h->ref_count[1],
3998                s->qscale,
3999                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4000                h->use_weight,
4001                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4002                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4003                );
4004     }
4005
4006     return 0;
4007 }
4008
4009 /**
4010  *
4011  */
4012 static inline int get_level_prefix(GetBitContext *gb){
4013     unsigned int buf;
4014     int log;
4015
4016     OPEN_READER(re, gb);
4017     UPDATE_CACHE(re, gb);
4018     buf=GET_CACHE(re, gb);
4019
4020     log= 32 - av_log2(buf);
4021 #ifdef TRACE
4022     print_bin(buf>>(32-log), log);
4023     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4024 #endif
4025
4026     LAST_SKIP_BITS(re, gb, log);
4027     CLOSE_READER(re, gb);
4028
4029     return log-1;
4030 }
4031
4032 static inline int get_dct8x8_allowed(H264Context *h){
4033     int i;
4034     for(i=0; i<4; i++){
4035         if(!IS_SUB_8X8(h->sub_mb_type[i])
4036            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4037             return 0;
4038     }
4039     return 1;
4040 }
4041
4042 /**
4043  * decodes a residual block.
4044  * @param n block index
4045  * @param scantable scantable
4046  * @param max_coeff number of coefficients in the block
4047  * @return <0 if an error occurred
4048  */
4049 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4050     MpegEncContext * const s = &h->s;
4051     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4052     int level[16];
4053     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4054
4055     //FIXME put trailing_onex into the context
4056
4057     if(n == CHROMA_DC_BLOCK_INDEX){
4058         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4059         total_coeff= coeff_token>>2;
4060     }else{
4061         if(n == LUMA_DC_BLOCK_INDEX){
4062             total_coeff= pred_non_zero_count(h, 0);
4063             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4064             total_coeff= coeff_token>>2;
4065         }else{
4066             total_coeff= pred_non_zero_count(h, n);
4067             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4068             total_coeff= coeff_token>>2;
4069             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4070         }
4071     }
4072
4073     //FIXME set last_non_zero?
4074
4075     if(total_coeff==0)
4076         return 0;
4077     if(total_coeff > (unsigned)max_coeff) {
4078         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4079         return -1;
4080     }
4081
4082     trailing_ones= coeff_token&3;
4083     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4084     assert(total_coeff<=16);
4085
4086     for(i=0; i<trailing_ones; i++){
4087         level[i]= 1 - 2*get_bits1(gb);
4088     }
4089
4090     if(i<total_coeff) {
4091         int level_code, mask;
4092         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4093         int prefix= get_level_prefix(gb);
4094
4095         //first coefficient has suffix_length equal to 0 or 1
4096         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4097             if(suffix_length)
4098                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4099             else
4100                 level_code= (prefix<<suffix_length); //part
4101         }else if(prefix==14){
4102             if(suffix_length)
4103                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4104             else
4105                 level_code= prefix + get_bits(gb, 4); //part
4106         }else{
4107             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4108             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4109             if(prefix>=16)
4110                 level_code += (1<<(prefix-3))-4096;
4111         }
4112
4113         if(trailing_ones < 3) level_code += 2;
4114
4115         suffix_length = 1;
4116         if(level_code > 5)
4117             suffix_length++;
4118         mask= -(level_code&1);
4119         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4120         i++;
4121
4122         //remaining coefficients have suffix_length > 0
4123         for(;i<total_coeff;i++) {
4124             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4125             prefix = get_level_prefix(gb);
4126             if(prefix<15){
4127                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4128             }else{
4129                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4130                 if(prefix>=16)
4131                     level_code += (1<<(prefix-3))-4096;
4132             }
4133             mask= -(level_code&1);
4134             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4135             if(level_code > suffix_limit[suffix_length])
4136                 suffix_length++;
4137         }
4138     }
4139
4140     if(total_coeff == max_coeff)
4141         zeros_left=0;
4142     else{
4143         if(n == CHROMA_DC_BLOCK_INDEX)
4144             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4145         else
4146             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4147     }
4148
4149     coeff_num = zeros_left + total_coeff - 1;
4150     j = scantable[coeff_num];
4151     if(n > 24){
4152         block[j] = level[0];
4153         for(i=1;i<total_coeff;i++) {
4154             if(zeros_left <= 0)
4155                 run_before = 0;
4156             else if(zeros_left < 7){
4157                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4158             }else{
4159                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4160             }
4161             zeros_left -= run_before;
4162             coeff_num -= 1 + run_before;
4163             j= scantable[ coeff_num ];
4164
4165             block[j]= level[i];
4166         }
4167     }else{
4168         block[j] = (level[0] * qmul[j] + 32)>>6;
4169         for(i=1;i<total_coeff;i++) {
4170             if(zeros_left <= 0)
4171                 run_before = 0;
4172             else if(zeros_left < 7){
4173                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4174             }else{
4175                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4176             }
4177             zeros_left -= run_before;
4178             coeff_num -= 1 + run_before;
4179             j= scantable[ coeff_num ];
4180
4181             block[j]= (level[i] * qmul[j] + 32)>>6;
4182         }
4183     }
4184
4185     if(zeros_left<0){
4186         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4187         return -1;
4188     }
4189
4190     return 0;
4191 }
4192
4193 static void predict_field_decoding_flag(H264Context *h){
4194     MpegEncContext * const s = &h->s;
4195     const int mb_xy= h->mb_xy;
4196     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4197                 ? s->current_picture.mb_type[mb_xy-1]
4198                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4199                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4200                 : 0;
4201     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4202 }
4203
4204 /**
4205  * decodes a P_SKIP or B_SKIP macroblock
4206  */
4207 static void decode_mb_skip(H264Context *h){
4208     MpegEncContext * const s = &h->s;
4209     const int mb_xy= h->mb_xy;
4210     int mb_type=0;
4211
4212     memset(h->non_zero_count[mb_xy], 0, 16);
4213     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4214
4215     if(MB_FIELD)
4216         mb_type|= MB_TYPE_INTERLACED;
4217
4218     if( h->slice_type_nos == FF_B_TYPE )
4219     {
4220         // just for fill_caches. pred_direct_motion will set the real mb_type
4221         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4222
4223         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4224         pred_direct_motion(h, &mb_type);
4225         mb_type|= MB_TYPE_SKIP;
4226     }
4227     else
4228     {
4229         int mx, my;
4230         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4231
4232         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4233         pred_pskip_motion(h, &mx, &my);
4234         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4235         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4236     }
4237
4238     write_back_motion(h, mb_type);
4239     s->current_picture.mb_type[mb_xy]= mb_type;
4240     s->current_picture.qscale_table[mb_xy]= s->qscale;
4241     h->slice_table[ mb_xy ]= h->slice_num;
4242     h->prev_mb_skipped= 1;
4243 }
4244
4245 /**
      * Decodes one macroblock of the current slice from the bitstream in s->gb.
      *
      * The syntax elements are read with the Exp-Golomb / fixed-length readers
      * (get_ue_golomb(), get_se_golomb(), get_te0_golomb(), get_bits*()) and the
      * coefficients with decode_residual(). In order, the function handles:
      * skip runs, the macroblock type, raw I_PCM samples, intra prediction modes
      * or inter references and motion vectors, the coded block pattern, the
      * quantizer delta and finally the residual blocks. Results are stored in the
      * H264Context caches and in the per-macroblock tables of s->current_picture.
      *
      * @param h decoder context; s->mb_x / s->mb_y select the macroblock
      * @returns 0 if OK, -1 if an error is noticed in the bitstream
      */
4249 static int decode_mb_cavlc(H264Context *h){
4250     MpegEncContext * const s = &h->s;
4251     int mb_xy;
4252     int partition_count;
4253     unsigned int mb_type, cbp;
4254     int dct8x8_allowed= h->pps.transform_8x8_mode;
4255
         /* linear index of this macroblock in the per-macroblock tables */
4256     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4257
4258     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4259
4260     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4261     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4262                 down the code */
         /* Skip runs are only parsed when the slice type is not I.
          * mb_skip_run == -1 means that a new run length still has to be read. */
4263     if(h->slice_type_nos != FF_I_TYPE){
4264         if(s->mb_skip_run==-1)
4265             s->mb_skip_run= get_ue_golomb(&s->gb);
4266
             /* A non-zero run marks this macroblock as skipped. The post-decrement
              * leaves the counter at -1 once the run is used up, which triggers
              * the read above on a later call. */
4267         if (s->mb_skip_run--) {
                 /* On even rows of an MBAFF frame the field flag is read only if
                  * the run ends with this macroblock, otherwise it is predicted. */
4268             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4269                 if(s->mb_skip_run==0)
4270                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4271                 else
4272                     predict_field_decoding_flag(h);
4273             }
4274             decode_mb_skip(h);
4275             return 0;
4276         }
4277     }
4278     if(FRAME_MBAFF){
4279         if( (s->mb_y&1) == 0 )
4280             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4281     }
4282
4283     h->prev_mb_skipped= 0;
4284
         /* mb_type first indexes the B or P table; values past the end of that
          * table are intra types and are rebased before the i_mb_type_info lookup. */
4285     mb_type= get_ue_golomb(&s->gb);
4286     if(h->slice_type_nos == FF_B_TYPE){
4287         if(mb_type < 23){
4288             partition_count= b_mb_type_info[mb_type].partition_count;
4289             mb_type=         b_mb_type_info[mb_type].type;
4290         }else{
4291             mb_type -= 23;
4292             goto decode_intra_mb;
4293         }
4294     }else if(h->slice_type_nos == FF_P_TYPE){
4295         if(mb_type < 5){
4296             partition_count= p_mb_type_info[mb_type].partition_count;
4297             mb_type=         p_mb_type_info[mb_type].type;
4298         }else{
4299             mb_type -= 5;
4300             goto decode_intra_mb;
4301         }
4302     }else{
4303        assert(h->slice_type_nos == FF_I_TYPE);
             /* in SI slices a non-zero mb_type is shifted down by one first */
4304         if(h->slice_type == FF_SI_TYPE && mb_type)
4305             mb_type--;
4306 decode_intra_mb:
4307         if(mb_type > 25){
4308             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4309             return -1;
4310         }
4311         partition_count=0;
4312         cbp= i_mb_type_info[mb_type].cbp;
4313         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4314         mb_type= i_mb_type_info[mb_type].type;
4315     }
4316
4317     if(MB_FIELD)
4318         mb_type |= MB_TYPE_INTERLACED;
4319
4320     h->slice_table[ mb_xy ]= h->slice_num;
4321
4322     if(IS_INTRA_PCM(mb_type)){
4323         unsigned int x;
4324
4325         // We assume these blocks are very rare so we do not optimize it.
4326         align_get_bits(&s->gb);
4327
4328         // The pixels are stored in the same order as levels in h->mb array.
4329         for(x=0; x < (CHROMA ? 384 : 256); x++){
4330             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4331         }
4332
4333         // In deblocking, the quantizer is 0
4334         s->current_picture.qscale_table[mb_xy]= 0;
4335         // All coeffs are present
4336         memset(h->non_zero_count[mb_xy], 16, 16);
4337
4338         s->current_picture.mb_type[mb_xy]= mb_type;
4339         return 0;
4340     }
4341
         /* The reference counts are doubled while an MBAFF field macroblock is
          * decoded and halved again at the end of this function.
          * NOTE(review): the "return -1" paths below leave without undoing this
          * doubling - confirm that callers re-initialize ref_count after an error. */
4342     if(MB_MBAFF){
4343         h->ref_count[0] <<= 1;
4344         h->ref_count[1] <<= 1;
4345     }
4346
4347     fill_caches(h, mb_type, 0);
4348
4349     //mb_pred
4350     if(IS_INTRA(mb_type)){
4351         int pred_mode;
4352 //            init_top_left_availability(h);
4353         if(IS_INTRA4x4(mb_type)){
4354             int i;
4355             int di = 1;
                 /* Optional 8x8 transform flag: when set, one prediction mode is
                  * coded per 8x8 block (di = 4) and copied to the 2x2 group of
                  * cache entries of that block. */
4356             if(dct8x8_allowed && get_bits1(&s->gb)){
4357                 mb_type |= MB_TYPE_8x8DCT;
4358                 di = 4;
4359             }
4360
4361 //                fill_intra4x4_pred_table(h);
4362             for(i=0; i<16; i+=di){
4363                 int mode= pred_intra_mode(h, i);
4364
                     /* A zero bit means the mode is coded explicitly as a 3 bit
                      * value that skips over the predicted mode. */
4365                 if(!get_bits1(&s->gb)){
4366                     const int rem_mode= get_bits(&s->gb, 3);
4367                     mode = rem_mode + (rem_mode >= mode);
4368                 }
4369
4370                 if(di==4)
4371                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4372                 else
4373                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4374             }
4375             write_back_intra_pred_mode(h);
4376             if( check_intra4x4_pred_mode(h) < 0)
4377                 return -1;
4378         }else{
4379             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4380             if(h->intra16x16_pred_mode < 0)
4381                 return -1;
4382         }
4383         if(CHROMA){
4384             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4385             if(pred_mode < 0)
4386                 return -1;
4387             h->chroma_pred_mode= pred_mode;
4388         }
4389     }else if(partition_count==4){
             /* Four 8x8 partitions: read a sub_mb_type for each of them, then all
              * reference indices, then the motion vectors. */
4390         int i, j, sub_partition_count[4], list, ref[2][4];
4391
4392         if(h->slice_type_nos == FF_B_TYPE){
4393             for(i=0; i<4; i++){
4394                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4395                 if(h->sub_mb_type[i] >=13){
4396                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4397                     return -1;
4398                 }
4399                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4400                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4401             }
4402             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4403                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4404                 pred_direct_motion(h, &mb_type);
4405                 h->ref_cache[0][scan8[4]] =
4406                 h->ref_cache[1][scan8[4]] =
4407                 h->ref_cache[0][scan8[12]] =
4408                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4409             }
4410         }else{
4411             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4412             for(i=0; i<4; i++){
4413                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4414                 if(h->sub_mb_type[i] >=4){
4415                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4416                     return -1;
4417                 }
4418                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4419                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4420             }
4421         }
4422
             /* Pass 1: one reference index per 8x8 partition and list
              * (-1 when the partition does not use that list). */
4423         for(list=0; list<h->list_count; list++){
4424             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4425             for(i=0; i<4; i++){
4426                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4427                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4428                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4429                     if(tmp>=ref_count){
4430                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4431                         return -1;
4432                     }
4433                     ref[list][i]= tmp;
4434                 }else{
4435                  //FIXME
4436                     ref[list][i] = -1;
4437                 }
4438             }
4439         }
4440
4441         if(dct8x8_allowed)
4442             dct8x8_allowed = get_dct8x8_allowed(h);
4443
             /* Pass 2: store the references in the cache and decode one motion
              * vector per sub-partition as the prediction from pred_motion() plus
              * the difference read with get_se_golomb(). */
4444         for(list=0; list<h->list_count; list++){
4445             for(i=0; i<4; i++){
4446                 if(IS_DIRECT(h->sub_mb_type[i])) {
4447                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4448                     continue;
4449                 }
4450                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4451                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4452
4453                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4454                     const int sub_mb_type= h->sub_mb_type[i];
4455                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4456                     for(j=0; j<sub_partition_count[i]; j++){
4457                         int mx, my;
4458                         const int index= 4*i + block_width*j;
4459                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4460                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4461                         mx += get_se_golomb(&s->gb);
4462                         my += get_se_golomb(&s->gb);
4463                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4464
                             /* replicate the vector over every cache entry that
                              * the sub-partition shape covers */
4465                         if(IS_SUB_8X8(sub_mb_type)){
4466                             mv_cache[ 1 ][0]=
4467                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4468                             mv_cache[ 1 ][1]=
4469                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4470                         }else if(IS_SUB_8X4(sub_mb_type)){
4471                             mv_cache[ 1 ][0]= mx;
4472                             mv_cache[ 1 ][1]= my;
4473                         }else if(IS_SUB_4X8(sub_mb_type)){
4474                             mv_cache[ 8 ][0]= mx;
4475                             mv_cache[ 8 ][1]= my;
4476                         }
4477                         mv_cache[ 0 ][0]= mx;
4478                         mv_cache[ 0 ][1]= my;
4479                     }
4480                 }else{
                         /* list unused by this partition: zero its four vectors */
4481                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4482                     p[0] = p[1]=
4483                     p[8] = p[9]= 0;
4484                 }
4485             }
4486         }
4487     }else if(IS_DIRECT(mb_type)){
4488         pred_direct_motion(h, &mb_type);
4489         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4490     }else{
             /* 16x16, 16x8 or 8x16 partitions: for each shape all reference
              * indices are read first, then the motion vectors. */
4491         int list, mx, my, i;
4492          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4493         if(IS_16X16(mb_type)){
4494             for(list=0; list<h->list_count; list++){
4495                     unsigned int val;
4496                     if(IS_DIR(mb_type, 0, list)){
4497                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4498                         if(val >= h->ref_count[list]){
4499                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4500                             return -1;
4501                         }
4502                     }else
4503                         val= LIST_NOT_USED&0xFF;
4504                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4505             }
4506             for(list=0; list<h->list_count; list++){
4507                 unsigned int val;
4508                 if(IS_DIR(mb_type, 0, list)){
4509                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4510                     mx += get_se_golomb(&s->gb);
4511                     my += get_se_golomb(&s->gb);
4512                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4513
4514                     val= pack16to32(mx,my);
4515                 }else
4516                     val=0;
4517                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4518             }
4519         }
4520         else if(IS_16X8(mb_type)){
4521             for(list=0; list<h->list_count; list++){
4522                     for(i=0; i<2; i++){
4523                         unsigned int val;
4524                         if(IS_DIR(mb_type, i, list)){
4525                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4526                             if(val >= h->ref_count[list]){
4527                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4528                                 return -1;
4529                             }
4530                         }else
4531                             val= LIST_NOT_USED&0xFF;
4532                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4533                     }
4534             }
4535             for(list=0; list<h->list_count; list++){
4536                 for(i=0; i<2; i++){
4537                     unsigned int val;
4538                     if(IS_DIR(mb_type, i, list)){
4539                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4540                         mx += get_se_golomb(&s->gb);
4541                         my += get_se_golomb(&s->gb);
4542                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4543
4544                         val= pack16to32(mx,my);
4545                     }else
4546                         val=0;
4547                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4548                 }
4549             }
4550         }else{
4551             assert(IS_8X16(mb_type));
4552             for(list=0; list<h->list_count; list++){
4553                     for(i=0; i<2; i++){
4554                         unsigned int val;
4555                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4556                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4557                             if(val >= h->ref_count[list]){
4558                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4559                                 return -1;
4560                             }
4561                         }else
4562                             val= LIST_NOT_USED&0xFF;
4563                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4564                     }
4565             }
4566             for(list=0; list<h->list_count; list++){
4567                 for(i=0; i<2; i++){
4568                     unsigned int val;
4569                     if(IS_DIR(mb_type, i, list)){
4570                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4571                         mx += get_se_golomb(&s->gb);
4572                         my += get_se_golomb(&s->gb);
4573                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4574
4575                         val= pack16to32(mx,my);
4576                     }else
4577                         val=0;
4578                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4579                 }
4580             }
4581         }
4582     }
4583
4584     if(IS_INTER(mb_type))
4585         write_back_motion(h, mb_type);
4586
         /* Intra16x16 already took its cbp from i_mb_type_info; every other type
          * reads a code number and maps it through the matching golomb_to_*_cbp
          * table. */
4587     if(!IS_INTRA16x16(mb_type)){
4588         cbp= get_ue_golomb(&s->gb);
4589         if(cbp > 47){
4590             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4591             return -1;
4592         }
4593
4594         if(CHROMA){
4595             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4596             else                     cbp= golomb_to_inter_cbp   [cbp];
4597         }else{
4598             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4599             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4600         }
4601     }
4602     h->cbp = cbp;
4603
         /* For non-intra macroblocks the 8x8 transform flag is only read when at
          * least one of the four low cbp bits (the per-8x8 bits tested in the
          * residual loop below) is set. */
4604     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4605         if(get_bits1(&s->gb)){
4606             mb_type |= MB_TYPE_8x8DCT;
4607             h->cbp_table[mb_xy]= cbp;
4608         }
4609     }
4610     s->current_picture.mb_type[mb_xy]= mb_type;
4611
4612     if(cbp || IS_INTRA16x16(mb_type)){
4613         int i8x8, i4x4, chroma_idx;
4614         int dquant;
4615         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4616         const uint8_t *scan, *scan8x8, *dc_scan;
4617
4618 //        fill_non_zero_count_cache(h);
4619
             /* pick the scan tables: field or zigzag order, with separate "_q0"
              * variants when qscale is 0 */
4620         if(IS_INTERLACED(mb_type)){
4621             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4622             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4623             dc_scan= luma_dc_field_scan;
4624         }else{
4625             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4626             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4627             dc_scan= luma_dc_zigzag_scan;
4628         }
4629
4630         dquant= get_se_golomb(&s->gb);
4631
4632         if( dquant > 25 || dquant < -26 ){
4633             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4634             return -1;
4635         }
4636
             /* apply the delta and wrap qscale back into 0..51 */
4637         s->qscale += dquant;
4638         if(((unsigned)s->qscale) > 51){
4639             if(s->qscale<0) s->qscale+= 52;
4640             else            s->qscale-= 52;
4641         }
4642
4643         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4644         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
             /* Intra16x16: one DC block of 16 coefficients, then (if cbp&15) 15
              * coefficients for each of the 16 blocks, scanned from position 1. */
4645         if(IS_INTRA16x16(mb_type)){
4646             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4647                 return -1; //FIXME continue if partitioned and other return -1 too
4648             }
4649
4650             assert((cbp&15) == 0 || (cbp&15) == 15);
4651
4652             if(cbp&15){
4653                 for(i8x8=0; i8x8<4; i8x8++){
4654                     for(i4x4=0; i4x4<4; i4x4++){
4655                         const int index= i4x4 + 4*i8x8;
4656                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4657                             return -1;
4658                         }
4659                     }
4660                 }
4661             }else{
4662                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4663             }
4664         }else{
                 /* other types: cbp bit i8x8 says whether that 8x8 block is coded */
4665             for(i8x8=0; i8x8<4; i8x8++){
4666                 if(cbp & (1<<i8x8)){
4667                     if(IS_8x8DCT(mb_type)){
4668                         DCTELEM *buf = &h->mb[64*i8x8];
4669                         uint8_t *nnz;
4670                         for(i4x4=0; i4x4<4; i4x4++){
4671                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4672                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4673                                 return -1;
4674                         }
                             /* sum the four 4x4 counts into the first cache entry
                              * of this 8x8 block */
4675                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4676                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4677                     }else{
4678                         for(i4x4=0; i4x4<4; i4x4++){
4679                             const int index= i4x4 + 4*i8x8;
4680
4681                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4682                                 return -1;
4683                             }
4684                         }
4685                     }
4686                 }else{
4687                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4688                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4689                 }
4690             }
4691         }
4692
             /* cbp bits 4-5 (0x30): a 4 coefficient chroma DC block per plane;
              * bit 5 (0x20) additionally codes 15 coefficients for each chroma
              * 4x4 block. */
4693         if(cbp&0x30){
4694             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4695                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4696                     return -1;
4697                 }
4698         }
4699
4700         if(cbp&0x20){
4701             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4702                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4703                 for(i4x4=0; i4x4<4; i4x4++){
4704                     const int index= 16 + 4*chroma_idx + i4x4;
4705                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4706                         return -1;
4707                     }
4708                 }
4709             }
4710         }else{
4711             uint8_t * const nnz= &h->non_zero_count_cache[0];
4712             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4713             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4714         }
4715     }else{
             /* nothing is coded: clear every non-zero count in the cache */
4716         uint8_t * const nnz= &h->non_zero_count_cache[0];
4717         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4718         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4719         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4720     }
         /* store the final quantizer and the non-zero counts of this macroblock */
4721     s->current_picture.qscale_table[mb_xy]= s->qscale;
4722     write_back_non_zero_count(h);
4723
         /* undo the doubling applied before fill_caches() */
4724     if(MB_MBAFF){
4725         h->ref_count[0] >>= 1;
4726         h->ref_count[1] >>= 1;
4727     }
4728
4729     return 0;
4730 }
4731
4732 static int decode_cabac_field_decoding_flag(H264Context *h) {
4733     MpegEncContext * const s = &h->s;
4734     const int mb_x = s->mb_x;
4735     const int mb_y = s->mb_y & ~1;
4736     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4737     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4738
4739     unsigned int ctx = 0;
4740
4741     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4742         ctx += 1;
4743     }
4744     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4745         ctx += 1;
4746     }
4747
4748     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4749 }
4750
4751 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4752     uint8_t *state= &h->cabac_state[ctx_base];
4753     int mb_type;
4754
4755     if(intra_slice){
4756         MpegEncContext * const s = &h->s;
4757         const int mba_xy = h->left_mb_xy[0];
4758         const int mbb_xy = h->top_mb_xy;
4759         int ctx=0;
4760         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4761             ctx++;
4762         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4763             ctx++;
4764         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4765             return 0;   /* I4x4 */
4766         state += 2;
4767     }else{
4768         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4769             return 0;   /* I4x4 */
4770     }
4771
4772     if( get_cabac_terminate( &h->cabac ) )
4773         return 25;  /* PCM */
4774
4775     mb_type = 1; /* I16x16 */
4776     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4777     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4778         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4779     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4780     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4781     return mb_type;
4782 }
4783
4784 static int decode_cabac_mb_type( H264Context *h ) {
4785     MpegEncContext * const s = &h->s;
4786
4787     if( h->slice_type_nos == FF_I_TYPE ) {
4788         return decode_cabac_intra_mb_type(h, 3, 1);
4789     } else if( h->slice_type_nos == FF_P_TYPE ) {
4790         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4791             /* P-type */
4792             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4793                 /* P_L0_D16x16, P_8x8 */
4794                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4795             } else {
4796                 /* P_L0_D8x16, P_L0_D16x8 */
4797                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4798             }
4799         } else {
4800             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4801         }
4802     } else if( h->slice_type_nos == FF_B_TYPE ) {
4803         const int mba_xy = h->left_mb_xy[0];
4804         const int mbb_xy = h->top_mb_xy;
4805         int ctx = 0;
4806         int bits;
4807
4808         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4809             ctx++;
4810         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4811             ctx++;
4812
4813         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4814             return 0; /* B_Direct_16x16 */
4815
4816         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4817             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4818         }
4819
4820         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4821         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4822         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4823         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4824         if( bits < 8 )
4825             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4826         else if( bits == 13 ) {
4827             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4828         } else if( bits == 14 )
4829             return 11; /* B_L1_L0_8x16 */
4830         else if( bits == 15 )
4831             return 22; /* B_8x8 */
4832
4833         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4834         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4835     } else {
4836         /* TODO SI/SP frames? */
4837         return -1;
4838     }
4839 }
4840
4841 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4842     MpegEncContext * const s = &h->s;
4843     int mba_xy, mbb_xy;
4844     int ctx = 0;
4845
4846     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4847         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4848         mba_xy = mb_xy - 1;
4849         if( (mb_y&1)
4850             && h->slice_table[mba_xy] == h->slice_num
4851             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4852             mba_xy += s->mb_stride;
4853         if( MB_FIELD ){
4854             mbb_xy = mb_xy - s->mb_stride;
4855             if( !(mb_y&1)
4856                 && h->slice_table[mbb_xy] == h->slice_num
4857                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4858                 mbb_xy -= s->mb_stride;
4859         }else
4860             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4861     }else{
4862         int mb_xy = h->mb_xy;
4863         mba_xy = mb_xy - 1;
4864         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4865     }
4866
4867     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4868         ctx++;
4869     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4870         ctx++;
4871
4872     if( h->slice_type_nos == FF_B_TYPE )
4873         ctx += 13;
4874     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4875 }
4876
4877 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4878     int mode = 0;
4879
4880     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4881         return pred_mode;
4882
4883     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4884     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4885     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4886
4887     if( mode >= pred_mode )
4888         return mode + 1;
4889     else
4890         return mode;
4891 }
4892
4893 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4894     const int mba_xy = h->left_mb_xy[0];
4895     const int mbb_xy = h->top_mb_xy;
4896
4897     int ctx = 0;
4898
4899     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4900     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4901         ctx++;
4902
4903     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4904         ctx++;
4905
4906     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4907         return 0;
4908
4909     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4910         return 1;
4911     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4912         return 2;
4913     else
4914         return 3;
4915 }
4916
4917 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4918     int cbp_b, cbp_a, ctx, cbp = 0;
4919
4920     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4921     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4922
4923     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4924     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4925     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4926     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4927     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4928     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4929     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4930     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4931     return cbp;
4932 }
4933 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4934     int ctx;
4935     int cbp_a, cbp_b;
4936
4937     cbp_a = (h->left_cbp>>4)&0x03;
4938     cbp_b = (h-> top_cbp>>4)&0x03;
4939
4940     ctx = 0;
4941     if( cbp_a > 0 ) ctx++;
4942     if( cbp_b > 0 ) ctx += 2;
4943     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4944         return 0;
4945
4946     ctx = 4;
4947     if( cbp_a == 2 ) ctx++;
4948     if( cbp_b == 2 ) ctx += 2;
4949     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4950 }
4951 static int decode_cabac_mb_dqp( H264Context *h) {
4952     int   ctx = 0;
4953     int   val = 0;
4954
4955     if( h->last_qscale_diff != 0 )
4956         ctx++;
4957
4958     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4959         if( ctx < 2 )
4960             ctx = 2;
4961         else
4962             ctx = 3;
4963         val++;
4964         if(val > 102) //prevent infinite loop
4965             return INT_MIN;
4966     }
4967
4968     if( val&0x01 )
4969         return (val + 1)/2;
4970     else
4971         return -(val + 1)/2;
4972 }
4973 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4974     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4975         return 0;   /* 8x8 */
4976     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4977         return 1;   /* 8x4 */
4978     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4979         return 2;   /* 4x8 */
4980     return 3;       /* 4x4 */
4981 }
4982 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4983     int type;
4984     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4985         return 0;   /* B_Direct_8x8 */
4986     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4987         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4988     type = 3;
4989     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4990         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4991             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4992         type += 4;
4993     }
4994     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4995     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4996     return type;
4997 }
4998
4999 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5000     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5001 }
5002
5003 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5004     int refa = h->ref_cache[list][scan8[n] - 1];
5005     int refb = h->ref_cache[list][scan8[n] - 8];
5006     int ref  = 0;
5007     int ctx  = 0;
5008
5009     if( h->slice_type_nos == FF_B_TYPE) {
5010         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5011             ctx++;
5012         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5013             ctx += 2;
5014     } else {
5015         if( refa > 0 )
5016             ctx++;
5017         if( refb > 0 )
5018             ctx += 2;
5019     }
5020
5021     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5022         ref++;
5023         if( ctx < 4 )
5024             ctx = 4;
5025         else
5026             ctx = 5;
5027         if(ref >= 32 /*h->ref_list[list]*/){
5028             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5029             return 0; //FIXME we should return -1 and check the return everywhere
5030         }
5031     }
5032     return ref;
5033 }
5034
5035 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5036     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5037                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5038     int ctxbase = (l == 0) ? 40 : 47;
5039     int ctx, mvd;
5040
5041     if( amvd < 3 )
5042         ctx = 0;
5043     else if( amvd > 32 )
5044         ctx = 2;
5045     else
5046         ctx = 1;
5047
5048     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5049         return 0;
5050
5051     mvd= 1;
5052     ctx= 3;
5053     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5054         mvd++;
5055         if( ctx < 6 )
5056             ctx++;
5057     }
5058
5059     if( mvd >= 9 ) {
5060         int k = 3;
5061         while( get_cabac_bypass( &h->cabac ) ) {
5062             mvd += 1 << k;
5063             k++;
5064             if(k>24){
5065                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5066                 return INT_MIN;
5067             }
5068         }
5069         while( k-- ) {
5070             if( get_cabac_bypass( &h->cabac ) )
5071                 mvd += 1 << k;
5072         }
5073     }
5074     return get_cabac_bypass_sign( &h->cabac, -mvd );
5075 }
5076
5077 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5078     int nza, nzb;
5079     int ctx = 0;
5080
5081     if( is_dc ) {
5082         if( cat == 0 ) {
5083             nza = h->left_cbp&0x100;
5084             nzb = h-> top_cbp&0x100;
5085         } else {
5086             nza = (h->left_cbp>>(6+idx))&0x01;
5087             nzb = (h-> top_cbp>>(6+idx))&0x01;
5088         }
5089     } else {
5090         if( cat == 4 ) {
5091             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5092             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5093         } else {
5094             assert(cat == 1 || cat == 2);
5095             nza = h->non_zero_count_cache[scan8[idx] - 1];
5096             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5097         }
5098     }
5099
5100     if( nza > 0 )
5101         ctx++;
5102
5103     if( nzb > 0 )
5104         ctx += 2;
5105
5106     return ctx + 4 * cat;
5107 }
5108
5109 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5110     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5111     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5112     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5113     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5114 };
5115
5116 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5117     static const int significant_coeff_flag_offset[2][6] = {
5118       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5119       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5120     };
5121     static const int last_coeff_flag_offset[2][6] = {
5122       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5123       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5124     };
5125     static const int coeff_abs_level_m1_offset[6] = {
5126         227+0, 227+10, 227+20, 227+30, 227+39, 426
5127     };
5128     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5129       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5130         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5131         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5132        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5133       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5134         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5135         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5136         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5137     };
5138     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5139      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5140      * map node ctx => cabac ctx for level=1 */
5141     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5142     /* map node ctx => cabac ctx for level>1 */
5143     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5144     static const uint8_t coeff_abs_level_transition[2][8] = {
5145     /* update node ctx after decoding a level=1 */
5146         { 1, 2, 3, 3, 4, 5, 6, 7 },
5147     /* update node ctx after decoding a level>1 */
5148         { 4, 4, 4, 4, 5, 6, 7, 7 }
5149     };
5150
5151     int index[64];
5152
5153     int av_unused last;
5154     int coeff_count = 0;
5155     int node_ctx = 0;
5156
5157     uint8_t *significant_coeff_ctx_base;
5158     uint8_t *last_coeff_ctx_base;
5159     uint8_t *abs_level_m1_ctx_base;
5160
5161 #ifndef ARCH_X86
5162 #define CABAC_ON_STACK
5163 #endif
5164 #ifdef CABAC_ON_STACK
5165 #define CC &cc
5166     CABACContext cc;
5167     cc.range     = h->cabac.range;
5168     cc.low       = h->cabac.low;
5169     cc.bytestream= h->cabac.bytestream;
5170 #else
5171 #define CC &h->cabac
5172 #endif
5173
5174
5175     /* cat: 0-> DC 16x16  n = 0
5176      *      1-> AC 16x16  n = luma4x4idx
5177      *      2-> Luma4x4   n = luma4x4idx
5178      *      3-> DC Chroma n = iCbCr
5179      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5180      *      5-> Luma8x8   n = 4 * luma8x8idx
5181      */
5182
5183     /* read coded block flag */
5184     if( is_dc || cat != 5 ) {
5185         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5186             if( !is_dc ) {
5187                 if( cat == 4 )
5188                     h->non_zero_count_cache[scan8[16+n]] = 0;
5189                 else
5190                     h->non_zero_count_cache[scan8[n]] = 0;
5191             }
5192
5193 #ifdef CABAC_ON_STACK
5194             h->cabac.range     = cc.range     ;
5195             h->cabac.low       = cc.low       ;
5196             h->cabac.bytestream= cc.bytestream;
5197 #endif
5198             return;
5199         }
5200     }
5201
5202     significant_coeff_ctx_base = h->cabac_state
5203         + significant_coeff_flag_offset[MB_FIELD][cat];
5204     last_coeff_ctx_base = h->cabac_state
5205         + last_coeff_flag_offset[MB_FIELD][cat];
5206     abs_level_m1_ctx_base = h->cabac_state
5207         + coeff_abs_level_m1_offset[cat];
5208
5209     if( !is_dc && cat == 5 ) {
5210 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5211         for(last= 0; last < coefs; last++) { \
5212             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5213             if( get_cabac( CC, sig_ctx )) { \
5214                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5215                 index[coeff_count++] = last; \
5216                 if( get_cabac( CC, last_ctx ) ) { \
5217                     last= max_coeff; \
5218                     break; \
5219                 } \
5220             } \
5221         }\
5222         if( last == max_coeff -1 ) {\
5223             index[coeff_count++] = last;\
5224         }
5225         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5226 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5227         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5228     } else {
5229         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5230 #else
5231         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5232     } else {
5233         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5234 #endif
5235     }
5236     assert(coeff_count > 0);
5237
5238     if( is_dc ) {
5239         if( cat == 0 )
5240             h->cbp_table[h->mb_xy] |= 0x100;
5241         else
5242             h->cbp_table[h->mb_xy] |= 0x40 << n;
5243     } else {
5244         if( cat == 5 )
5245             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5246         else if( cat == 4 )
5247             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5248         else {
5249             assert( cat == 1 || cat == 2 );
5250             h->non_zero_count_cache[scan8[n]] = coeff_count;
5251         }
5252     }
5253
5254     do {
5255         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5256
5257         int j= scantable[index[--coeff_count]];
5258
5259         if( get_cabac( CC, ctx ) == 0 ) {
5260             node_ctx = coeff_abs_level_transition[0][node_ctx];
5261             if( is_dc ) {
5262                 block[j] = get_cabac_bypass_sign( CC, -1);
5263             }else{
5264                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5265             }
5266         } else {
5267             int coeff_abs = 2;
5268             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5269             node_ctx = coeff_abs_level_transition[1][node_ctx];
5270
5271             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5272                 coeff_abs++;
5273             }
5274
5275             if( coeff_abs >= 15 ) {
5276                 int j = 0;
5277                 while( get_cabac_bypass( CC ) ) {
5278                     j++;
5279                 }
5280
5281                 coeff_abs=1;
5282                 while( j-- ) {
5283                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5284                 }
5285                 coeff_abs+= 14;
5286             }
5287
5288             if( is_dc ) {
5289                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5290             }else{
5291                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5292             }
5293         }
5294     } while( coeff_count );
5295 #ifdef CABAC_ON_STACK
5296             h->cabac.range     = cc.range     ;
5297             h->cabac.low       = cc.low       ;
5298             h->cabac.bytestream= cc.bytestream;
5299 #endif
5300
5301 }
5302
5303 #ifndef CONFIG_SMALL
5304 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5305     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5306 }
5307
5308 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5309     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5310 }
5311 #endif
5312
5313 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5314 #ifdef CONFIG_SMALL
5315     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5316 #else
5317     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5318     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5319 #endif
5320 }
5321
5322 static inline void compute_mb_neighbors(H264Context *h)
5323 {
5324     MpegEncContext * const s = &h->s;
5325     const int mb_xy  = h->mb_xy;
5326     h->top_mb_xy     = mb_xy - s->mb_stride;
5327     h->left_mb_xy[0] = mb_xy - 1;
5328     if(FRAME_MBAFF){
5329         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5330         const int top_pair_xy      = pair_xy     - s->mb_stride;
5331         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5332         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5333         const int curr_mb_frame_flag = !MB_FIELD;
5334         const int bottom = (s->mb_y & 1);
5335         if (bottom
5336                 ? !curr_mb_frame_flag // bottom macroblock
5337                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5338                 ) {
5339             h->top_mb_xy -= s->mb_stride;
5340         }
5341         if (left_mb_frame_flag != curr_mb_frame_flag) {
5342             h->left_mb_xy[0] = pair_xy - 1;
5343         }
5344     } else if (FIELD_PICTURE) {
5345         h->top_mb_xy -= s->mb_stride;
5346     }
5347     return;
5348 }
5349
5350 /**
5351  * decodes a macroblock
5352  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5353  */
5354 static int decode_mb_cabac(H264Context *h) {
5355     MpegEncContext * const s = &h->s;
5356     int mb_xy;
5357     int mb_type, partition_count, cbp = 0;
5358     int dct8x8_allowed= h->pps.transform_8x8_mode;
5359
5360     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5361
5362     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5363
5364     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5365     if( h->slice_type_nos != FF_I_TYPE ) {
5366         int skip;
5367         /* a skipped mb needs the aff flag from the following mb */
5368         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5369             predict_field_decoding_flag(h);
5370         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5371             skip = h->next_mb_skipped;
5372         else
5373             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5374         /* read skip flags */
5375         if( skip ) {
5376             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5377                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5378                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5379                 if(h->next_mb_skipped)
5380                     predict_field_decoding_flag(h);
5381                 else
5382                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5383             }
5384
5385             decode_mb_skip(h);
5386
5387             h->cbp_table[mb_xy] = 0;
5388             h->chroma_pred_mode_table[mb_xy] = 0;
5389             h->last_qscale_diff = 0;
5390
5391             return 0;
5392
5393         }
5394     }
5395     if(FRAME_MBAFF){
5396         if( (s->mb_y&1) == 0 )
5397             h->mb_mbaff =
5398             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5399     }
5400
5401     h->prev_mb_skipped = 0;
5402
5403     compute_mb_neighbors(h);
5404     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5405         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5406         return -1;
5407     }
5408
5409     if( h->slice_type_nos == FF_B_TYPE ) {
5410         if( mb_type < 23 ){
5411             partition_count= b_mb_type_info[mb_type].partition_count;
5412             mb_type=         b_mb_type_info[mb_type].type;
5413         }else{
5414             mb_type -= 23;
5415             goto decode_intra_mb;
5416         }
5417     } else if( h->slice_type_nos == FF_P_TYPE ) {
5418         if( mb_type < 5) {
5419             partition_count= p_mb_type_info[mb_type].partition_count;
5420             mb_type=         p_mb_type_info[mb_type].type;
5421         } else {
5422             mb_type -= 5;
5423             goto decode_intra_mb;
5424         }
5425     } else {
5426         if(h->slice_type == FF_SI_TYPE && mb_type)
5427             mb_type--;
5428         assert(h->slice_type_nos == FF_I_TYPE);
5429 decode_intra_mb:
5430         partition_count = 0;
5431         cbp= i_mb_type_info[mb_type].cbp;
5432         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5433         mb_type= i_mb_type_info[mb_type].type;
5434     }
5435     if(MB_FIELD)
5436         mb_type |= MB_TYPE_INTERLACED;
5437
5438     h->slice_table[ mb_xy ]= h->slice_num;
5439
5440     if(IS_INTRA_PCM(mb_type)) {
5441         const uint8_t *ptr;
5442
5443         // We assume these blocks are very rare so we do not optimize it.
5444         // FIXME The two following lines get the bitstream position in the cabac
5445         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5446         ptr= h->cabac.bytestream;
5447         if(h->cabac.low&0x1) ptr--;
5448         if(CABAC_BITS==16){
5449             if(h->cabac.low&0x1FF) ptr--;
5450         }
5451
5452         // The pixels are stored in the same order as levels in h->mb array.
5453         memcpy(h->mb, ptr, 256); ptr+=256;
5454         if(CHROMA){
5455             memcpy(h->mb+128, ptr, 128); ptr+=128;
5456         }
5457
5458         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5459
5460         // All blocks are present
5461         h->cbp_table[mb_xy] = 0x1ef;
5462         h->chroma_pred_mode_table[mb_xy] = 0;
5463         // In deblocking, the quantizer is 0
5464         s->current_picture.qscale_table[mb_xy]= 0;
5465         // All coeffs are present
5466         memset(h->non_zero_count[mb_xy], 16, 16);
5467         s->current_picture.mb_type[mb_xy]= mb_type;
5468         h->last_qscale_diff = 0;
5469         return 0;
5470     }
5471
5472     if(MB_MBAFF){
5473         h->ref_count[0] <<= 1;
5474         h->ref_count[1] <<= 1;
5475     }
5476
5477     fill_caches(h, mb_type, 0);
5478
5479     if( IS_INTRA( mb_type ) ) {
5480         int i, pred_mode;
5481         if( IS_INTRA4x4( mb_type ) ) {
5482             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5483                 mb_type |= MB_TYPE_8x8DCT;
5484                 for( i = 0; i < 16; i+=4 ) {
5485                     int pred = pred_intra_mode( h, i );
5486                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5487                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5488                 }
5489             } else {
5490                 for( i = 0; i < 16; i++ ) {
5491                     int pred = pred_intra_mode( h, i );
5492                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5493
5494                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5495                 }
5496             }
5497             write_back_intra_pred_mode(h);
5498             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5499         } else {
5500             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5501             if( h->intra16x16_pred_mode < 0 ) return -1;
5502         }
5503         if(CHROMA){
5504             h->chroma_pred_mode_table[mb_xy] =
5505             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5506
5507             pred_mode= check_intra_pred_mode( h, pred_mode );
5508             if( pred_mode < 0 ) return -1;
5509             h->chroma_pred_mode= pred_mode;
5510         }
5511     } else if( partition_count == 4 ) {
5512         int i, j, sub_partition_count[4], list, ref[2][4];
5513
5514         if( h->slice_type_nos == FF_B_TYPE ) {
5515             for( i = 0; i < 4; i++ ) {
5516                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5517                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5518                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5519             }
5520             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5521                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5522                 pred_direct_motion(h, &mb_type);
5523                 h->ref_cache[0][scan8[4]] =
5524                 h->ref_cache[1][scan8[4]] =
5525                 h->ref_cache[0][scan8[12]] =
5526                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5527                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5528                     for( i = 0; i < 4; i++ )
5529                         if( IS_DIRECT(h->sub_mb_type[i]) )
5530                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5531                 }
5532             }
5533         } else {
5534             for( i = 0; i < 4; i++ ) {
5535                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5536                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5537                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5538             }
5539         }
5540
5541         for( list = 0; list < h->list_count; list++ ) {
5542                 for( i = 0; i < 4; i++ ) {
5543                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5544                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5545                         if( h->ref_count[list] > 1 )
5546                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5547                         else
5548                             ref[list][i] = 0;
5549                     } else {
5550                         ref[list][i] = -1;
5551                     }
5552                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5553                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5554                 }
5555         }
5556
5557         if(dct8x8_allowed)
5558             dct8x8_allowed = get_dct8x8_allowed(h);
5559
5560         for(list=0; list<h->list_count; list++){
5561             for(i=0; i<4; i++){
5562                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5563                 if(IS_DIRECT(h->sub_mb_type[i])){
5564                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5565                     continue;
5566                 }
5567
5568                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5569                     const int sub_mb_type= h->sub_mb_type[i];
5570                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5571                     for(j=0; j<sub_partition_count[i]; j++){
5572                         int mpx, mpy;
5573                         int mx, my;
5574                         const int index= 4*i + block_width*j;
5575                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5576                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5577                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5578
5579                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5580                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5581                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5582
5583                         if(IS_SUB_8X8(sub_mb_type)){
5584                             mv_cache[ 1 ][0]=
5585                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5586                             mv_cache[ 1 ][1]=
5587                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5588
5589                             mvd_cache[ 1 ][0]=
5590                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5591                             mvd_cache[ 1 ][1]=
5592                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5593                         }else if(IS_SUB_8X4(sub_mb_type)){
5594                             mv_cache[ 1 ][0]= mx;
5595                             mv_cache[ 1 ][1]= my;
5596
5597                             mvd_cache[ 1 ][0]= mx - mpx;
5598                             mvd_cache[ 1 ][1]= my - mpy;
5599                         }else if(IS_SUB_4X8(sub_mb_type)){
5600                             mv_cache[ 8 ][0]= mx;
5601                             mv_cache[ 8 ][1]= my;
5602
5603                             mvd_cache[ 8 ][0]= mx - mpx;
5604                             mvd_cache[ 8 ][1]= my - mpy;
5605                         }
5606                         mv_cache[ 0 ][0]= mx;
5607                         mv_cache[ 0 ][1]= my;
5608
5609                         mvd_cache[ 0 ][0]= mx - mpx;
5610                         mvd_cache[ 0 ][1]= my - mpy;
5611                     }
5612                 }else{
5613                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5614                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5615                     p[0] = p[1] = p[8] = p[9] = 0;
5616                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5617                 }
5618             }
5619         }
5620     } else if( IS_DIRECT(mb_type) ) {
5621         pred_direct_motion(h, &mb_type);
5622         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5623         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5624         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5625     } else {
5626         int list, mx, my, i, mpx, mpy;
5627         if(IS_16X16(mb_type)){
5628             for(list=0; list<h->list_count; list++){
5629                 if(IS_DIR(mb_type, 0, list)){
5630                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5631                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5632                 }else
5633                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5634             }
5635             for(list=0; list<h->list_count; list++){
5636                 if(IS_DIR(mb_type, 0, list)){
5637                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5638
5639                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5640                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5641                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5642
5643                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5644                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5645                 }else
5646                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5647             }
5648         }
5649         else if(IS_16X8(mb_type)){
5650             for(list=0; list<h->list_count; list++){
5651                     for(i=0; i<2; i++){
5652                         if(IS_DIR(mb_type, i, list)){
5653                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5654                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5655                         }else
5656                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5657                     }
5658             }
5659             for(list=0; list<h->list_count; list++){
5660                 for(i=0; i<2; i++){
5661                     if(IS_DIR(mb_type, i, list)){
5662                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5663                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5664                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5665                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5666
5667                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5668                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5669                     }else{
5670                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5671                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5672                     }
5673                 }
5674             }
5675         }else{
5676             assert(IS_8X16(mb_type));
5677             for(list=0; list<h->list_count; list++){
5678                     for(i=0; i<2; i++){
5679                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5680                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5681                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5682                         }else
5683                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5684                     }
5685             }
5686             for(list=0; list<h->list_count; list++){
5687                 for(i=0; i<2; i++){
5688                     if(IS_DIR(mb_type, i, list)){
5689                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5690                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5691                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5692
5693                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5694                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5695                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5696                     }else{
5697                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5698                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5699                     }
5700                 }
5701             }
5702         }
5703     }
5704
5705    if( IS_INTER( mb_type ) ) {
5706         h->chroma_pred_mode_table[mb_xy] = 0;
5707         write_back_motion( h, mb_type );
5708    }
5709
5710     if( !IS_INTRA16x16( mb_type ) ) {
5711         cbp  = decode_cabac_mb_cbp_luma( h );
5712         if(CHROMA)
5713             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5714     }
5715
5716     h->cbp_table[mb_xy] = h->cbp = cbp;
5717
5718     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5719         if( decode_cabac_mb_transform_size( h ) )
5720             mb_type |= MB_TYPE_8x8DCT;
5721     }
5722     s->current_picture.mb_type[mb_xy]= mb_type;
5723
5724     if( cbp || IS_INTRA16x16( mb_type ) ) {
5725         const uint8_t *scan, *scan8x8, *dc_scan;
5726         const uint32_t *qmul;
5727         int dqp;
5728
5729         if(IS_INTERLACED(mb_type)){
5730             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5731             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5732             dc_scan= luma_dc_field_scan;
5733         }else{
5734             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5735             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5736             dc_scan= luma_dc_zigzag_scan;
5737         }
5738
5739         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5740         if( dqp == INT_MIN ){
5741             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5742             return -1;
5743         }
5744         s->qscale += dqp;
5745         if(((unsigned)s->qscale) > 51){
5746             if(s->qscale<0) s->qscale+= 52;
5747             else            s->qscale-= 52;
5748         }
5749         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5750         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5751
5752         if( IS_INTRA16x16( mb_type ) ) {
5753             int i;
5754             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5755             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5756
5757             if( cbp&15 ) {
5758                 qmul = h->dequant4_coeff[0][s->qscale];
5759                 for( i = 0; i < 16; i++ ) {
5760                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5761                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5762                 }
5763             } else {
5764                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5765             }
5766         } else {
5767             int i8x8, i4x4;
5768             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5769                 if( cbp & (1<<i8x8) ) {
5770                     if( IS_8x8DCT(mb_type) ) {
5771                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5772                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5773                     } else {
5774                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5775                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5776                             const int index = 4*i8x8 + i4x4;
5777                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5778 //START_TIMER
5779                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5780 //STOP_TIMER("decode_residual")
5781                         }
5782                     }
5783                 } else {
5784                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5785                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5786                 }
5787             }
5788         }
5789
5790         if( cbp&0x30 ){
5791             int c;
5792             for( c = 0; c < 2; c++ ) {
5793                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5794                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5795             }
5796         }
5797
5798         if( cbp&0x20 ) {
5799             int c, i;
5800             for( c = 0; c < 2; c++ ) {
5801                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5802                 for( i = 0; i < 4; i++ ) {
5803                     const int index = 16 + 4 * c + i;
5804                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5805                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5806                 }
5807             }
5808         } else {
5809             uint8_t * const nnz= &h->non_zero_count_cache[0];
5810             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5811             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5812         }
5813     } else {
5814         uint8_t * const nnz= &h->non_zero_count_cache[0];
5815         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5816         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5817         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5818         h->last_qscale_diff = 0;
5819     }
5820
5821     s->current_picture.qscale_table[mb_xy]= s->qscale;
5822     write_back_non_zero_count(h);
5823
5824     if(MB_MBAFF){
5825         h->ref_count[0] >>= 1;
5826         h->ref_count[1] >>= 1;
5827     }
5828
5829     return 0;
5830 }
5831
5832
5833 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5834     int i, d;
5835     const int index_a = qp + h->slice_alpha_c0_offset;
5836     const int alpha = (alpha_table+52)[index_a];
5837     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5838
5839     if( bS[0] < 4 ) {
5840         int8_t tc[4];
5841         for(i=0; i<4; i++)
5842             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5843         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5844     } else {
5845         /* 16px edge length, because bS=4 is triggered by being at
5846          * the edge of an intra MB, so all 4 bS are the same */
5847             for( d = 0; d < 16; d++ ) {
5848                 const int p0 = pix[-1];
5849                 const int p1 = pix[-2];
5850                 const int p2 = pix[-3];
5851
5852                 const int q0 = pix[0];
5853                 const int q1 = pix[1];
5854                 const int q2 = pix[2];
5855
5856                 if( FFABS( p0 - q0 ) < alpha &&
5857                     FFABS( p1 - p0 ) < beta &&
5858                     FFABS( q1 - q0 ) < beta ) {
5859
5860                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5861                         if( FFABS( p2 - p0 ) < beta)
5862                         {
5863                             const int p3 = pix[-4];
5864                             /* p0', p1', p2' */
5865                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5866                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5867                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5868                         } else {
5869                             /* p0' */
5870                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5871                         }
5872                         if( FFABS( q2 - q0 ) < beta)
5873                         {
5874                             const int q3 = pix[3];
5875                             /* q0', q1', q2' */
5876                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5877                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5878                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5879                         } else {
5880                             /* q0' */
5881                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5882                         }
5883                     }else{
5884                         /* p0', q0' */
5885                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5886                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5887                     }
5888                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5889                 }
5890                 pix += stride;
5891             }
5892     }
5893 }
5894 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5895     int i;
5896     const int index_a = qp + h->slice_alpha_c0_offset;
5897     const int alpha = (alpha_table+52)[index_a];
5898     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5899
5900     if( bS[0] < 4 ) {
5901         int8_t tc[4];
5902         for(i=0; i<4; i++)
5903             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5904         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5905     } else {
5906         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5907     }
5908 }
5909
5910 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5911     int i;
5912     for( i = 0; i < 16; i++, pix += stride) {
5913         int index_a;
5914         int alpha;
5915         int beta;
5916
5917         int qp_index;
5918         int bS_index = (i >> 1);
5919         if (!MB_FIELD) {
5920             bS_index &= ~1;
5921             bS_index |= (i & 1);
5922         }
5923
5924         if( bS[bS_index] == 0 ) {
5925             continue;
5926         }
5927
5928         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5929         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5930         alpha = (alpha_table+52)[index_a];
5931         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5932
5933         if( bS[bS_index] < 4 ) {
5934             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5935             const int p0 = pix[-1];
5936             const int p1 = pix[-2];
5937             const int p2 = pix[-3];
5938             const int q0 = pix[0];
5939             const int q1 = pix[1];
5940             const int q2 = pix[2];
5941
5942             if( FFABS( p0 - q0 ) < alpha &&
5943                 FFABS( p1 - p0 ) < beta &&
5944                 FFABS( q1 - q0 ) < beta ) {
5945                 int tc = tc0;
5946                 int i_delta;
5947
5948                 if( FFABS( p2 - p0 ) < beta ) {
5949                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5950                     tc++;
5951                 }
5952                 if( FFABS( q2 - q0 ) < beta ) {
5953                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5954                     tc++;
5955                 }
5956
5957                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5958                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5959                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5960                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5961             }
5962         }else{
5963             const int p0 = pix[-1];
5964             const int p1 = pix[-2];
5965             const int p2 = pix[-3];
5966
5967             const int q0 = pix[0];
5968             const int q1 = pix[1];
5969             const int q2 = pix[2];
5970
5971             if( FFABS( p0 - q0 ) < alpha &&
5972                 FFABS( p1 - p0 ) < beta &&
5973                 FFABS( q1 - q0 ) < beta ) {
5974
5975                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5976                     if( FFABS( p2 - p0 ) < beta)
5977                     {
5978                         const int p3 = pix[-4];
5979                         /* p0', p1', p2' */
5980                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5981                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5982                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5983                     } else {
5984                         /* p0' */
5985                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5986                     }
5987                     if( FFABS( q2 - q0 ) < beta)
5988                     {
5989                         const int q3 = pix[3];
5990                         /* q0', q1', q2' */
5991                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5992                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5993                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5994                     } else {
5995                         /* q0' */
5996                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5997                     }
5998                 }else{
5999                     /* p0', q0' */
6000                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6001                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6002                 }
6003                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6004             }
6005         }
6006     }
6007 }
6008 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6009     int i;
6010     for( i = 0; i < 8; i++, pix += stride) {
6011         int index_a;
6012         int alpha;
6013         int beta;
6014
6015         int qp_index;
6016         int bS_index = i;
6017
6018         if( bS[bS_index] == 0 ) {
6019             continue;
6020         }
6021
6022         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6023         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6024         alpha = (alpha_table+52)[index_a];
6025         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6026
6027         if( bS[bS_index] < 4 ) {
6028             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6029             const int p0 = pix[-1];
6030             const int p1 = pix[-2];
6031             const int q0 = pix[0];
6032             const int q1 = pix[1];
6033
6034             if( FFABS( p0 - q0 ) < alpha &&
6035                 FFABS( p1 - p0 ) < beta &&
6036                 FFABS( q1 - q0 ) < beta ) {
6037                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6038
6039                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6040                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6041                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6042             }
6043         }else{
6044             const int p0 = pix[-1];
6045             const int p1 = pix[-2];
6046             const int q0 = pix[0];
6047             const int q1 = pix[1];
6048
6049             if( FFABS( p0 - q0 ) < alpha &&
6050                 FFABS( p1 - p0 ) < beta &&
6051                 FFABS( q1 - q0 ) < beta ) {
6052
6053                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6054                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6055                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6056             }
6057         }
6058     }
6059 }
6060
6061 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6062     int i, d;
6063     const int index_a = qp + h->slice_alpha_c0_offset;
6064     const int alpha = (alpha_table+52)[index_a];
6065     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6066     const int pix_next  = stride;
6067
6068     if( bS[0] < 4 ) {
6069         int8_t tc[4];
6070         for(i=0; i<4; i++)
6071             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6072         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6073     } else {
6074         /* 16px edge length, see filter_mb_edgev */
6075             for( d = 0; d < 16; d++ ) {
6076                 const int p0 = pix[-1*pix_next];
6077                 const int p1 = pix[-2*pix_next];
6078                 const int p2 = pix[-3*pix_next];
6079                 const int q0 = pix[0];
6080                 const int q1 = pix[1*pix_next];
6081                 const int q2 = pix[2*pix_next];
6082
6083                 if( FFABS( p0 - q0 ) < alpha &&
6084                     FFABS( p1 - p0 ) < beta &&
6085                     FFABS( q1 - q0 ) < beta ) {
6086
6087                     const int p3 = pix[-4*pix_next];
6088                     const int q3 = pix[ 3*pix_next];
6089
6090                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6091                         if( FFABS( p2 - p0 ) < beta) {
6092                             /* p0', p1', p2' */
6093                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6094                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6095                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6096                         } else {
6097                             /* p0' */
6098                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6099                         }
6100                         if( FFABS( q2 - q0 ) < beta) {
6101                             /* q0', q1', q2' */
6102                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6103                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6104                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6105                         } else {
6106                             /* q0' */
6107                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6108                         }
6109                     }else{
6110                         /* p0', q0' */
6111                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6112                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6113                     }
6114                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6115                 }
6116                 pix++;
6117             }
6118     }
6119 }
6120
6121 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6122     int i;
6123     const int index_a = qp + h->slice_alpha_c0_offset;
6124     const int alpha = (alpha_table+52)[index_a];
6125     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6126
6127     if( bS[0] < 4 ) {
6128         int8_t tc[4];
6129         for(i=0; i<4; i++)
6130             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6131         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6132     } else {
6133         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6134     }
6135 }
6136
6137 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6138     MpegEncContext * const s = &h->s;
6139     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6140     int mb_xy, mb_type;
6141     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6142
6143     mb_xy = h->mb_xy;
6144
6145     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6146         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6147        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6148                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6149         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6150         return;
6151     }
6152     assert(!FRAME_MBAFF);
6153
6154     mb_type = s->current_picture.mb_type[mb_xy];
6155     qp = s->current_picture.qscale_table[mb_xy];
6156     qp0 = s->current_picture.qscale_table[mb_xy-1];
6157     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6158     qpc = get_chroma_qp( h, 0, qp );
6159     qpc0 = get_chroma_qp( h, 0, qp0 );
6160     qpc1 = get_chroma_qp( h, 0, qp1 );
6161     qp0 = (qp + qp0 + 1) >> 1;
6162     qp1 = (qp + qp1 + 1) >> 1;
6163     qpc0 = (qpc + qpc0 + 1) >> 1;
6164     qpc1 = (qpc + qpc1 + 1) >> 1;
6165     qp_thresh = 15 - h->slice_alpha_c0_offset;
6166     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6167        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6168         return;
6169
6170     if( IS_INTRA(mb_type) ) {
6171         int16_t bS4[4] = {4,4,4,4};
6172         int16_t bS3[4] = {3,3,3,3};
6173         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6174         if( IS_8x8DCT(mb_type) ) {
6175             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6176             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6177             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6178             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6179         } else {
6180             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6181             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6182             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6183             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6184             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6185             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6186             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6187             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6188         }
6189         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6190         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6191         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6192         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6193         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6194         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6195         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6196         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6197         return;
6198     } else {
6199         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6200         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6201         int edges;
6202         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6203             edges = 4;
6204             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6205         } else {
6206             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6207                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6208             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6209                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6210                              ? 3 : 0;
6211             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6212             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6213             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6214                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6215         }
6216         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6217             bSv[0][0] = 0x0004000400040004ULL;
6218         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6219             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6220
6221 #define FILTER(hv,dir,edge)\
6222         if(bSv[dir][edge]) {\
6223             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6224             if(!(edge&1)) {\
6225                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6226                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6227             }\
6228         }
6229         if( edges == 1 ) {
6230             FILTER(v,0,0);
6231             FILTER(h,1,0);
6232         } else if( IS_8x8DCT(mb_type) ) {
6233             FILTER(v,0,0);
6234             FILTER(v,0,2);
6235             FILTER(h,1,0);
6236             FILTER(h,1,2);
6237         } else {
6238             FILTER(v,0,0);
6239             FILTER(v,0,1);
6240             FILTER(v,0,2);
6241             FILTER(v,0,3);
6242             FILTER(h,1,0);
6243             FILTER(h,1,1);
6244             FILTER(h,1,2);
6245             FILTER(h,1,3);
6246         }
6247 #undef FILTER
6248     }
6249 }
6250
6251
6252 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6253     MpegEncContext * const s = &h->s;
6254     int edge;
6255     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6256     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6257     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6258     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6259     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6260
6261     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6262                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6263     // how often to recheck mv-based bS when iterating between edges
6264     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6265                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6266     // how often to recheck mv-based bS when iterating along each edge
6267     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6268
6269     if (first_vertical_edge_done) {
6270         start = 1;
6271     }
6272
6273     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6274         start = 1;
6275
6276     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6277         && !IS_INTERLACED(mb_type)
6278         && IS_INTERLACED(mbm_type)
6279         ) {
6280         // This is a special case in the norm where the filtering must
6281         // be done twice (one each of the field) even if we are in a
6282         // frame macroblock.
6283         //
6284         static const int nnz_idx[4] = {4,5,6,3};
6285         unsigned int tmp_linesize   = 2 *   linesize;
6286         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6287         int mbn_xy = mb_xy - 2 * s->mb_stride;
6288         int qp;
6289         int i, j;
6290         int16_t bS[4];
6291
6292         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6293             if( IS_INTRA(mb_type) ||
6294                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6295                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6296             } else {
6297                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6298                 for( i = 0; i < 4; i++ ) {
6299                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6300                         mbn_nnz[nnz_idx[i]] != 0 )
6301                         bS[i] = 2;
6302                     else
6303                         bS[i] = 1;
6304                 }
6305             }
6306             // Do not use s->qscale as luma quantizer because it has not the same
6307             // value in IPCM macroblocks.
6308             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6309             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6310             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6311             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6312             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6313                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6314             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6315                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6316         }
6317
6318         start = 1;
6319     }
6320
6321     /* Calculate bS */
6322     for( edge = start; edge < edges; edge++ ) {
6323         /* mbn_xy: neighbor macroblock */
6324         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6325         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6326         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6327         int16_t bS[4];
6328         int qp;
6329
6330         if( (edge&1) && IS_8x8DCT(mb_type) )
6331             continue;
6332
6333         if( IS_INTRA(mb_type) ||
6334             IS_INTRA(mbn_type) ) {
6335             int value;
6336             if (edge == 0) {
6337                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6338                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6339                 ) {
6340                     value = 4;
6341                 } else {
6342                     value = 3;
6343                 }
6344             } else {
6345                 value = 3;
6346             }
6347             bS[0] = bS[1] = bS[2] = bS[3] = value;
6348         } else {
6349             int i, l;
6350             int mv_done;
6351
6352             if( edge & mask_edge ) {
6353                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6354                 mv_done = 1;
6355             }
6356             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6357                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6358                 mv_done = 1;
6359             }
6360             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6361                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6362                 int bn_idx= b_idx - (dir ? 8:1);
6363                 int v = 0;
6364
6365                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6366                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6367                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6368                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6369                 }
6370
6371                 if(h->slice_type_nos == FF_B_TYPE && v){
6372                     v=0;
6373                     for( l = 0; !v && l < 2; l++ ) {
6374                         int ln= 1-l;
6375                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6376                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6377                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6378                     }
6379                 }
6380
6381                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6382                 mv_done = 1;
6383             }
6384             else
6385                 mv_done = 0;
6386
6387             for( i = 0; i < 4; i++ ) {
6388                 int x = dir == 0 ? edge : i;
6389                 int y = dir == 0 ? i    : edge;
6390                 int b_idx= 8 + 4 + x + 8*y;
6391                 int bn_idx= b_idx - (dir ? 8:1);
6392
6393                 if( h->non_zero_count_cache[b_idx] |
6394                     h->non_zero_count_cache[bn_idx] ) {
6395                     bS[i] = 2;
6396                 }
6397                 else if(!mv_done)
6398                 {
6399                     bS[i] = 0;
6400                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6401                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6402                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6403                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6404                             bS[i] = 1;
6405                             break;
6406                         }
6407                     }
6408
6409                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6410                         bS[i] = 0;
6411                         for( l = 0; l < 2; l++ ) {
6412                             int ln= 1-l;
6413                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6414                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6415                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6416                                 bS[i] = 1;
6417                                 break;
6418                             }
6419                         }
6420                     }
6421                 }
6422             }
6423
6424             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6425                 continue;
6426         }
6427
6428         /* Filter edge */
6429         // Do not use s->qscale as luma quantizer because it has not the same
6430         // value in IPCM macroblocks.
6431         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6432         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6433         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6434         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6435         if( dir == 0 ) {
6436             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6437             if( (edge&1) == 0 ) {
6438                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6439                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6440                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6441                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6442             }
6443         } else {
6444             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6445             if( (edge&1) == 0 ) {
6446                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6447                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6448                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6449                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6450             }
6451         }
6452     }
6453 }
6454
6455 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6456     MpegEncContext * const s = &h->s;
6457     const int mb_xy= mb_x + mb_y*s->mb_stride;
6458     const int mb_type = s->current_picture.mb_type[mb_xy];
6459     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6460     int first_vertical_edge_done = 0;
6461     int dir;
6462
6463     //for sufficiently low qp, filtering wouldn't do anything
6464     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6465     if(!FRAME_MBAFF){
6466         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6467         int qp = s->current_picture.qscale_table[mb_xy];
6468         if(qp <= qp_thresh
6469            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6470            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6471             return;
6472         }
6473     }
6474
6475     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6476     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6477         int top_type, left_type[2];
6478         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6479         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6480         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6481
6482         if(IS_8x8DCT(top_type)){
6483             h->non_zero_count_cache[4+8*0]=
6484             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6485             h->non_zero_count_cache[6+8*0]=
6486             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6487         }
6488         if(IS_8x8DCT(left_type[0])){
6489             h->non_zero_count_cache[3+8*1]=
6490             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6491         }
6492         if(IS_8x8DCT(left_type[1])){
6493             h->non_zero_count_cache[3+8*3]=
6494             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6495         }
6496
6497         if(IS_8x8DCT(mb_type)){
6498             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6499             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6500
6501             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6502             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6503
6504             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6505             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6506
6507             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6508             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6509         }
6510     }
6511
6512     if (FRAME_MBAFF
6513             // left mb is in picture
6514             && h->slice_table[mb_xy-1] != 0xFFFF
6515             // and current and left pair do not have the same interlaced type
6516             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6517             // and left mb is in the same slice if deblocking_filter == 2
6518             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6519         /* First vertical edge is different in MBAFF frames
6520          * There are 8 different bS to compute and 2 different Qp
6521          */
6522         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6523         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6524         int16_t bS[8];
6525         int qp[2];
6526         int bqp[2];
6527         int rqp[2];
6528         int mb_qp, mbn0_qp, mbn1_qp;
6529         int i;
6530         first_vertical_edge_done = 1;
6531
6532         if( IS_INTRA(mb_type) )
6533             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6534         else {
6535             for( i = 0; i < 8; i++ ) {
6536                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6537
6538                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6539                     bS[i] = 4;
6540                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6541                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6542                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6543                                                                        :
6544                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6545                     bS[i] = 2;
6546                 else
6547                     bS[i] = 1;
6548             }
6549         }
6550
6551         mb_qp = s->current_picture.qscale_table[mb_xy];
6552         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6553         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6554         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6555         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6556                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6557         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6558                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6559         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6560         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6561                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6562         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6563                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6564
6565         /* Filter edge */
6566         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6567         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6568         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6569         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6570         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6571     }
6572
6573 #ifdef CONFIG_SMALL
6574     for( dir = 0; dir < 2; dir++ )
6575         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6576 #else
6577     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6578     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6579 #endif
6580 }
6581
6582 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6583     H264Context *h = *(void**)arg;
6584     MpegEncContext * const s = &h->s;
6585     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6586
6587     s->mb_skip_run= -1;
6588
6589     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6590                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6591
6592     if( h->pps.cabac ) {
6593         int i;
6594
6595         /* realign */
6596         align_get_bits( &s->gb );
6597
6598         /* init cabac */
6599         ff_init_cabac_states( &h->cabac);
6600         ff_init_cabac_decoder( &h->cabac,
6601                                s->gb.buffer + get_bits_count(&s->gb)/8,
6602                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6603         /* calculate pre-state */
6604         for( i= 0; i < 460; i++ ) {
6605             int pre;
6606             if( h->slice_type_nos == FF_I_TYPE )
6607                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6608             else
6609                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6610
6611             if( pre <= 63 )
6612                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6613             else
6614                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6615         }
6616
6617         for(;;){
6618 //START_TIMER
6619             int ret = decode_mb_cabac(h);
6620             int eos;
6621 //STOP_TIMER("decode_mb_cabac")
6622
6623             if(ret>=0) hl_decode_mb(h);
6624
6625             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6626                 s->mb_y++;
6627
6628                 if(ret>=0) ret = decode_mb_cabac(h);
6629
6630                 if(ret>=0) hl_decode_mb(h);
6631                 s->mb_y--;
6632             }
6633             eos = get_cabac_terminate( &h->cabac );
6634
6635             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6636                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6637                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6638                 return -1;
6639             }
6640
6641             if( ++s->mb_x >= s->mb_width ) {
6642                 s->mb_x = 0;
6643                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6644                 ++s->mb_y;
6645                 if(FIELD_OR_MBAFF_PICTURE) {
6646                     ++s->mb_y;
6647                 }
6648             }
6649
6650             if( eos || s->mb_y >= s->mb_height ) {
6651                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6652                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6653                 return 0;
6654             }
6655         }
6656
6657     } else {
6658         for(;;){
6659             int ret = decode_mb_cavlc(h);
6660
6661             if(ret>=0) hl_decode_mb(h);
6662
6663             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6664                 s->mb_y++;
6665                 ret = decode_mb_cavlc(h);
6666
6667                 if(ret>=0) hl_decode_mb(h);
6668                 s->mb_y--;
6669             }
6670
6671             if(ret<0){
6672                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6673                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6674
6675                 return -1;
6676             }
6677
6678             if(++s->mb_x >= s->mb_width){
6679                 s->mb_x=0;
6680                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6681                 ++s->mb_y;
6682                 if(FIELD_OR_MBAFF_PICTURE) {
6683                     ++s->mb_y;
6684                 }
6685                 if(s->mb_y >= s->mb_height){
6686                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6687
6688                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6689                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6690
6691                         return 0;
6692                     }else{
6693                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6694
6695                         return -1;
6696                     }
6697                 }
6698             }
6699
6700             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6701                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6702                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6703                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6704
6705                     return 0;
6706                 }else{
6707                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6708
6709                     return -1;
6710                 }
6711             }
6712         }
6713     }
6714
6715 #if 0
6716     for(;s->mb_y < s->mb_height; s->mb_y++){
6717         for(;s->mb_x < s->mb_width; s->mb_x++){
6718             int ret= decode_mb(h);
6719
6720             hl_decode_mb(h);
6721
6722             if(ret<0){
6723                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6724                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6725
6726                 return -1;
6727             }
6728
6729             if(++s->mb_x >= s->mb_width){
6730                 s->mb_x=0;
6731                 if(++s->mb_y >= s->mb_height){
6732                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6733                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6734
6735                         return 0;
6736                     }else{
6737                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6738
6739                         return -1;
6740                     }
6741                 }
6742             }
6743
6744             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6745                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6746                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6747
6748                     return 0;
6749                 }else{
6750                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6751
6752                     return -1;
6753                 }
6754             }
6755         }
6756         s->mb_x=0;
6757         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6758     }
6759 #endif
6760     return -1; //not reached
6761 }
6762
6763 static int decode_picture_timing(H264Context *h){
6764     MpegEncContext * const s = &h->s;
6765     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6766         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6767         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6768     }
6769     if(h->sps.pic_struct_present_flag){
6770         unsigned int i, num_clock_ts;
6771         h->sei_pic_struct = get_bits(&s->gb, 4);
6772
6773         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6774             return -1;
6775
6776         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6777
6778         for (i = 0 ; i < num_clock_ts ; i++){
6779             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6780                 unsigned int full_timestamp_flag;
6781                 skip_bits(&s->gb, 2);                 /* ct_type */
6782                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6783                 skip_bits(&s->gb, 5);                 /* counting_type */
6784                 full_timestamp_flag = get_bits(&s->gb, 1);
6785                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6786                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6787                 skip_bits(&s->gb, 8);                 /* n_frames */
6788                 if(full_timestamp_flag){
6789                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6790                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6791                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6792                 }else{
6793                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6794                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6795                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6796                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6797                             if(get_bits(&s->gb, 1))   /* hours_flag */
6798                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6799                         }
6800                     }
6801                 }
6802                 if(h->sps.time_offset_length > 0)
6803                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6804             }
6805         }
6806     }
6807     return 0;
6808 }
6809
6810 static int decode_unregistered_user_data(H264Context *h, int size){
6811     MpegEncContext * const s = &h->s;
6812     uint8_t user_data[16+256];
6813     int e, build, i;
6814
6815     if(size<16)
6816         return -1;
6817
6818     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6819         user_data[i]= get_bits(&s->gb, 8);
6820     }
6821
6822     user_data[i]= 0;
6823     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6824     if(e==1 && build>=0)
6825         h->x264_build= build;
6826
6827     if(s->avctx->debug & FF_DEBUG_BUGS)
6828         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6829
6830     for(; i<size; i++)
6831         skip_bits(&s->gb, 8);
6832
6833     return 0;
6834 }
6835
6836 static int decode_sei(H264Context *h){
6837     MpegEncContext * const s = &h->s;
6838
6839     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6840         int size, type;
6841
6842         type=0;
6843         do{
6844             type+= show_bits(&s->gb, 8);
6845         }while(get_bits(&s->gb, 8) == 255);
6846
6847         size=0;
6848         do{
6849             size+= show_bits(&s->gb, 8);
6850         }while(get_bits(&s->gb, 8) == 255);
6851
6852         switch(type){
6853         case 1: // Picture timing SEI
6854             if(decode_picture_timing(h) < 0)
6855                 return -1;
6856             break;
6857         case 5:
6858             if(decode_unregistered_user_data(h, size) < 0)
6859                 return -1;
6860             break;
6861         default:
6862             skip_bits(&s->gb, 8*size);
6863         }
6864
6865         //FIXME check bits here
6866         align_get_bits(&s->gb);
6867     }
6868
6869     return 0;
6870 }
6871
6872 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6873     MpegEncContext * const s = &h->s;
6874     int cpb_count, i;
6875     cpb_count = get_ue_golomb(&s->gb) + 1;
6876
6877     if(cpb_count > 32U){
6878         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6879         return -1;
6880     }
6881
6882     get_bits(&s->gb, 4); /* bit_rate_scale */
6883     get_bits(&s->gb, 4); /* cpb_size_scale */
6884     for(i=0; i<cpb_count; i++){
6885         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6886         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6887         get_bits1(&s->gb);     /* cbr_flag */
6888     }
6889     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6890     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6891     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6892     sps->time_offset_length = get_bits(&s->gb, 5);
6893     return 0;
6894 }
6895
6896 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6897     MpegEncContext * const s = &h->s;
6898     int aspect_ratio_info_present_flag;
6899     unsigned int aspect_ratio_idc;
6900
6901     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6902
6903     if( aspect_ratio_info_present_flag ) {
6904         aspect_ratio_idc= get_bits(&s->gb, 8);
6905         if( aspect_ratio_idc == EXTENDED_SAR ) {
6906             sps->sar.num= get_bits(&s->gb, 16);
6907             sps->sar.den= get_bits(&s->gb, 16);
6908         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6909             sps->sar=  pixel_aspect[aspect_ratio_idc];
6910         }else{
6911             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6912             return -1;
6913         }
6914     }else{
6915         sps->sar.num=
6916         sps->sar.den= 0;
6917     }
6918 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6919
6920     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6921         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6922     }
6923
6924     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6925         get_bits(&s->gb, 3);    /* video_format */
6926         get_bits1(&s->gb);      /* video_full_range_flag */
6927         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6928             get_bits(&s->gb, 8); /* colour_primaries */
6929             get_bits(&s->gb, 8); /* transfer_characteristics */
6930             get_bits(&s->gb, 8); /* matrix_coefficients */
6931         }
6932     }
6933
6934     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6935         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6936         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6937     }
6938
6939     sps->timing_info_present_flag = get_bits1(&s->gb);
6940     if(sps->timing_info_present_flag){
6941         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6942         sps->time_scale = get_bits_long(&s->gb, 32);
6943         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6944     }
6945
6946     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6947     if(sps->nal_hrd_parameters_present_flag)
6948         if(decode_hrd_parameters(h, sps) < 0)
6949             return -1;
6950     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6951     if(sps->vcl_hrd_parameters_present_flag)
6952         if(decode_hrd_parameters(h, sps) < 0)
6953             return -1;
6954     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6955         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6956     sps->pic_struct_present_flag = get_bits1(&s->gb);
6957
6958     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6959     if(sps->bitstream_restriction_flag){
6960         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6961         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6962         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6963         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6964         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6965         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6966         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6967
6968         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6969             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6970             return -1;
6971         }
6972     }
6973
6974     return 0;
6975 }
6976
6977 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6978                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6979     MpegEncContext * const s = &h->s;
6980     int i, last = 8, next = 8;
6981     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6982     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6983         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6984     else
6985     for(i=0;i<size;i++){
6986         if(next)
6987             next = (last + get_se_golomb(&s->gb)) & 0xff;
6988         if(!i && !next){ /* matrix not written, we use the preset one */
6989             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6990             break;
6991         }
6992         last = factors[scan[i]] = next ? next : last;
6993     }
6994 }
6995
6996 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6997                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6998     MpegEncContext * const s = &h->s;
6999     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7000     const uint8_t *fallback[4] = {
7001         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7002         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7003         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7004         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7005     };
7006     if(get_bits1(&s->gb)){
7007         sps->scaling_matrix_present |= is_sps;
7008         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7009         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7010         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7011         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7012         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7013         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7014         if(is_sps || pps->transform_8x8_mode){
7015             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7016             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7017         }
7018     }
7019 }
7020
7021 static inline int decode_seq_parameter_set(H264Context *h){
7022     MpegEncContext * const s = &h->s;
7023     int profile_idc, level_idc;
7024     unsigned int sps_id;
7025     int i;
7026     SPS *sps;
7027
7028     profile_idc= get_bits(&s->gb, 8);
7029     get_bits1(&s->gb);   //constraint_set0_flag
7030     get_bits1(&s->gb);   //constraint_set1_flag
7031     get_bits1(&s->gb);   //constraint_set2_flag
7032     get_bits1(&s->gb);   //constraint_set3_flag
7033     get_bits(&s->gb, 4); // reserved
7034     level_idc= get_bits(&s->gb, 8);
7035     sps_id= get_ue_golomb(&s->gb);
7036
7037     if(sps_id >= MAX_SPS_COUNT) {
7038         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7039         return -1;
7040     }
7041     sps= av_mallocz(sizeof(SPS));
7042     if(sps == NULL)
7043         return -1;
7044
7045     sps->profile_idc= profile_idc;
7046     sps->level_idc= level_idc;
7047
7048     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7049     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7050     sps->scaling_matrix_present = 0;
7051
7052     if(sps->profile_idc >= 100){ //high profile
7053         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7054         if(sps->chroma_format_idc == 3)
7055             get_bits1(&s->gb);  //residual_color_transform_flag
7056         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7057         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7058         sps->transform_bypass = get_bits1(&s->gb);
7059         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7060     }else{
7061         sps->chroma_format_idc= 1;
7062     }
7063
7064     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7065     sps->poc_type= get_ue_golomb(&s->gb);
7066
7067     if(sps->poc_type == 0){ //FIXME #define
7068         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7069     } else if(sps->poc_type == 1){//FIXME #define
7070         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7071         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7072         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7073         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7074
7075         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7076             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7077             goto fail;
7078         }
7079
7080         for(i=0; i<sps->poc_cycle_length; i++)
7081             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7082     }else if(sps->poc_type != 2){
7083         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7084         goto fail;
7085     }
7086
7087     sps->ref_frame_count= get_ue_golomb(&s->gb);
7088     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7089         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7090         goto fail;
7091     }
7092     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7093     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7094     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7095     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7096        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7097         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7098         goto fail;
7099     }
7100
7101     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7102     if(!sps->frame_mbs_only_flag)
7103         sps->mb_aff= get_bits1(&s->gb);
7104     else
7105         sps->mb_aff= 0;
7106
7107     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7108
7109 #ifndef ALLOW_INTERLACE
7110     if(sps->mb_aff)
7111         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7112 #endif
7113     sps->crop= get_bits1(&s->gb);
7114     if(sps->crop){
7115         sps->crop_left  = get_ue_golomb(&s->gb);
7116         sps->crop_right = get_ue_golomb(&s->gb);
7117         sps->crop_top   = get_ue_golomb(&s->gb);
7118         sps->crop_bottom= get_ue_golomb(&s->gb);
7119         if(sps->crop_left || sps->crop_top){
7120             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7121         }
7122         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7123             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7124         }
7125     }else{
7126         sps->crop_left  =
7127         sps->crop_right =
7128         sps->crop_top   =
7129         sps->crop_bottom= 0;
7130     }
7131
7132     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7133     if( sps->vui_parameters_present_flag )
7134         decode_vui_parameters(h, sps);
7135
7136     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7137         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7138                sps_id, sps->profile_idc, sps->level_idc,
7139                sps->poc_type,
7140                sps->ref_frame_count,
7141                sps->mb_width, sps->mb_height,
7142                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7143                sps->direct_8x8_inference_flag ? "8B8" : "",
7144                sps->crop_left, sps->crop_right,
7145                sps->crop_top, sps->crop_bottom,
7146                sps->vui_parameters_present_flag ? "VUI" : "",
7147                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7148                );
7149     }
7150     av_free(h->sps_buffers[sps_id]);
7151     h->sps_buffers[sps_id]= sps;
7152     return 0;
7153 fail:
7154     av_free(sps);
7155     return -1;
7156 }
7157
7158 static void
7159 build_qp_table(PPS *pps, int t, int index)
7160 {
7161     int i;
7162     for(i = 0; i < 52; i++)
7163         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7164 }
7165
7166 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7167     MpegEncContext * const s = &h->s;
7168     unsigned int pps_id= get_ue_golomb(&s->gb);
7169     PPS *pps;
7170
7171     if(pps_id >= MAX_PPS_COUNT) {
7172         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7173         return -1;
7174     }
7175
7176     pps= av_mallocz(sizeof(PPS));
7177     if(pps == NULL)
7178         return -1;
7179     pps->sps_id= get_ue_golomb(&s->gb);
7180     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7181         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7182         goto fail;
7183     }
7184
7185     pps->cabac= get_bits1(&s->gb);
7186     pps->pic_order_present= get_bits1(&s->gb);
7187     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7188     if(pps->slice_group_count > 1 ){
7189         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7190         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7191         switch(pps->mb_slice_group_map_type){
7192         case 0:
7193 #if 0
7194 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7195 |    run_length[ i ]                                |1  |ue(v)   |
7196 #endif
7197             break;
7198         case 2:
7199 #if 0
7200 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7201 |{                                                  |   |        |
7202 |    top_left_mb[ i ]                               |1  |ue(v)   |
7203 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7204 |   }                                               |   |        |
7205 #endif
7206             break;
7207         case 3:
7208         case 4:
7209         case 5:
7210 #if 0
7211 |   slice_group_change_direction_flag               |1  |u(1)    |
7212 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7213 #endif
7214             break;
7215         case 6:
7216 #if 0
7217 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7218 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7219 |)                                                  |   |        |
7220 |    slice_group_id[ i ]                            |1  |u(v)    |
7221 #endif
7222             break;
7223         }
7224     }
7225     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7226     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7227     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7228         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7229         goto fail;
7230     }
7231
7232     pps->weighted_pred= get_bits1(&s->gb);
7233     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7234     pps->init_qp= get_se_golomb(&s->gb) + 26;
7235     pps->init_qs= get_se_golomb(&s->gb) + 26;
7236     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7237     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7238     pps->constrained_intra_pred= get_bits1(&s->gb);
7239     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7240
7241     pps->transform_8x8_mode= 0;
7242     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7243     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7244     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7245
7246     if(get_bits_count(&s->gb) < bit_length){
7247         pps->transform_8x8_mode= get_bits1(&s->gb);
7248         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7249         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7250     } else {
7251         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7252     }
7253
7254     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7255     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7256     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7257         h->pps.chroma_qp_diff= 1;
7258
7259     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7260         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7261                pps_id, pps->sps_id,
7262                pps->cabac ? "CABAC" : "CAVLC",
7263                pps->slice_group_count,
7264                pps->ref_count[0], pps->ref_count[1],
7265                pps->weighted_pred ? "weighted" : "",
7266                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7267                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7268                pps->constrained_intra_pred ? "CONSTR" : "",
7269                pps->redundant_pic_cnt_present ? "REDU" : "",
7270                pps->transform_8x8_mode ? "8x8DCT" : ""
7271                );
7272     }
7273
7274     av_free(h->pps_buffers[pps_id]);
7275     h->pps_buffers[pps_id]= pps;
7276     return 0;
7277 fail:
7278     av_free(pps);
7279     return -1;
7280 }
7281
7282 /**
7283  * Call decode_slice() for each context.
7284  *
7285  * @param h h264 master context
7286  * @param context_count number of contexts to execute
7287  */
7288 static void execute_decode_slices(H264Context *h, int context_count){
7289     MpegEncContext * const s = &h->s;
7290     AVCodecContext * const avctx= s->avctx;
7291     H264Context *hx;
7292     int i;
7293
7294     if(context_count == 1) {
7295         decode_slice(avctx, &h);
7296     } else {
7297         for(i = 1; i < context_count; i++) {
7298             hx = h->thread_context[i];
7299             hx->s.error_recognition = avctx->error_recognition;
7300             hx->s.error_count = 0;
7301         }
7302
7303         avctx->execute(avctx, (void *)decode_slice,
7304                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7305
7306         /* pull back stuff from slices to master context */
7307         hx = h->thread_context[context_count - 1];
7308         s->mb_x = hx->s.mb_x;
7309         s->mb_y = hx->s.mb_y;
7310         s->dropable = hx->s.dropable;
7311         s->picture_structure = hx->s.picture_structure;
7312         for(i = 1; i < context_count; i++)
7313             h->s.error_count += h->thread_context[i]->s.error_count;
7314     }
7315 }
7316
7317
7318 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7319     MpegEncContext * const s = &h->s;
7320     AVCodecContext * const avctx= s->avctx;
7321     int buf_index=0;
7322     H264Context *hx; ///< thread context
7323     int context_count = 0;
7324
7325     h->max_contexts = avctx->thread_count;
7326 #if 0
7327     int i;
7328     for(i=0; i<50; i++){
7329         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7330     }
7331 #endif
7332     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7333         h->current_slice = 0;
7334         if (!s->first_field)
7335             s->current_picture_ptr= NULL;
7336     }
7337
7338     for(;;){
7339         int consumed;
7340         int dst_length;
7341         int bit_length;
7342         const uint8_t *ptr;
7343         int i, nalsize = 0;
7344         int err;
7345
7346         if(h->is_avc) {
7347             if(buf_index >= buf_size) break;
7348             nalsize = 0;
7349             for(i = 0; i < h->nal_length_size; i++)
7350                 nalsize = (nalsize << 8) | buf[buf_index++];
7351             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7352                 if(nalsize == 1){
7353                     buf_index++;
7354                     continue;
7355                 }else{
7356                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7357                     break;
7358                 }
7359             }
7360         } else {
7361             // start code prefix search
7362             for(; buf_index + 3 < buf_size; buf_index++){
7363                 // This should always succeed in the first iteration.
7364                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7365                     break;
7366             }
7367
7368             if(buf_index+3 >= buf_size) break;
7369
7370             buf_index+=3;
7371         }
7372
7373         hx = h->thread_context[context_count];
7374
7375         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7376         if (ptr==NULL || dst_length < 0){
7377             return -1;
7378         }
7379         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7380             dst_length--;
7381         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7382
7383         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7384             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7385         }
7386
7387         if (h->is_avc && (nalsize != consumed)){
7388             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7389             consumed= nalsize;
7390         }
7391
7392         buf_index += consumed;
7393
7394         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7395            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7396             continue;
7397
7398       again:
7399         err = 0;
7400         switch(hx->nal_unit_type){
7401         case NAL_IDR_SLICE:
7402             if (h->nal_unit_type != NAL_IDR_SLICE) {
7403                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7404                 return -1;
7405             }
7406             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7407         case NAL_SLICE:
7408             init_get_bits(&hx->s.gb, ptr, bit_length);
7409             hx->intra_gb_ptr=
7410             hx->inter_gb_ptr= &hx->s.gb;
7411             hx->s.data_partitioning = 0;
7412
7413             if((err = decode_slice_header(hx, h)))
7414                break;
7415
7416             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7417             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7418                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7419                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7420                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7421                && avctx->skip_frame < AVDISCARD_ALL)
7422                 context_count++;
7423             break;
7424         case NAL_DPA:
7425             init_get_bits(&hx->s.gb, ptr, bit_length);
7426             hx->intra_gb_ptr=
7427             hx->inter_gb_ptr= NULL;
7428             hx->s.data_partitioning = 1;
7429
7430             err = decode_slice_header(hx, h);
7431             break;
7432         case NAL_DPB:
7433             init_get_bits(&hx->intra_gb, ptr, bit_length);
7434             hx->intra_gb_ptr= &hx->intra_gb;
7435             break;
7436         case NAL_DPC:
7437             init_get_bits(&hx->inter_gb, ptr, bit_length);
7438             hx->inter_gb_ptr= &hx->inter_gb;
7439
7440             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7441                && s->context_initialized
7442                && s->hurry_up < 5
7443                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7444                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7445                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7446                && avctx->skip_frame < AVDISCARD_ALL)
7447                 context_count++;
7448             break;
7449         case NAL_SEI:
7450             init_get_bits(&s->gb, ptr, bit_length);
7451             decode_sei(h);
7452             break;
7453         case NAL_SPS:
7454             init_get_bits(&s->gb, ptr, bit_length);
7455             decode_seq_parameter_set(h);
7456
7457             if(s->flags& CODEC_FLAG_LOW_DELAY)
7458                 s->low_delay=1;
7459
7460             if(avctx->has_b_frames < 2)
7461                 avctx->has_b_frames= !s->low_delay;
7462             break;
7463         case NAL_PPS:
7464             init_get_bits(&s->gb, ptr, bit_length);
7465
7466             decode_picture_parameter_set(h, bit_length);
7467
7468             break;
7469         case NAL_AUD:
7470         case NAL_END_SEQUENCE:
7471         case NAL_END_STREAM:
7472         case NAL_FILLER_DATA:
7473         case NAL_SPS_EXT:
7474         case NAL_AUXILIARY_SLICE:
7475             break;
7476         default:
7477             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7478         }
7479
7480         if(context_count == h->max_contexts) {
7481             execute_decode_slices(h, context_count);
7482             context_count = 0;
7483         }
7484
7485         if (err < 0)
7486             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7487         else if(err == 1) {
7488             /* Slice could not be decoded in parallel mode, copy down
7489              * NAL unit stuff to context 0 and restart. Note that
7490              * rbsp_buffer is not transferred, but since we no longer
7491              * run in parallel mode this should not be an issue. */
7492             h->nal_unit_type = hx->nal_unit_type;
7493             h->nal_ref_idc   = hx->nal_ref_idc;
7494             hx = h;
7495             goto again;
7496         }
7497     }
7498     if(context_count)
7499         execute_decode_slices(h, context_count);
7500     return buf_index;
7501 }
7502
7503 /**
7504  * returns the number of bytes consumed for building the current frame
7505  */
7506 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7507         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7508         if(pos+10>buf_size) pos=buf_size; // oops ;)
7509
7510         return pos;
7511 }
7512
7513 static int decode_frame(AVCodecContext *avctx,
7514                              void *data, int *data_size,
7515                              const uint8_t *buf, int buf_size)
7516 {
7517     H264Context *h = avctx->priv_data;
7518     MpegEncContext *s = &h->s;
7519     AVFrame *pict = data;
7520     int buf_index;
7521
7522     s->flags= avctx->flags;
7523     s->flags2= avctx->flags2;
7524
7525    /* end of stream, output what is still in the buffers */
7526     if (buf_size == 0) {
7527         Picture *out;
7528         int i, out_idx;
7529
7530 //FIXME factorize this with the output code below
7531         out = h->delayed_pic[0];
7532         out_idx = 0;
7533         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7534             if(h->delayed_pic[i]->poc < out->poc){
7535                 out = h->delayed_pic[i];
7536                 out_idx = i;
7537             }
7538
7539         for(i=out_idx; h->delayed_pic[i]; i++)
7540             h->delayed_pic[i] = h->delayed_pic[i+1];
7541
7542         if(out){
7543             *data_size = sizeof(AVFrame);
7544             *pict= *(AVFrame*)out;
7545         }
7546
7547         return 0;
7548     }
7549
7550     if(h->is_avc && !h->got_avcC) {
7551         int i, cnt, nalsize;
7552         unsigned char *p = avctx->extradata;
7553         if(avctx->extradata_size < 7) {
7554             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7555             return -1;
7556         }
7557         if(*p != 1) {
7558             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7559             return -1;
7560         }
7561         /* sps and pps in the avcC always have length coded with 2 bytes,
7562            so put a fake nal_length_size = 2 while parsing them */
7563         h->nal_length_size = 2;
7564         // Decode sps from avcC
7565         cnt = *(p+5) & 0x1f; // Number of sps
7566         p += 6;
7567         for (i = 0; i < cnt; i++) {
7568             nalsize = AV_RB16(p) + 2;
7569             if(decode_nal_units(h, p, nalsize) < 0) {
7570                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7571                 return -1;
7572             }
7573             p += nalsize;
7574         }
7575         // Decode pps from avcC
7576         cnt = *(p++); // Number of pps
7577         for (i = 0; i < cnt; i++) {
7578             nalsize = AV_RB16(p) + 2;
7579             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7580                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7581                 return -1;
7582             }
7583             p += nalsize;
7584         }
7585         // Now store right nal length size, that will be use to parse all other nals
7586         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7587         // Do not reparse avcC
7588         h->got_avcC = 1;
7589     }
7590
7591     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7592         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7593             return -1;
7594         h->got_avcC = 1;
7595     }
7596
7597     buf_index=decode_nal_units(h, buf, buf_size);
7598     if(buf_index < 0)
7599         return -1;
7600
7601     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7602         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7603         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7604         return -1;
7605     }
7606
7607     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7608         Picture *out = s->current_picture_ptr;
7609         Picture *cur = s->current_picture_ptr;
7610         int i, pics, cross_idr, out_of_order, out_idx;
7611
7612         s->mb_y= 0;
7613
7614         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7615         s->current_picture_ptr->pict_type= s->pict_type;
7616
7617         if(!s->dropable) {
7618             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7619             h->prev_poc_msb= h->poc_msb;
7620             h->prev_poc_lsb= h->poc_lsb;
7621         }
7622         h->prev_frame_num_offset= h->frame_num_offset;
7623         h->prev_frame_num= h->frame_num;
7624
7625         /*
7626          * FIXME: Error handling code does not seem to support interlaced
7627          * when slices span multiple rows
7628          * The ff_er_add_slice calls don't work right for bottom
7629          * fields; they cause massive erroneous error concealing
7630          * Error marking covers both fields (top and bottom).
7631          * This causes a mismatched s->error_count
7632          * and a bad error table. Further, the error count goes to
7633          * INT_MAX when called for bottom field, because mb_y is
7634          * past end by one (callers fault) and resync_mb_y != 0
7635          * causes problems for the first MB line, too.
7636          */
7637         if (!FIELD_PICTURE)
7638             ff_er_frame_end(s);
7639
7640         MPV_frame_end(s);
7641
7642         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7643             /* Wait for second field. */
7644             *data_size = 0;
7645
7646         } else {
7647             cur->repeat_pict = 0;
7648
7649             /* Signal interlacing information externally. */
7650             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7651             if(h->sps.pic_struct_present_flag){
7652                 switch (h->sei_pic_struct)
7653                 {
7654                 case SEI_PIC_STRUCT_FRAME:
7655                     cur->interlaced_frame = 0;
7656                     break;
7657                 case SEI_PIC_STRUCT_TOP_FIELD:
7658                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7659                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7660                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7661                     cur->interlaced_frame = 1;
7662                     break;
7663                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7664                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7665                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7666                     // From these hints, let the applications decide if they apply deinterlacing.
7667                     cur->repeat_pict = 1;
7668                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7669                     break;
7670                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7671                     // Force progressive here, as doubling interlaced frame is a bad idea.
7672                     cur->interlaced_frame = 0;
7673                     cur->repeat_pict = 2;
7674                     break;
7675                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7676                     cur->interlaced_frame = 0;
7677                     cur->repeat_pict = 4;
7678                     break;
7679                 }
7680             }else{
7681                 /* Derive interlacing flag from used decoding process. */
7682                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7683             }
7684
7685             if (cur->field_poc[0] != cur->field_poc[1]){
7686                 /* Derive top_field_first from field pocs. */
7687                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7688             }else{
7689                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7690                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7691                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7692                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7693                         cur->top_field_first = 1;
7694                     else
7695                         cur->top_field_first = 0;
7696                 }else{
7697                     /* Most likely progressive */
7698                     cur->top_field_first = 0;
7699                 }
7700             }
7701
7702         //FIXME do something with unavailable reference frames
7703
7704             /* Sort B-frames into display order */
7705
7706             if(h->sps.bitstream_restriction_flag
7707                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7708                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7709                 s->low_delay = 0;
7710             }
7711
7712             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7713                && !h->sps.bitstream_restriction_flag){
7714                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7715                 s->low_delay= 0;
7716             }
7717
7718             pics = 0;
7719             while(h->delayed_pic[pics]) pics++;
7720
7721             assert(pics <= MAX_DELAYED_PIC_COUNT);
7722
7723             h->delayed_pic[pics++] = cur;
7724             if(cur->reference == 0)
7725                 cur->reference = DELAYED_PIC_REF;
7726
7727             out = h->delayed_pic[0];
7728             out_idx = 0;
7729             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7730                 if(h->delayed_pic[i]->poc < out->poc){
7731                     out = h->delayed_pic[i];
7732                     out_idx = i;
7733                 }
7734             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7735
7736             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7737
7738             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7739                 { }
7740             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7741                || (s->low_delay &&
7742                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7743                  || cur->pict_type == FF_B_TYPE)))
7744             {
7745                 s->low_delay = 0;
7746                 s->avctx->has_b_frames++;
7747             }
7748
7749             if(out_of_order || pics > s->avctx->has_b_frames){
7750                 out->reference &= ~DELAYED_PIC_REF;
7751                 for(i=out_idx; h->delayed_pic[i]; i++)
7752                     h->delayed_pic[i] = h->delayed_pic[i+1];
7753             }
7754             if(!out_of_order && pics > s->avctx->has_b_frames){
7755                 *data_size = sizeof(AVFrame);
7756
7757                 h->outputed_poc = out->poc;
7758                 *pict= *(AVFrame*)out;
7759             }else{
7760                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7761             }
7762         }
7763     }
7764
7765     assert(pict->data[0] || !*data_size);
7766     ff_print_debug_info(s, pict);
7767 //printf("out %d\n", (int)pict->data[0]);
7768 #if 0 //?
7769
7770     /* Return the Picture timestamp as the frame number */
7771     /* we subtract 1 because it is added on utils.c     */
7772     avctx->frame_number = s->picture_number - 1;
7773 #endif
7774     return get_consumed_bytes(s, buf_index, buf_size);
7775 }
7776 #if 0
7777 static inline void fill_mb_avail(H264Context *h){
7778     MpegEncContext * const s = &h->s;
7779     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7780
7781     if(s->mb_y){
7782         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7783         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7784         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7785     }else{
7786         h->mb_avail[0]=
7787         h->mb_avail[1]=
7788         h->mb_avail[2]= 0;
7789     }
7790     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7791     h->mb_avail[4]= 1; //FIXME move out
7792     h->mb_avail[5]= 0; //FIXME move out
7793 }
7794 #endif
7795
7796 #ifdef TEST
7797 #undef printf
7798 #undef random
7799 #define COUNT 8000
7800 #define SIZE (COUNT*40)
7801 int main(void){
7802     int i;
7803     uint8_t temp[SIZE];
7804     PutBitContext pb;
7805     GetBitContext gb;
7806 //    int int_temp[10000];
7807     DSPContext dsp;
7808     AVCodecContext avctx;
7809
7810     dsputil_init(&dsp, &avctx);
7811
7812     init_put_bits(&pb, temp, SIZE);
7813     printf("testing unsigned exp golomb\n");
7814     for(i=0; i<COUNT; i++){
7815         START_TIMER
7816         set_ue_golomb(&pb, i);
7817         STOP_TIMER("set_ue_golomb");
7818     }
7819     flush_put_bits(&pb);
7820
7821     init_get_bits(&gb, temp, 8*SIZE);
7822     for(i=0; i<COUNT; i++){
7823         int j, s;
7824
7825         s= show_bits(&gb, 24);
7826
7827         START_TIMER
7828         j= get_ue_golomb(&gb);
7829         if(j != i){
7830             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7831 //            return -1;
7832         }
7833         STOP_TIMER("get_ue_golomb");
7834     }
7835
7836
7837     init_put_bits(&pb, temp, SIZE);
7838     printf("testing signed exp golomb\n");
7839     for(i=0; i<COUNT; i++){
7840         START_TIMER
7841         set_se_golomb(&pb, i - COUNT/2);
7842         STOP_TIMER("set_se_golomb");
7843     }
7844     flush_put_bits(&pb);
7845
7846     init_get_bits(&gb, temp, 8*SIZE);
7847     for(i=0; i<COUNT; i++){
7848         int j, s;
7849
7850         s= show_bits(&gb, 24);
7851
7852         START_TIMER
7853         j= get_se_golomb(&gb);
7854         if(j != i - COUNT/2){
7855             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7856 //            return -1;
7857         }
7858         STOP_TIMER("get_se_golomb");
7859     }
7860
7861 #if 0
7862     printf("testing 4x4 (I)DCT\n");
7863
7864     DCTELEM block[16];
7865     uint8_t src[16], ref[16];
7866     uint64_t error= 0, max_error=0;
7867
7868     for(i=0; i<COUNT; i++){
7869         int j;
7870 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7871         for(j=0; j<16; j++){
7872             ref[j]= random()%255;
7873             src[j]= random()%255;
7874         }
7875
7876         h264_diff_dct_c(block, src, ref, 4);
7877
7878         //normalize
7879         for(j=0; j<16; j++){
7880 //            printf("%d ", block[j]);
7881             block[j]= block[j]*4;
7882             if(j&1) block[j]= (block[j]*4 + 2)/5;
7883             if(j&4) block[j]= (block[j]*4 + 2)/5;
7884         }
7885 //        printf("\n");
7886
7887         s->dsp.h264_idct_add(ref, block, 4);
7888 /*        for(j=0; j<16; j++){
7889             printf("%d ", ref[j]);
7890         }
7891         printf("\n");*/
7892
7893         for(j=0; j<16; j++){
7894             int diff= FFABS(src[j] - ref[j]);
7895
7896             error+= diff*diff;
7897             max_error= FFMAX(max_error, diff);
7898         }
7899     }
7900     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7901     printf("testing quantizer\n");
7902     for(qp=0; qp<52; qp++){
7903         for(i=0; i<16; i++)
7904             src1_block[i]= src2_block[i]= random()%255;
7905
7906     }
7907     printf("Testing NAL layer\n");
7908
7909     uint8_t bitstream[COUNT];
7910     uint8_t nal[COUNT*2];
7911     H264Context h;
7912     memset(&h, 0, sizeof(H264Context));
7913
7914     for(i=0; i<COUNT; i++){
7915         int zeros= i;
7916         int nal_length;
7917         int consumed;
7918         int out_length;
7919         uint8_t *out;
7920         int j;
7921
7922         for(j=0; j<COUNT; j++){
7923             bitstream[j]= (random() % 255) + 1;
7924         }
7925
7926         for(j=0; j<zeros; j++){
7927             int pos= random() % COUNT;
7928             while(bitstream[pos] == 0){
7929                 pos++;
7930                 pos %= COUNT;
7931             }
7932             bitstream[pos]=0;
7933         }
7934
7935         START_TIMER
7936
7937         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7938         if(nal_length<0){
7939             printf("encoding failed\n");
7940             return -1;
7941         }
7942
7943         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7944
7945         STOP_TIMER("NAL")
7946
7947         if(out_length != COUNT){
7948             printf("incorrect length %d %d\n", out_length, COUNT);
7949             return -1;
7950         }
7951
7952         if(consumed != nal_length){
7953             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7954             return -1;
7955         }
7956
7957         if(memcmp(bitstream, out, COUNT)){
7958             printf("mismatch\n");
7959             return -1;
7960         }
7961     }
7962 #endif
7963
7964     printf("Testing RBSP\n");
7965
7966
7967     return 0;
7968 }
7969 #endif /* TEST */
7970
7971
7972 static av_cold int decode_end(AVCodecContext *avctx)
7973 {
7974     H264Context *h = avctx->priv_data;
7975     MpegEncContext *s = &h->s;
7976     int i;
7977
7978     av_freep(&h->rbsp_buffer[0]);
7979     av_freep(&h->rbsp_buffer[1]);
7980     free_tables(h); //FIXME cleanup init stuff perhaps
7981
7982     for(i = 0; i < MAX_SPS_COUNT; i++)
7983         av_freep(h->sps_buffers + i);
7984
7985     for(i = 0; i < MAX_PPS_COUNT; i++)
7986         av_freep(h->pps_buffers + i);
7987
7988     MPV_common_end(s);
7989
7990 //    memset(h, 0, sizeof(H264Context));
7991
7992     return 0;
7993 }
7994
7995
7996 AVCodec h264_decoder = {
7997     "h264",
7998     CODEC_TYPE_VIDEO,
7999     CODEC_ID_H264,
8000     sizeof(H264Context),
8001     decode_init,
8002     NULL,
8003     decode_end,
8004     decode_frame,
8005     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8006     .flush= flush_dpb,
8007     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8008 };
8009
8010 #include "svq3.c"