git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /*
      * File-scope VLC decoders. Each VLC object below is paired with a
      * statically sized VLC_TYPE array and a constant holding that array's
      * element count.
      */

     /* coeff_token: one array dimensioned as the sum of the four sizes
      * listed in coeff_token_vlc_tables_size. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
     /* total_zeros: 15 tables of 512 entries each. */
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
     /* run: six tables of 8 entries each, plus a separate, larger
      * run7 table of 96 entries. */
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
     /* Prototypes for static functions that are referenced before their
      * definitions appear in this translation unit. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  /**
   * Packs the low 16 bits of two integers into one 32-bit word.
   *
   * Without WORDS_BIGENDIAN, a ends up in bits 0..15 and b in bits 16..31;
   * with WORDS_BIGENDIAN defined the two halves are swapped.
   *
   * The arithmetic is done on uint32_t: callers pass negative values
   * (e.g. reference indices of -1), and left-shifting a negative signed int
   * is undefined behaviour in C. The unsigned form yields the same bits.
   *
   * @param a first value, only its low 16 bits are kept
   * @param b second value, only its low 16 bits are kept
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  88
  89 static const uint8_t rem6[52]={ /* rem6[i] == i % 6 for i = 0..51 */
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={ /* div6[i] == i / 6 for i = 0..51 */
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={ /* per-case block indices into the left neighbour, selected in fill_caches() */
     /* Entries 0..3 index the left neighbour's intra4x4 modes, non-zero
      * counts and motion data; entries 4..7 are further non_zero_count
      * indices loaded into cache column 0 (presumably the chroma blocks --
      * TODO confirm). */
  98     {0,1,2,3,7,10,8,11}, /* default mapping */
  99     {2,2,3,3,8,11,8,11}, /* MBAFF: current is a frame MB, bottom of its pair, left pair differs */
 100     {0,0,1,1,7,10,7,10}, /* MBAFF: current is a frame MB, top of its pair, left pair differs */
 101     {0,2,0,2,7,10,7,10}  /* MBAFF: current is a field MB, left pair is frame coded */
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     /*
      * Loads the neighbour data of the macroblock at h->mb_xy from the
      * picture-wide tables into the small per-macroblock caches of h:
      * intra sample availability masks, intra4x4 prediction modes, non-zero
      * counts, CABAC cbp context, motion vectors, reference indices, mvd and
      * direct flags. Also stores the resolved neighbour positions in
      * h->top_mb_xy and h->left_mb_xy[].
      *
      * h           decoder context, read and written
      * mb_type     type flags of the current macroblock
      * for_deblock nonzero when called on behalf of the loop filter: the
      *             intra availability / prediction mode part is skipped,
      *             topleft/topright neighbours are treated as absent and
      *             neighbours are accepted across slice boundaries
      */
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
         /* the top neighbour is one mb_stride up, doubled when FIELD_PICTURE is set */
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
         /* In MBAFF frames the neighbour positions depend on whether the
          * current macroblock and the neighbouring macroblock pairs are frame
          * or field coded; adjust the defaults computed above accordingly. */
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             /* left pair coded differently from the current MB: address the
              * left pair from its top MB and pick a matching row mapping */
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
         /* publish the resolved neighbour positions */
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
             /* for deblocking a neighbour counts as present whenever its
              * slice_table entry is below 0xFFFF, regardless of the slice */
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values were changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
             /* for decoding, a neighbour is only used when it belongs to the
              * current slice; otherwise its type is 0 (unavailable) */
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
             /* with constrained_intra_pred only intra neighbours pass the
              * mask; otherwise any available neighbour does */
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
             /* start from "all available" and clear bits per missing neighbour */
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
                     /* frame MB next to a field pair: both MBs of the left
                      * pair must pass the mask */
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
             /* neighbouring intra4x4 prediction modes: real modes from an
              * intra4x4 neighbour, else -1 (neighbour fails the mask) or 2 */
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         /* non-zero counts of the top neighbour; when it is absent the
          * entries become 0 for CABAC non-intra macroblocks and 64 otherwise */
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
         /* same for the two left neighbours, using the left_block mapping */
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
         /* CABAC only: coded block pattern of the top and left neighbours;
          * a missing neighbour yields 0x1C0 for intra macroblocks, else 0 */
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
         /* motion data of the neighbours, per reference list */
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
                 /* top neighbour: its bottom row of 4 mvs and 2 ref indices */
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
                 /* left neighbours: right column, rows chosen via left_block */
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
                 /* the remaining neighbours are not loaded for deblocking, nor
                  * for non-spatial direct macroblocks outside MBAFF */
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
                 /* topleft_partition is -1 (adds two block rows / one 8x8 row)
                  * or 0 (adds nothing), see the MBAFF setup above */
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
                 /* mark fixed cache positions as not available with zero mv */
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
                     /* B slices: direct flags of the top and left neighbours */
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
                 /* MBAFF: rescale cached neighbour entries whose frame/field
                  * coding differs from the current macroblock. A field MB
                  * doubles the ref index and halves the vertical mv/mvd of
                  * frame neighbours; a frame MB does the opposite for field
                  * neighbours. MAP_MVS applies MAP_F2F to every neighbour slot. */
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
         /* number (0..2) of top / first-left neighbours using the 8x8 transform */
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1434
1435     *dst_length= di;
1436     *consumed= si + 1;//+1 for the header
1437 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1438     return dst;
1439 }
1440
1441 /**
1442  * identifies the exact end of the bitstream
1443  * @return the length of the trailing, or 0 if damaged
1444  */
1445 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1446     int v= *src;
1447     int r;
1448
1449     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1450
1451     for(r=1; r<9; r++){
1452         if(v&1) return r;
1453         v>>=1;
1454     }
1455     return 0;
1456 }
1457
1458 /**
1459  * IDCT transforms the 16 dc values and dequantizes them.
1460  * @param qp quantization parameter
1461  */
1462 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1463 #define stride 16
1464     int i;
1465     int temp[16]; //FIXME check if this is a good idea
1466     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1467     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1468
1469 //memset(block, 64, 2*256);
1470 //return;
1471     for(i=0; i<4; i++){
1472         const int offset= y_offset[i];
1473         const int z0= block[offset+stride*0] + block[offset+stride*4];
1474         const int z1= block[offset+stride*0] - block[offset+stride*4];
1475         const int z2= block[offset+stride*1] - block[offset+stride*5];
1476         const int z3= block[offset+stride*1] + block[offset+stride*5];
1477
1478         temp[4*i+0]= z0+z3;
1479         temp[4*i+1]= z1+z2;
1480         temp[4*i+2]= z1-z2;
1481         temp[4*i+3]= z0-z3;
1482     }
1483
1484     for(i=0; i<4; i++){
1485         const int offset= x_offset[i];
1486         const int z0= temp[4*0+i] + temp[4*2+i];
1487         const int z1= temp[4*0+i] - temp[4*2+i];
1488         const int z2= temp[4*1+i] - temp[4*3+i];
1489         const int z3= temp[4*1+i] + temp[4*3+i];
1490
1491         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1492         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1493         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1494         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1495     }
1496 }
1497
#if 0
/**
 * DCT transforms the 16 dc values (disabled code).
 * The quantization parameter argument is commented out and not used. FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies over the four inputs of each group */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int p0= in[stride*0];
        const int p1= in[stride*1];
        const int p4= in[stride*4];
        const int p5= in[stride*5];
        const int sum04 = p0 + p4;
        const int dif04 = p0 - p4;
        const int dif15 = p1 - p5;
        const int sum15 = p1 + p5;
        int *out= temp + 4*i;

        out[0]= sum04 + sum15;
        out[1]= dif04 + dif15;
        out[2]= dif04 - dif15;
        out[3]= sum04 - sum15;
    }

    /* second pass: butterflies across the groups, results halved */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int t0= temp[4*0+i];
        const int t1= temp[4*1+i];
        const int t2= temp[4*2+i];
        const int t3= temp[4*3+i];
        const int sum02 = t0 + t2;
        const int dif02 = t0 - t2;
        const int dif13 = t1 - t3;
        const int sum13 = t1 + t3;

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1537
1538 #undef xStride
1539 #undef stride
1540
1541 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1542     const int stride= 16*2;
1543     const int xStride= 16;
1544     int a,b,c,d,e;
1545
1546     a= block[stride*0 + xStride*0];
1547     b= block[stride*0 + xStride*1];
1548     c= block[stride*1 + xStride*0];
1549     d= block[stride*1 + xStride*1];
1550
1551     e= a-b;
1552     a= a+b;
1553     b= c-d;
1554     c= c+d;
1555
1556     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1557     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1558     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1559     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1560 }
1561
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values in block
 * without any scaling (disabled code).
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top= block;
    DCTELEM * const bot= block + row_step;
    const int tl= top[0];
    const int tr= top[col_step];
    const int bl= bot[0];
    const int br= bot[col_step];
    const int top_sum = tl + tr;
    const int top_dif = tl - tr;
    const int bot_sum = bl + br;
    const int bot_dif = bl - br;

    top[0]       = (top_sum + bot_sum);
    top[col_step]= (top_dif + bot_dif);
    bot[0]       = (top_sum - bot_sum);
    bot[col_step]= (top_dif - bot_dif);
}
#endif
1584
1585 /**
1586  * gets the chroma qp.
1587  */
1588 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1589     return h->pps.chroma_qp_table[t][qscale];
1590 }
1591
1592 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1593                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1594                            int src_x_offset, int src_y_offset,
1595                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1596     MpegEncContext * const s = &h->s;
1597     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1598     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1599     const int luma_xy= (mx&3) + ((my&3)<<2);
1600     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1601     uint8_t * src_cb, * src_cr;
1602     int extra_width= h->emu_edge_width;
1603     int extra_height= h->emu_edge_height;
1604     int emu=0;
1605     const int full_mx= mx>>2;
1606     const int full_my= my>>2;
1607     const int pic_width  = 16*s->mb_width;
1608     const int pic_height = 16*s->mb_height >> MB_FIELD;
1609
1610     if(mx&7) extra_width -= 3;
1611     if(my&7) extra_height -= 3;
1612
1613     if(   full_mx < 0-extra_width
1614        || full_my < 0-extra_height
1615        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1616        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1617         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1618             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1619         emu=1;
1620     }
1621
1622     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1623     if(!square){
1624         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1625     }
1626
1627     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1628
1629     if(MB_FIELD){
1630         // chroma offset when predicting from a field of opposite parity
1631         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1632         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1633     }
1634     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1635     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636
1637     if(emu){
1638         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1639             src_cb= s->edge_emu_buffer;
1640     }
1641     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1642
1643     if(emu){
1644         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1645             src_cr= s->edge_emu_buffer;
1646     }
1647     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1648 }
1649
1650 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1651                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1652                            int x_offset, int y_offset,
1653                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1654                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1655                            int list0, int list1){
1656     MpegEncContext * const s = &h->s;
1657     qpel_mc_func *qpix_op=  qpix_put;
1658     h264_chroma_mc_func chroma_op= chroma_put;
1659
1660     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1661     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1662     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     x_offset += 8*s->mb_x;
1664     y_offset += 8*(s->mb_y >> MB_FIELD);
1665
1666     if(list0){
1667         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1668         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1669                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1670                            qpix_op, chroma_op);
1671
1672         qpix_op=  qpix_avg;
1673         chroma_op= chroma_avg;
1674     }
1675
1676     if(list1){
1677         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681     }
1682 }
1683
1684 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1685                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1686                            int x_offset, int y_offset,
1687                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1688                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1689                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1690                            int list0, int list1){
1691     MpegEncContext * const s = &h->s;
1692
1693     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1694     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1695     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     x_offset += 8*s->mb_x;
1697     y_offset += 8*(s->mb_y >> MB_FIELD);
1698
1699     if(list0 && list1){
1700         /* don't optimize for luma-only case, since B-frames usually
1701          * use implicit weights => chroma too. */
1702         uint8_t *tmp_cb = s->obmc_scratchpad;
1703         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1704         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1705         int refn0 = h->ref_cache[0][ scan8[n] ];
1706         int refn1 = h->ref_cache[1][ scan8[n] ];
1707
1708         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1709                     dest_y, dest_cb, dest_cr,
1710                     x_offset, y_offset, qpix_put, chroma_put);
1711         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1712                     tmp_y, tmp_cb, tmp_cr,
1713                     x_offset, y_offset, qpix_put, chroma_put);
1714
1715         if(h->use_weight == 2){
1716             int weight0 = h->implicit_weight[refn0][refn1];
1717             int weight1 = 64 - weight0;
1718             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1719             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721         }else{
1722             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1723                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1724                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1725             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1726                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1727                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1729                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1730                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1731         }
1732     }else{
1733         int list = list1 ? 1 : 0;
1734         int refn = h->ref_cache[list][ scan8[n] ];
1735         Picture *ref= &h->ref_list[list][refn];
1736         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1737                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1738                     qpix_put, chroma_put);
1739
1740         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1742         if(h->use_weight_chroma){
1743             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1745             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1746                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1747         }
1748     }
1749 }
1750
1751 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1757                            int list0, int list1){
1758     if((h->use_weight==2 && list0 && list1
1759         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1760        || h->use_weight==1)
1761         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1762                          x_offset, y_offset, qpix_put, chroma_put,
1763                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1764     else
1765         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1766                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1767 }
1768
1769 static inline void prefetch_motion(H264Context *h, int list){
1770     /* fetch pixels for estimated mv 4 macroblocks ahead
1771      * optimized for 64byte cache lines */
1772     MpegEncContext * const s = &h->s;
1773     const int refn = h->ref_cache[list][scan8[0]];
1774     if(refn >= 0){
1775         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1776         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1777         uint8_t **src= h->ref_list[list][refn].data;
1778         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1779         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1780         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1781         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1782     }
1783 }
1784
1785 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1786                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1787                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1788                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1789     MpegEncContext * const s = &h->s;
1790     const int mb_xy= h->mb_xy;
1791     const int mb_type= s->current_picture.mb_type[mb_xy];
1792
1793     assert(IS_INTER(mb_type));
1794
1795     prefetch_motion(h, 0);
1796
1797     if(IS_16X16(mb_type)){
1798         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1799                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1800                 &weight_op[0], &weight_avg[0],
1801                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1802     }else if(IS_16X8(mb_type)){
1803         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1804                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1805                 &weight_op[1], &weight_avg[1],
1806                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1807         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1808                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1809                 &weight_op[1], &weight_avg[1],
1810                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1811     }else if(IS_8X16(mb_type)){
1812         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1813                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1814                 &weight_op[2], &weight_avg[2],
1815                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1816         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1817                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1818                 &weight_op[2], &weight_avg[2],
1819                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1820     }else{
1821         int i;
1822
1823         assert(IS_8X8(mb_type));
1824
1825         for(i=0; i<4; i++){
1826             const int sub_mb_type= h->sub_mb_type[i];
1827             const int n= 4*i;
1828             int x_offset= (i&1)<<2;
1829             int y_offset= (i&2)<<1;
1830
1831             if(IS_SUB_8X8(sub_mb_type)){
1832                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1833                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1834                     &weight_op[3], &weight_avg[3],
1835                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1836             }else if(IS_SUB_8X4(sub_mb_type)){
1837                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1838                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1839                     &weight_op[4], &weight_avg[4],
1840                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1841                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1842                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1843                     &weight_op[4], &weight_avg[4],
1844                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1845             }else if(IS_SUB_4X8(sub_mb_type)){
1846                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1847                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1848                     &weight_op[5], &weight_avg[5],
1849                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1850                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1851                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1852                     &weight_op[5], &weight_avg[5],
1853                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1854             }else{
1855                 int j;
1856                 assert(IS_SUB_4X4(sub_mb_type));
1857                 for(j=0; j<4; j++){
1858                     int sub_x_offset= x_offset + 2*(j&1);
1859                     int sub_y_offset= y_offset +   (j&2);
1860                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1861                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                         &weight_op[6], &weight_avg[6],
1863                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864                 }
1865             }
1866         }
1867     }
1868
1869     prefetch_motion(h, 1);
1870 }
1871
1872 static av_cold void decode_init_vlc(void){
1873     static int done = 0;
1874
1875     if (!done) {
1876         int i;
1877         int offset;
1878         done = 1;
1879
1880         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1881         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1882         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1883                  &chroma_dc_coeff_token_len [0], 1, 1,
1884                  &chroma_dc_coeff_token_bits[0], 1, 1,
1885                  INIT_VLC_USE_NEW_STATIC);
1886
1887         offset = 0;
1888         for(i=0; i<4; i++){
1889             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1890             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1891             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1892                      &coeff_token_len [i][0], 1, 1,
1893                      &coeff_token_bits[i][0], 1, 1,
1894                      INIT_VLC_USE_NEW_STATIC);
1895             offset += coeff_token_vlc_tables_size[i];
1896         }
1897         /*
1898          * This is a one time safety check to make sure that
1899          * the packed static coeff_token_vlc table sizes
1900          * were initialized correctly.
1901          */
1902         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1903
1904         for(i=0; i<3; i++){
1905             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1906             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1907             init_vlc(&chroma_dc_total_zeros_vlc[i],
1908                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1909                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1910                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1911                      INIT_VLC_USE_NEW_STATIC);
1912         }
1913         for(i=0; i<15; i++){
1914             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1915             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1916             init_vlc(&total_zeros_vlc[i],
1917                      TOTAL_ZEROS_VLC_BITS, 16,
1918                      &total_zeros_len [i][0], 1, 1,
1919                      &total_zeros_bits[i][0], 1, 1,
1920                      INIT_VLC_USE_NEW_STATIC);
1921         }
1922
1923         for(i=0; i<6; i++){
1924             run_vlc[i].table = run_vlc_tables[i];
1925             run_vlc[i].table_allocated = run_vlc_tables_size;
1926             init_vlc(&run_vlc[i],
1927                      RUN_VLC_BITS, 7,
1928                      &run_len [i][0], 1, 1,
1929                      &run_bits[i][0], 1, 1,
1930                      INIT_VLC_USE_NEW_STATIC);
1931         }
1932         run7_vlc.table = run7_vlc_table,
1933         run7_vlc.table_allocated = run7_vlc_table_size;
1934         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1935                  &run_len [6][0], 1, 1,
1936                  &run_bits[6][0], 1, 1,
1937                  INIT_VLC_USE_NEW_STATIC);
1938     }
1939 }
1940
1941 static void free_tables(H264Context *h){
1942     int i;
1943     H264Context *hx;
1944     av_freep(&h->intra4x4_pred_mode);
1945     av_freep(&h->chroma_pred_mode_table);
1946     av_freep(&h->cbp_table);
1947     av_freep(&h->mvd_table[0]);
1948     av_freep(&h->mvd_table[1]);
1949     av_freep(&h->direct_table);
1950     av_freep(&h->non_zero_count);
1951     av_freep(&h->slice_table_base);
1952     h->slice_table= NULL;
1953
1954     av_freep(&h->mb2b_xy);
1955     av_freep(&h->mb2b8_xy);
1956
1957     for(i = 0; i < h->s.avctx->thread_count; i++) {
1958         hx = h->thread_context[i];
1959         if(!hx) continue;
1960         av_freep(&hx->top_borders[1]);
1961         av_freep(&hx->top_borders[0]);
1962         av_freep(&hx->s.obmc_scratchpad);
1963     }
1964 }
1965
1966 static void init_dequant8_coeff_table(H264Context *h){
1967     int i,q,x;
1968     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1969     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1970     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1971
1972     for(i=0; i<2; i++ ){
1973         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1974             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1975             break;
1976         }
1977
1978         for(q=0; q<52; q++){
1979             int shift = div6[q];
1980             int idx = rem6[q];
1981             for(x=0; x<64; x++)
1982                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1983                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1984                     h->pps.scaling_matrix8[i][x]) << shift;
1985         }
1986     }
1987 }
1988
1989 static void init_dequant4_coeff_table(H264Context *h){
1990     int i,j,q,x;
1991     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1992     for(i=0; i<6; i++ ){
1993         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1994         for(j=0; j<i; j++){
1995             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1996                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1997                 break;
1998             }
1999         }
2000         if(j<i)
2001             continue;
2002
2003         for(q=0; q<52; q++){
2004             int shift = div6[q] + 2;
2005             int idx = rem6[q];
2006             for(x=0; x<16; x++)
2007                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2008                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2009                     h->pps.scaling_matrix4[i][x]) << shift;
2010         }
2011     }
2012 }
2013
2014 static void init_dequant_tables(H264Context *h){
2015     int i,x;
2016     init_dequant4_coeff_table(h);
2017     if(h->pps.transform_8x8_mode)
2018         init_dequant8_coeff_table(h);
2019     if(h->sps.transform_bypass){
2020         for(i=0; i<6; i++)
2021             for(x=0; x<16; x++)
2022                 h->dequant4_coeff[i][0][x] = 1<<6;
2023         if(h->pps.transform_8x8_mode)
2024             for(i=0; i<2; i++)
2025                 for(x=0; x<64; x++)
2026                     h->dequant8_coeff[i][0][x] = 1<<6;
2027     }
2028 }
2029
2030
2031 /**
2032  * allocates tables.
2033  * needs width/height
2034  */
2035 static int alloc_tables(H264Context *h){
2036     MpegEncContext * const s = &h->s;
2037     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2038     int x,y;
2039
2040     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2041
2042     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2043     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2044     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2045
2046     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2047     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2048     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2050
2051     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2052     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2053
2054     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2055     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2056     for(y=0; y<s->mb_height; y++){
2057         for(x=0; x<s->mb_width; x++){
2058             const int mb_xy= x + y*s->mb_stride;
2059             const int b_xy = 4*x + 4*y*h->b_stride;
2060             const int b8_xy= 2*x + 2*y*h->b8_stride;
2061
2062             h->mb2b_xy [mb_xy]= b_xy;
2063             h->mb2b8_xy[mb_xy]= b8_xy;
2064         }
2065     }
2066
2067     s->obmc_scratchpad = NULL;
2068
2069     if(!h->dequant4_coeff[0])
2070         init_dequant_tables(h);
2071
2072     return 0;
2073 fail:
2074     free_tables(h);
2075     return -1;
2076 }
2077
2078 /**
2079  * Mimic alloc_tables(), but for every context thread.
2080  */
2081 static void clone_tables(H264Context *dst, H264Context *src){
2082     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2083     dst->non_zero_count           = src->non_zero_count;
2084     dst->slice_table              = src->slice_table;
2085     dst->cbp_table                = src->cbp_table;
2086     dst->mb2b_xy                  = src->mb2b_xy;
2087     dst->mb2b8_xy                 = src->mb2b8_xy;
2088     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2089     dst->mvd_table[0]             = src->mvd_table[0];
2090     dst->mvd_table[1]             = src->mvd_table[1];
2091     dst->direct_table             = src->direct_table;
2092
2093     dst->s.obmc_scratchpad = NULL;
2094     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2095 }
2096
2097 /**
2098  * Init context
2099  * Allocate buffers which are not shared amongst multiple threads.
2100  */
2101 static int context_init(H264Context *h){
2102     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104
2105     return 0;
2106 fail:
2107     return -1; // free_tables will clean up for us
2108 }
2109
2110 static av_cold void common_init(H264Context *h){
2111     MpegEncContext * const s = &h->s;
2112
2113     s->width = s->avctx->width;
2114     s->height = s->avctx->height;
2115     s->codec_id= s->avctx->codec->id;
2116
2117     ff_h264_pred_init(&h->hpc, s->codec_id);
2118
2119     h->dequant_coeff_pps= -1;
2120     s->unrestricted_mv=1;
2121     s->decode=1; //FIXME
2122
2123     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2124
2125     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2126     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2127 }
2128
2129 static av_cold int decode_init(AVCodecContext *avctx){
2130     H264Context *h= avctx->priv_data;
2131     MpegEncContext * const s = &h->s;
2132
2133     MPV_decode_defaults(s);
2134
2135     s->avctx = avctx;
2136     common_init(h);
2137
2138     s->out_format = FMT_H264;
2139     s->workaround_bugs= avctx->workaround_bugs;
2140
2141     // set defaults
2142 //    s->decode_mb= ff_h263_decode_mb;
2143     s->quarter_sample = 1;
2144     s->low_delay= 1;
2145
2146     if(avctx->codec_id == CODEC_ID_SVQ3)
2147         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2148     else
2149         avctx->pix_fmt= PIX_FMT_YUV420P;
2150
2151     decode_init_vlc();
2152
2153     if(avctx->extradata_size > 0 && avctx->extradata &&
2154        *(char *)avctx->extradata == 1){
2155         h->is_avc = 1;
2156         h->got_avcC = 0;
2157     } else {
2158         h->is_avc = 0;
2159     }
2160
2161     h->thread_context[0] = h;
2162     h->outputed_poc = INT_MIN;
2163     h->prev_poc_msb= 1<<16;
2164     return 0;
2165 }
2166
2167 static int frame_start(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169     int i;
2170
2171     if(MPV_frame_start(s, s->avctx) < 0)
2172         return -1;
2173     ff_er_frame_start(s);
2174     /*
2175      * MPV_frame_start uses pict_type to derive key_frame.
2176      * This is incorrect for H.264; IDR markings must be used.
2177      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2178      * See decode_nal_units().
2179      */
2180     s->current_picture_ptr->key_frame= 0;
2181
2182     assert(s->linesize && s->uvlinesize);
2183
2184     for(i=0; i<16; i++){
2185         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2186         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2187     }
2188     for(i=0; i<4; i++){
2189         h->block_offset[16+i]=
2190         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2191         h->block_offset[24+16+i]=
2192         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2193     }
2194
2195     /* can't be in alloc_tables because linesize isn't known there.
2196      * FIXME: redo bipred weight to not require extra buffer? */
2197     for(i = 0; i < s->avctx->thread_count; i++)
2198         if(!h->thread_context[i]->s.obmc_scratchpad)
2199             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2200
2201     /* some macroblocks will be accessed before they're available */
2202     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2203         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2204
2205 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206
2207     // We mark the current picture as non-reference after allocating it, so
2208     // that if we break out due to an error it can be released automatically
2209     // in the next MPV_frame_start().
2210     // SVQ3 as well as most other codecs have only last/next/current and thus
2211     // get released even with set reference, besides SVQ3 and others do not
2212     // mark frames as reference later "naturally".
2213     if(s->codec_id != CODEC_ID_SVQ3)
2214         s->current_picture_ptr->reference= 0;
2215
2216     s->current_picture_ptr->field_poc[0]=
2217     s->current_picture_ptr->field_poc[1]= INT_MAX;
2218     assert(s->current_picture_ptr->long_ref==0);
2219
2220     return 0;
2221 }
2222
2223 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2224     MpegEncContext * const s = &h->s;
2225     int i;
2226     int step    = 1;
2227     int offset  = 1;
2228     int uvoffset= 1;
2229     int top_idx = 1;
2230     int skiplast= 0;
2231
2232     src_y  -=   linesize;
2233     src_cb -= uvlinesize;
2234     src_cr -= uvlinesize;
2235
2236     if(!simple && FRAME_MBAFF){
2237         if(s->mb_y&1){
2238             offset  = MB_MBAFF ? 1 : 17;
2239             uvoffset= MB_MBAFF ? 1 : 9;
2240             if(!MB_MBAFF){
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2242                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2243                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2245                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2246                 }
2247             }
2248         }else{
2249             if(!MB_MBAFF){
2250                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2251                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2252                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2253                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2254                 }
2255                 skiplast= 1;
2256             }
2257             offset  =
2258             uvoffset=
2259             top_idx = MB_MBAFF ? 0 : 1;
2260         }
2261         step= MB_MBAFF ? 2 : 1;
2262     }
2263
2264     // There are two lines saved, the line above the the top macroblock of a pair,
2265     // and the line above the bottom macroblock
2266     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2267     for(i=1; i<17 - skiplast; i++){
2268         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2269     }
2270
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2272     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2273
2274     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2275         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2276         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2277         for(i=1; i<9 - skiplast; i++){
2278             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2279             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2280         }
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2282         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2283     }
2284 }
2285
2286 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2287     MpegEncContext * const s = &h->s;
2288     int temp8, i;
2289     uint64_t temp64;
2290     int deblock_left;
2291     int deblock_top;
2292     int mb_xy;
2293     int step    = 1;
2294     int offset  = 1;
2295     int uvoffset= 1;
2296     int top_idx = 1;
2297
2298     if(!simple && FRAME_MBAFF){
2299         if(s->mb_y&1){
2300             offset  = MB_MBAFF ? 1 : 17;
2301             uvoffset= MB_MBAFF ? 1 : 9;
2302         }else{
2303             offset  =
2304             uvoffset=
2305             top_idx = MB_MBAFF ? 0 : 1;
2306         }
2307         step= MB_MBAFF ? 2 : 1;
2308     }
2309
2310     if(h->deblocking_filter == 2) {
2311         mb_xy = h->mb_xy;
2312         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2313         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2314     } else {
2315         deblock_left = (s->mb_x > 0);
2316         deblock_top =  (s->mb_y > !!MB_FIELD);
2317     }
2318
2319     src_y  -=   linesize + 1;
2320     src_cb -= uvlinesize + 1;
2321     src_cr -= uvlinesize + 1;
2322
2323 #define XCHG(a,b,t,xchg)\
2324 t= a;\
2325 if(xchg)\
2326     a= b;\
2327 b= t;
2328
2329     if(deblock_left){
2330         for(i = !deblock_top; i<16; i++){
2331             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2332         }
2333         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2334     }
2335
2336     if(deblock_top){
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2338         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2339         if(s->mb_x+1 < s->mb_width){
2340             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2341         }
2342     }
2343
2344     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2345         if(deblock_left){
2346             for(i = !deblock_top; i<8; i++){
2347                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2348                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2349             }
2350             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2351             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2352         }
2353         if(deblock_top){
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2355             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2356         }
2357     }
2358 }
2359
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture: intra prediction or motion compensation, addition of
 * the residual stored in h->mb, and, when h->deblocking_filter is set,
 * border backup followed by the loop filter.
 *
 * @param simple when non-zero, the branches guarded by "!simple" below are
 *               left out: MB_FIELD / FRAME_MBAFF handling, intra PCM,
 *               transform bypass, and the non-H.264 (svq3_*) paths; chroma
 *               is then always processed regardless of CODEC_FLAG_GRAY.
 *               Both callers pass a literal constant.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    /* top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma) */
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: doubled line stride and the second half of
         * block_offset (filled with the doubled stride in frame_start()) */
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            /* odd macroblock row: move the destination up by 15 luma / 7 chroma lines */
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* rewrite the cached reference indices of the used lists as
             * (16+ref) ^ (mb_y&1); in the non-16x16 case negative entries
             * are left untouched */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        /* PCM macroblock: the samples are copied straight out of h->mb,
         * 16 bytes per luma row and 8 bytes per chroma row */
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* with the loop filter active, exchange the neighbouring border
             * samples with the saved copies around intra prediction
             * (NOTE(review): presumably so prediction uses unfiltered
             * neighbours - confirm); undone by the xchg=0 call below */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        /* four 8x8 blocks: predict, then add the residual */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                /* combined prediction + residual routine for modes 0 and 1 */
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    /* single non-zero coefficient at position 0: DC-only routine */
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        /* sixteen 4x4 blocks: predict, then add the residual */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        /* no top-right block: replicate the last sample of the row above */
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                /* not intra 4x4: 16x16 prediction plus luma DC dequant/transform;
                 * the per-block residual is added further below */
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            /* inter macroblock: motion compensation */
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        /* luma residual for everything except intra 4x4 (handled above) */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    /* low four cbp bits set: some luma residual is present */
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* chroma residual: blocks 16..19 go to dest[0] (cb), 20..23 to dest[1] (cr) */
        if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                /* chroma DC: dequant table 1/2 for intra, 4/5 for inter macroblocks */
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        /* save this macroblock's border samples before filtering, then
         * refill the caches and chroma qp values and run the loop filter */
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2610
2611 /**
2612  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2613  */
2614 static void hl_decode_mb_simple(H264Context *h){
2615     hl_decode_mb_internal(h, 1);
2616 }
2617
2618 /**
2619  * Process a macroblock; this handles edge cases, such as interlacing.
2620  */
2621 static void av_noinline hl_decode_mb_complex(H264Context *h){
2622     hl_decode_mb_internal(h, 0);
2623 }
2624
2625 static void hl_decode_mb(H264Context *h){
2626     MpegEncContext * const s = &h->s;
2627     const int mb_xy= h->mb_xy;
2628     const int mb_type= s->current_picture.mb_type[mb_xy];
2629     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2630
2631     if(ENABLE_H264_ENCODER && !s->decode)
2632         return;
2633
2634     if (is_complex)
2635         hl_decode_mb_complex(h);
2636     else hl_decode_mb_simple(h);
2637 }
2638
2639 static void pic_as_field(Picture *pic, const int parity){
2640     int i;
2641     for (i = 0; i < 4; ++i) {
2642         if (parity == PICT_BOTTOM_FIELD)
2643             pic->data[i] += pic->linesize[i];
2644         pic->reference = parity;
2645         pic->linesize[i] *= 2;
2646     }
2647     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2648 }
2649
2650 static int split_field_copy(Picture *dest, Picture *src,
2651                             int parity, int id_add){
2652     int match = !!(src->reference & parity);
2653
2654     if (match) {
2655         *dest = *src;
2656         if(parity != PICT_FRAME){
2657             pic_as_field(dest, parity);
2658             dest->pic_id *= 2;
2659             dest->pic_id += id_add;
2660         }
2661     }
2662
2663     return match;
2664 }
2665
2666 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2667     int i[2]={0};
2668     int index=0;
2669
2670     while(i[0]<len || i[1]<len){
2671         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2672             i[0]++;
2673         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2674             i[1]++;
2675         if(i[0] < len){
2676             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2678         }
2679         if(i[1] < len){
2680             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2681             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2682         }
2683     }
2684
2685     return index;
2686 }
2687
2688 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2689     int i, best_poc;
2690     int out_i= 0;
2691
2692     for(;;){
2693         best_poc= dir ? INT_MIN : INT_MAX;
2694
2695         for(i=0; i<len; i++){
2696             const int poc= src[i]->poc;
2697             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2698                 best_poc= poc;
2699                 sorted[out_i]= src[i];
2700             }
2701         }
2702         if(best_poc == (dir ? INT_MIN : INT_MAX))
2703             break;
2704         limit= sorted[out_i++]->poc - dir;
2705     }
2706     return out_i;
2707 }
2708
2709 /**
2710  * fills the default_ref_list.
2711  */
2712 static int fill_default_ref_list(H264Context *h){
2713     MpegEncContext * const s = &h->s;
2714     int i, len;
2715
2716     if(h->slice_type_nos==FF_B_TYPE){
2717         Picture *sorted[32];
2718         int cur_poc, list;
2719         int lens[2];
2720
2721         if(FIELD_PICTURE)
2722             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2723         else
2724             cur_poc= s->current_picture_ptr->poc;
2725
2726         for(list= 0; list<2; list++){
2727             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2728             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2729             assert(len<=32);
2730             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2731             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2732             assert(len<=32);
2733
2734             if(len < h->ref_count[list])
2735                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2736             lens[list]= len;
2737         }
2738
2739         if(lens[0] == lens[1] && lens[1] > 1){
2740             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2741             if(i == lens[0])
2742                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2743         }
2744     }else{
2745         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2746         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2747         assert(len <= 32);
2748         if(len < h->ref_count[0])
2749             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2750     }
2751 #ifdef TRACE
2752     for (i=0; i<h->ref_count[0]; i++) {
2753         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2754     }
2755     if(h->slice_type_nos==FF_B_TYPE){
2756         for (i=0; i<h->ref_count[1]; i++) {
2757             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2758         }
2759     }
2760 #endif
2761     return 0;
2762 }
2763
2764 static void print_short_term(H264Context *h);
2765 static void print_long_term(H264Context *h);
2766
2767 /**
2768  * Extract structure information about the picture described by pic_num in
2769  * the current decoding context (frame or field). Note that pic_num is
2770  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2771  * @param pic_num picture number for which to extract structure information
2772  * @param structure one of PICT_XXX describing structure of picture
2773  *                      with pic_num
2774  * @return frame number (short term) or long term index of picture
2775  *         described by pic_num
2776  */
2777 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2778     MpegEncContext * const s = &h->s;
2779
2780     *structure = s->picture_structure;
2781     if(FIELD_PICTURE){
2782         if (!(pic_num & 1))
2783             /* opposite field */
2784             *structure ^= PICT_FRAME;
2785         pic_num >>= 1;
2786     }
2787
2788     return pic_num;
2789 }
2790
/**
 * Parse the reference picture list reordering syntax of the current slice
 * and build h->ref_list[] from h->default_ref_list[].
 *
 * Every list starts as a copy of its default list. If the bitstream signals
 * reordering for a list, each command moves the addressed short term or
 * long term picture to the next position at the front of that list.
 * Entries that have no picture data once all lists are processed are
 * replaced by the current picture.
 *
 * @return 0 on success, -1 if the reordering syntax is invalid
 */
static int decode_ref_pic_list_reordering(H264Context *h){
    MpegEncContext * const s = &h->s;
    int list, index, pic_structure;

    print_short_term(h);
    print_long_term(h);

    for(list=0; list<h->list_count; list++){
        memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);

        if(get_bits1(&s->gb)){ // reordering commands follow for this list
            /* running picture number prediction, updated by every short
             * term command */
            int pred= h->curr_pic_num;

            for(index=0; ; index++){
                unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
                unsigned int pic_id;
                int i;
                Picture *ref = NULL;

                /* 3 terminates the command list */
                if(reordering_of_pic_nums_idc==3)
                    break;

                /* more commands than entries in the list */
                if(index >= h->ref_count[list]){
                    av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
                    return -1;
                }

                if(reordering_of_pic_nums_idc<3){
                    if(reordering_of_pic_nums_idc<2){
                        /* short term: 0 subtracts, 1 adds the difference */
                        const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
                        int frame_num;

                        if(abs_diff_pic_num > h->max_pic_num){
                            av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
                            return -1;
                        }

                        if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
                        else                                pred+= abs_diff_pic_num;
                        /* wrap around; presumably max_pic_num is a power of
                         * two -- TODO confirm */
                        pred &= h->max_pic_num - 1;

                        frame_num = pic_num_extract(h, pred, &pic_structure);

                        /* search the short term list from its last entry
                         * down; i ends up negative if nothing matches */
                        for(i= h->short_ref_count-1; i>=0; i--){
                            ref = h->short_ref[i];
                            assert(ref->reference);
                            assert(!ref->long_ref);
                            if(
                                   ref->frame_num == frame_num &&
                                   (ref->reference & pic_structure)
                              )
                                break;
                        }
                        if(i>=0)
                            ref->pic_id= pred;
                    }else{
                        int long_idx;
                        pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx

                        long_idx= pic_num_extract(h, pic_id, &pic_structure);

                        if(long_idx>31){
                            av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
                            return -1;
                        }
                        ref = h->long_ref[long_idx];
                        assert(!(ref && !ref->reference));
                        /* i only serves as found (0) / missing (-1) flag here */
                        if(ref && (ref->reference & pic_structure)){
                            ref->pic_id= pic_id;
                            assert(ref->long_ref);
                            i=0;
                        }else{
                            i=-1;
                        }
                    }

                    if (i < 0) {
                        av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
                        memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
                    } else {
                        /* locate the current position of the picture in the
                         * list (or stop at the last entry) ... */
                        for(i=index; i+1<h->ref_count[list]; i++){
                            if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
                                break;
                        }
                        /* ... shift the entries in between one slot back ... */
                        for(; i > index; i--){
                            h->ref_list[list][i]= h->ref_list[list][i-1];
                        }
                        /* ... and insert the picture at the command position */
                        h->ref_list[list][index]= *ref;
                        if (FIELD_PICTURE){
                            pic_as_field(&h->ref_list[list][index], pic_structure);
                        }
                    }
                }else{
                    av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
                    return -1;
                }
            }
        }
    }
    /* substitute the current picture for entries without picture data */
    for(list=0; list<h->list_count; list++){
        for(index= 0; index < h->ref_count[list]; index++){
            if(!h->ref_list[list][index].data[0]){
                av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
                h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
            }
        }
    }

    return 0;
}
2901
2902 static void fill_mbaff_ref_list(H264Context *h){
2903     int list, i, j;
2904     for(list=0; list<2; list++){ //FIXME try list_count
2905         for(i=0; i<h->ref_count[list]; i++){
2906             Picture *frame = &h->ref_list[list][i];
2907             Picture *field = &h->ref_list[list][16+2*i];
2908             field[0] = *frame;
2909             for(j=0; j<3; j++)
2910                 field[0].linesize[j] <<= 1;
2911             field[0].reference = PICT_TOP_FIELD;
2912             field[0].poc= field[0].field_poc[0];
2913             field[1] = field[0];
2914             for(j=0; j<3; j++)
2915                 field[1].data[j] += frame->linesize[j];
2916             field[1].reference = PICT_BOTTOM_FIELD;
2917             field[1].poc= field[1].field_poc[1];
2918
2919             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2920             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2921             for(j=0; j<2; j++){
2922                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2923                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2924             }
2925         }
2926     }
2927     for(j=0; j<h->ref_count[1]; j++){
2928         for(i=0; i<h->ref_count[0]; i++)
2929             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2930         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2931         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2932     }
2933 }
2934
2935 static int pred_weight_table(H264Context *h){
2936     MpegEncContext * const s = &h->s;
2937     int list, i;
2938     int luma_def, chroma_def;
2939
2940     h->use_weight= 0;
2941     h->use_weight_chroma= 0;
2942     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2943     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2944     luma_def = 1<<h->luma_log2_weight_denom;
2945     chroma_def = 1<<h->chroma_log2_weight_denom;
2946
2947     for(list=0; list<2; list++){
2948         for(i=0; i<h->ref_count[list]; i++){
2949             int luma_weight_flag, chroma_weight_flag;
2950
2951             luma_weight_flag= get_bits1(&s->gb);
2952             if(luma_weight_flag){
2953                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2954                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2955                 if(   h->luma_weight[list][i] != luma_def
2956                    || h->luma_offset[list][i] != 0)
2957                     h->use_weight= 1;
2958             }else{
2959                 h->luma_weight[list][i]= luma_def;
2960                 h->luma_offset[list][i]= 0;
2961             }
2962
2963             if(CHROMA){
2964                 chroma_weight_flag= get_bits1(&s->gb);
2965                 if(chroma_weight_flag){
2966                     int j;
2967                     for(j=0; j<2; j++){
2968                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2969                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2970                         if(   h->chroma_weight[list][i][j] != chroma_def
2971                         || h->chroma_offset[list][i][j] != 0)
2972                             h->use_weight_chroma= 1;
2973                     }
2974                 }else{
2975                     int j;
2976                     for(j=0; j<2; j++){
2977                         h->chroma_weight[list][i][j]= chroma_def;
2978                         h->chroma_offset[list][i][j]= 0;
2979                     }
2980                 }
2981             }
2982         }
2983         if(h->slice_type_nos != FF_B_TYPE) break;
2984     }
2985     h->use_weight= h->use_weight || h->use_weight_chroma;
2986     return 0;
2987 }
2988
2989 static void implicit_weight_table(H264Context *h){
2990     MpegEncContext * const s = &h->s;
2991     int ref0, ref1;
2992     int cur_poc = s->current_picture_ptr->poc;
2993
2994     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2995        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2996         h->use_weight= 0;
2997         h->use_weight_chroma= 0;
2998         return;
2999     }
3000
3001     h->use_weight= 2;
3002     h->use_weight_chroma= 2;
3003     h->luma_log2_weight_denom= 5;
3004     h->chroma_log2_weight_denom= 5;
3005
3006     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3007         int poc0 = h->ref_list[0][ref0].poc;
3008         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3009             int poc1 = h->ref_list[1][ref1].poc;
3010             int td = av_clip(poc1 - poc0, -128, 127);
3011             if(td){
3012                 int tb = av_clip(cur_poc - poc0, -128, 127);
3013                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3014                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3015                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3016                     h->implicit_weight[ref0][ref1] = 32;
3017                 else
3018                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3019             }else
3020                 h->implicit_weight[ref0][ref1] = 32;
3021         }
3022     }
3023 }
3024
3025 /**
3026  * Mark a picture as no longer needed for reference. The refmask
3027  * argument allows unreferencing of individual fields or the whole frame.
3028  * If the picture becomes entirely unreferenced, but is being held for
3029  * display purposes, it is marked as such.
3030  * @param refmask mask of fields to unreference; the mask is bitwise
3031  *                anded with the reference marking of pic
3032  * @return non-zero if pic becomes entirely unreferenced (except possibly
3033  *         for display purposes) zero if one of the fields remains in
3034  *         reference
3035  */
3036 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3037     int i;
3038     if (pic->reference &= refmask) {
3039         return 0;
3040     } else {
3041         for(i = 0; h->delayed_pic[i]; i++)
3042             if(pic == h->delayed_pic[i]){
3043                 pic->reference=DELAYED_PIC_REF;
3044                 break;
3045             }
3046         return 1;
3047     }
3048 }
3049
3050 /**
3051  * instantaneous decoder refresh.
3052  */
3053 static void idr(H264Context *h){
3054     int i;
3055
3056     for(i=0; i<16; i++){
3057         remove_long(h, i, 0);
3058     }
3059     assert(h->long_ref_count==0);
3060
3061     for(i=0; i<h->short_ref_count; i++){
3062         unreference_pic(h, h->short_ref[i], 0);
3063         h->short_ref[i]= NULL;
3064     }
3065     h->short_ref_count=0;
3066     h->prev_frame_num= 0;
3067     h->prev_frame_num_offset= 0;
3068     h->prev_poc_msb=
3069     h->prev_poc_lsb= 0;
3070 }
3071
3072 /* forget old pics after a seek */
3073 static void flush_dpb(AVCodecContext *avctx){
3074     H264Context *h= avctx->priv_data;
3075     int i;
3076     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3077         if(h->delayed_pic[i])
3078             h->delayed_pic[i]->reference= 0;
3079         h->delayed_pic[i]= NULL;
3080     }
3081     h->outputed_poc= INT_MIN;
3082     idr(h);
3083     if(h->s.current_picture_ptr)
3084         h->s.current_picture_ptr->reference= 0;
3085     h->s.first_field= 0;
3086     ff_mpeg_flush(avctx);
3087 }
3088
3089 /**
3090  * Find a Picture in the short term reference list by frame number.
3091  * @param frame_num frame number to search for
3092  * @param idx the index into h->short_ref where returned picture is found
3093  *            undefined if no picture found.
3094  * @return pointer to the found picture, or NULL if no pic with the provided
3095  *                 frame number is found
3096  */
3097 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3098     MpegEncContext * const s = &h->s;
3099     int i;
3100
3101     for(i=0; i<h->short_ref_count; i++){
3102         Picture *pic= h->short_ref[i];
3103         if(s->avctx->debug&FF_DEBUG_MMCO)
3104             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3105         if(pic->frame_num == frame_num) {
3106             *idx = i;
3107             return pic;
3108         }
3109     }
3110     return NULL;
3111 }
3112
3113 /**
3114  * Remove a picture from the short term reference list by its index in
3115  * that list.  This does no checking on the provided index; it is assumed
3116  * to be valid. Other list entries are shifted down.
3117  * @param i index into h->short_ref of picture to remove.
3118  */
3119 static void remove_short_at_index(H264Context *h, int i){
3120     assert(i >= 0 && i < h->short_ref_count);
3121     h->short_ref[i]= NULL;
3122     if (--h->short_ref_count)
3123         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3124 }
3125
3126 /**
3127  *
3128  * @return the removed picture or NULL if an error occurs
3129  */
3130 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3131     MpegEncContext * const s = &h->s;
3132     Picture *pic;
3133     int i;
3134
3135     if(s->avctx->debug&FF_DEBUG_MMCO)
3136         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3137
3138     pic = find_short(h, frame_num, &i);
3139     if (pic){
3140         if(unreference_pic(h, pic, ref_mask))
3141         remove_short_at_index(h, i);
3142     }
3143
3144     return pic;
3145 }
3146
3147 /**
3148  * Remove a picture from the long term reference list by its index in
3149  * that list.
3150  * @return the removed picture or NULL if an error occurs
3151  */
3152 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3153     Picture *pic;
3154
3155     pic= h->long_ref[i];
3156     if (pic){
3157         if(unreference_pic(h, pic, ref_mask)){
3158             assert(h->long_ref[i]->long_ref == 1);
3159             h->long_ref[i]->long_ref= 0;
3160             h->long_ref[i]= NULL;
3161             h->long_ref_count--;
3162         }
3163     }
3164
3165     return pic;
3166 }
3167
3168 /**
3169  * print short term list
3170  */
3171 static void print_short_term(H264Context *h) {
3172     uint32_t i;
3173     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3174         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3175         for(i=0; i<h->short_ref_count; i++){
3176             Picture *pic= h->short_ref[i];
3177             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3178         }
3179     }
3180 }
3181
3182 /**
3183  * print long term list
3184  */
3185 static void print_long_term(H264Context *h) {
3186     uint32_t i;
3187     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3188         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3189         for(i = 0; i < 16; i++){
3190             Picture *pic= h->long_ref[i];
3191             if (pic) {
3192                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3193             }
3194         }
3195     }
3196 }
3197
3198 /**
3199  * Executes the reference picture marking (memory management control operations).
3200  */
3201 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3202     MpegEncContext * const s = &h->s;
3203     int i, j;
3204     int current_ref_assigned=0;
3205     Picture *pic;
3206
3207     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3208         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3209
3210     for(i=0; i<mmco_count; i++){
3211         int structure, frame_num;
3212         if(s->avctx->debug&FF_DEBUG_MMCO)
3213             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3214
3215         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3216            || mmco[i].opcode == MMCO_SHORT2LONG){
3217             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3218             pic = find_short(h, frame_num, &j);
3219             if(!pic){
3220                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3221                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3222                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3223                 continue;
3224             }
3225         }
3226
3227         switch(mmco[i].opcode){
3228         case MMCO_SHORT2UNUSED:
3229             if(s->avctx->debug&FF_DEBUG_MMCO)
3230                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3231             remove_short(h, frame_num, structure ^ PICT_FRAME);
3232             break;
3233         case MMCO_SHORT2LONG:
3234                 if (h->long_ref[mmco[i].long_arg] != pic)
3235                     remove_long(h, mmco[i].long_arg, 0);
3236
3237                 remove_short_at_index(h, j);
3238                 h->long_ref[ mmco[i].long_arg ]= pic;
3239                 if (h->long_ref[ mmco[i].long_arg ]){
3240                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3241                     h->long_ref_count++;
3242                 }
3243             break;
3244         case MMCO_LONG2UNUSED:
3245             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3246             pic = h->long_ref[j];
3247             if (pic) {
3248                 remove_long(h, j, structure ^ PICT_FRAME);
3249             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3250                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3251             break;
3252         case MMCO_LONG:
3253                     // Comment below left from previous code as it is an interresting note.
3254                     /* First field in pair is in short term list or
3255                      * at a different long term index.
3256                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3257                      * Report the problem and keep the pair where it is,
3258                      * and mark this field valid.
3259                      */
3260
3261             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3262                 remove_long(h, mmco[i].long_arg, 0);
3263
3264                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3265                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3266                 h->long_ref_count++;
3267             }
3268
3269             s->current_picture_ptr->reference |= s->picture_structure;
3270             current_ref_assigned=1;
3271             break;
3272         case MMCO_SET_MAX_LONG:
3273             assert(mmco[i].long_arg <= 16);
3274             // just remove the long term which index is greater than new max
3275             for(j = mmco[i].long_arg; j<16; j++){
3276                 remove_long(h, j, 0);
3277             }
3278             break;
3279         case MMCO_RESET:
3280             while(h->short_ref_count){
3281                 remove_short(h, h->short_ref[0]->frame_num, 0);
3282             }
3283             for(j = 0; j < 16; j++) {
3284                 remove_long(h, j, 0);
3285             }
3286             s->current_picture_ptr->poc=
3287             s->current_picture_ptr->field_poc[0]=
3288             s->current_picture_ptr->field_poc[1]=
3289             h->poc_lsb=
3290             h->poc_msb=
3291             h->frame_num=
3292             s->current_picture_ptr->frame_num= 0;
3293             break;
3294         default: assert(0);
3295         }
3296     }
3297
3298     if (!current_ref_assigned) {
3299         /* Second field of complementary field pair; the first field of
3300          * which is already referenced. If short referenced, it
3301          * should be first entry in short_ref. If not, it must exist
3302          * in long_ref; trying to put it on the short list here is an
3303          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3304          */
3305         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3306             /* Just mark the second field valid */
3307             s->current_picture_ptr->reference = PICT_FRAME;
3308         } else if (s->current_picture_ptr->long_ref) {
3309             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3310                                              "assignment for second field "
3311                                              "in complementary field pair "
3312                                              "(first field is long term)\n");
3313         } else {
3314             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3315             if(pic){
3316                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3317             }
3318
3319             if(h->short_ref_count)
3320                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3321
3322             h->short_ref[0]= s->current_picture_ptr;
3323             h->short_ref_count++;
3324             s->current_picture_ptr->reference |= s->picture_structure;
3325         }
3326     }
3327
3328     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3329
3330         /* We have too many reference frames, probably due to corrupted
3331          * stream. Need to discard one frame. Prevents overrun of the
3332          * short_ref and long_ref buffers.
3333          */
3334         av_log(h->s.avctx, AV_LOG_ERROR,
3335                "number of reference frames exceeds max (probably "
3336                "corrupt input), discarding one\n");
3337
3338         if (h->long_ref_count && !h->short_ref_count) {
3339             for (i = 0; i < 16; ++i)
3340                 if (h->long_ref[i])
3341                     break;
3342
3343             assert(i < 16);
3344             remove_long(h, i, 0);
3345         } else {
3346             pic = h->short_ref[h->short_ref_count - 1];
3347             remove_short(h, pic->frame_num, 0);
3348         }
3349     }
3350
3351     print_short_term(h);
3352     print_long_term(h);
3353     return 0;
3354 }
3355
/**
 * Parse the decoded reference picture marking syntax from gb into
 * h->mmco[] / h->mmco_index. The operations are only recorded here; this
 * function does not modify the reference lists itself.
 *
 * - IDR slices: one flag is stored (minus one) in s->broken_link, and a
 *   second flag optionally requests a MMCO_LONG with long term index 0.
 * - Other slices with the adaptive flag set: up to MAX_MMCO_COUNT explicit
 *   operations are read until MMCO_END.
 * - Otherwise: if the reference lists are full, operations are generated
 *   that unreference the last entry of the short term list (both of its
 *   fields when decoding a field picture).
 *
 * @return 0 on success, -1 on an illegal operation or long term argument
 */
static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
    MpegEncContext * const s = &h->s;
    int i;

    h->mmco_index= 0;
    if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
        s->broken_link= get_bits1(gb) -1;
        if(get_bits1(gb)){
            /* mark the current picture as long term with index 0 */
            h->mmco[0].opcode= MMCO_LONG;
            h->mmco[0].long_arg= 0;
            h->mmco_index= 1;
        }
    }else{
        if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
            for(i= 0; i<MAX_MMCO_COUNT; i++) {
                MMCOOpcode opcode= get_ue_golomb(gb);

                h->mmco[i].opcode= opcode;
                if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
                    /* coded as a difference to the current picture number,
                     * wrapped by masking with max_pic_num - 1 */
                    h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
/*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
                        av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
                        return -1;
                    }*/
                }
                if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
                    unsigned int long_arg= get_ue_golomb(gb);
                    /* values 16..31 are only accepted for MMCO_LONG2UNUSED
                     * while decoding a field picture */
                    if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
                        av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
                        return -1;
                    }
                    h->mmco[i].long_arg= long_arg;
                }

                if(opcode > (unsigned)MMCO_LONG){
                    av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
                    return -1;
                }
                if(opcode == MMCO_END)
                    break;
            }
            /* number of operations read, not counting MMCO_END */
            h->mmco_index= i;
        }else{
            assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);

            /* Lists are full: schedule removal of the last short term
             * entry, except when this is the second field of a picture
             * that is already referenced. */
            if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
                    !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
                h->mmco[0].opcode= MMCO_SHORT2UNUSED;
                h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
                h->mmco_index= 1;
                if (FIELD_PICTURE) {
                    /* one operation per field: picture numbers 2*n and 2*n+1 */
                    h->mmco[0].short_pic_num *= 2;
                    h->mmco[1].opcode= MMCO_SHORT2UNUSED;
                    h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
                    h->mmco_index= 2;
                }
            }
        }
    }

    return 0;
}
3418
3419 static int init_poc(H264Context *h){
3420     MpegEncContext * const s = &h->s;
3421     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3422     int field_poc[2];
3423     Picture *cur = s->current_picture_ptr;
3424
3425     h->frame_num_offset= h->prev_frame_num_offset;
3426     if(h->frame_num < h->prev_frame_num)
3427         h->frame_num_offset += max_frame_num;
3428
3429     if(h->sps.poc_type==0){
3430         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3431
3432         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3433             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3434         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3435             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3436         else
3437             h->poc_msb = h->prev_poc_msb;
3438 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3439         field_poc[0] =
3440         field_poc[1] = h->poc_msb + h->poc_lsb;
3441         if(s->picture_structure == PICT_FRAME)
3442             field_poc[1] += h->delta_poc_bottom;
3443     }else if(h->sps.poc_type==1){
3444         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3445         int i;
3446
3447         if(h->sps.poc_cycle_length != 0)
3448             abs_frame_num = h->frame_num_offset + h->frame_num;
3449         else
3450             abs_frame_num = 0;
3451
3452         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3453             abs_frame_num--;
3454
3455         expected_delta_per_poc_cycle = 0;
3456         for(i=0; i < h->sps.poc_cycle_length; i++)
3457             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3458
3459         if(abs_frame_num > 0){
3460             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3461             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3462
3463             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3464             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3465                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3466         } else
3467             expectedpoc = 0;
3468
3469         if(h->nal_ref_idc == 0)
3470             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3471
3472         field_poc[0] = expectedpoc + h->delta_poc[0];
3473         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3474
3475         if(s->picture_structure == PICT_FRAME)
3476             field_poc[1] += h->delta_poc[1];
3477     }else{
3478         int poc= 2*(h->frame_num_offset + h->frame_num);
3479
3480         if(!h->nal_ref_idc)
3481             poc--;
3482
3483         field_poc[0]= poc;
3484         field_poc[1]= poc;
3485     }
3486
3487     if(s->picture_structure != PICT_BOTTOM_FIELD)
3488         s->current_picture_ptr->field_poc[0]= field_poc[0];
3489     if(s->picture_structure != PICT_TOP_FIELD)
3490         s->current_picture_ptr->field_poc[1]= field_poc[1];
3491     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3492
3493     return 0;
3494 }
3495
3496
3497 /**
3498  * initialize scan tables
3499  */
3500 static void init_scan_tables(H264Context *h){
3501     MpegEncContext * const s = &h->s;
3502     int i;
3503     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3504         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3505         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3506     }else{
3507         for(i=0; i<16; i++){
3508 #define T(x) (x>>2) | ((x<<2) & 0xF)
3509             h->zigzag_scan[i] = T(zigzag_scan[i]);
3510             h-> field_scan[i] = T( field_scan[i]);
3511 #undef T
3512         }
3513     }
3514     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3515         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3516         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3517         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3518         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3519     }else{
3520         for(i=0; i<64; i++){
3521 #define T(x) (x>>3) | ((x&7)<<3)
3522             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3523             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3524             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3525             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3526 #undef T
3527         }
3528     }
3529     if(h->sps.transform_bypass){ //FIXME same ugly
3530         h->zigzag_scan_q0          = zigzag_scan;
3531         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3532         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3533         h->field_scan_q0           = field_scan;
3534         h->field_scan8x8_q0        = field_scan8x8;
3535         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3536     }else{
3537         h->zigzag_scan_q0          = h->zigzag_scan;
3538         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3539         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3540         h->field_scan_q0           = h->field_scan;
3541         h->field_scan8x8_q0        = h->field_scan8x8;
3542         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3543     }
3544 }
3545
3546 /**
3547  * Replicates H264 "master" context to thread contexts.
3548  */
3549 static void clone_slice(H264Context *dst, H264Context *src)
3550 {
3551     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3552     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3553     dst->s.current_picture      = src->s.current_picture;
3554     dst->s.linesize             = src->s.linesize;
3555     dst->s.uvlinesize           = src->s.uvlinesize;
3556     dst->s.first_field          = src->s.first_field;
3557
3558     dst->prev_poc_msb           = src->prev_poc_msb;
3559     dst->prev_poc_lsb           = src->prev_poc_lsb;
3560     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3561     dst->prev_frame_num         = src->prev_frame_num;
3562     dst->short_ref_count        = src->short_ref_count;
3563
3564     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3565     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3566     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3567     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3568
3569     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3570     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3571 }
3572
3573 /**
3574  * decodes a slice header.
3575  * This will also call MPV_common_init() and frame_start() as needed.
3576  *
3577  * @param h h264context
3578  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3579  *
3580  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3581  */
3582 static int decode_slice_header(H264Context *h, H264Context *h0){
3583     MpegEncContext * const s = &h->s;
3584     MpegEncContext * const s0 = &h0->s;
3585     unsigned int first_mb_in_slice;
3586     unsigned int pps_id;
3587     int num_ref_idx_active_override_flag;
3588     unsigned int slice_type, tmp, i, j;
3589     int default_ref_list_done = 0;
3590     int last_pic_structure;
3591
3592     s->dropable= h->nal_ref_idc == 0;
3593
3594     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3595         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3596         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3597     }else{
3598         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3599         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3600     }
3601
3602     first_mb_in_slice= get_ue_golomb(&s->gb);
3603
3604     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3605         h0->current_slice = 0;
3606         if (!s0->first_field)
3607             s->current_picture_ptr= NULL;
3608     }
3609
3610     slice_type= get_ue_golomb(&s->gb);
3611     if(slice_type > 9){
3612         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3613         return -1;
3614     }
3615     if(slice_type > 4){
3616         slice_type -= 5;
3617         h->slice_type_fixed=1;
3618     }else
3619         h->slice_type_fixed=0;
3620
3621     slice_type= golomb_to_pict_type[ slice_type ];
3622     if (slice_type == FF_I_TYPE
3623         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3624         default_ref_list_done = 1;
3625     }
3626     h->slice_type= slice_type;
3627     h->slice_type_nos= slice_type & 3;
3628
3629     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3630     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3631         av_log(h->s.avctx, AV_LOG_ERROR,
3632                "B picture before any references, skipping\n");
3633         return -1;
3634     }
3635
3636     pps_id= get_ue_golomb(&s->gb);
3637     if(pps_id>=MAX_PPS_COUNT){
3638         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3639         return -1;
3640     }
3641     if(!h0->pps_buffers[pps_id]) {
3642         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3643         return -1;
3644     }
3645     h->pps= *h0->pps_buffers[pps_id];
3646
3647     if(!h0->sps_buffers[h->pps.sps_id]) {
3648         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3649         return -1;
3650     }
3651     h->sps = *h0->sps_buffers[h->pps.sps_id];
3652
3653     if(h == h0 && h->dequant_coeff_pps != pps_id){
3654         h->dequant_coeff_pps = pps_id;
3655         init_dequant_tables(h);
3656     }
3657
3658     s->mb_width= h->sps.mb_width;
3659     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3660
3661     h->b_stride=  s->mb_width*4;
3662     h->b8_stride= s->mb_width*2;
3663
3664     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3665     if(h->sps.frame_mbs_only_flag)
3666         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3667     else
3668         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3669
3670     if (s->context_initialized
3671         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3672         if(h != h0)
3673             return -1;   // width / height changed during parallelized decoding
3674         free_tables(h);
3675         flush_dpb(s->avctx);
3676         MPV_common_end(s);
3677     }
3678     if (!s->context_initialized) {
3679         if(h != h0)
3680             return -1;  // we cant (re-)initialize context during parallel decoding
3681         if (MPV_common_init(s) < 0)
3682             return -1;
3683         s->first_field = 0;
3684
3685         init_scan_tables(h);
3686         alloc_tables(h);
3687
3688         for(i = 1; i < s->avctx->thread_count; i++) {
3689             H264Context *c;
3690             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3691             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3692             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3693             c->sps = h->sps;
3694             c->pps = h->pps;
3695             init_scan_tables(c);
3696             clone_tables(c, h);
3697         }
3698
3699         for(i = 0; i < s->avctx->thread_count; i++)
3700             if(context_init(h->thread_context[i]) < 0)
3701                 return -1;
3702
3703         s->avctx->width = s->width;
3704         s->avctx->height = s->height;
3705         s->avctx->sample_aspect_ratio= h->sps.sar;
3706         if(!s->avctx->sample_aspect_ratio.den)
3707             s->avctx->sample_aspect_ratio.den = 1;
3708
3709         if(h->sps.timing_info_present_flag){
3710             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3711             if(h->x264_build > 0 && h->x264_build < 44)
3712                 s->avctx->time_base.den *= 2;
3713             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3714                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3715         }
3716     }
3717
3718     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3719
3720     h->mb_mbaff = 0;
3721     h->mb_aff_frame = 0;
3722     last_pic_structure = s0->picture_structure;
3723     if(h->sps.frame_mbs_only_flag){
3724         s->picture_structure= PICT_FRAME;
3725     }else{
3726         if(get_bits1(&s->gb)) { //field_pic_flag
3727             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3728         } else {
3729             s->picture_structure= PICT_FRAME;
3730             h->mb_aff_frame = h->sps.mb_aff;
3731         }
3732     }
3733     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3734
3735     if(h0->current_slice == 0){
3736         while(h->frame_num !=  h->prev_frame_num &&
3737               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3738             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3739             frame_start(h);
3740             h->prev_frame_num++;
3741             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3742             s->current_picture_ptr->frame_num= h->prev_frame_num;
3743             execute_ref_pic_marking(h, NULL, 0);
3744         }
3745
3746         /* See if we have a decoded first field looking for a pair... */
3747         if (s0->first_field) {
3748             assert(s0->current_picture_ptr);
3749             assert(s0->current_picture_ptr->data[0]);
3750             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3751
3752             /* figure out if we have a complementary field pair */
3753             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3754                 /*
3755                  * Previous field is unmatched. Don't display it, but let it
3756                  * remain for reference if marked as such.
3757                  */
3758                 s0->current_picture_ptr = NULL;
3759                 s0->first_field = FIELD_PICTURE;
3760
3761             } else {
3762                 if (h->nal_ref_idc &&
3763                         s0->current_picture_ptr->reference &&
3764                         s0->current_picture_ptr->frame_num != h->frame_num) {
3765                     /*
3766                      * This and previous field were reference, but had
3767                      * different frame_nums. Consider this field first in
3768                      * pair. Throw away previous field except for reference
3769                      * purposes.
3770                      */
3771                     s0->first_field = 1;
3772                     s0->current_picture_ptr = NULL;
3773
3774                 } else {
3775                     /* Second field in complementary pair */
3776                     s0->first_field = 0;
3777                 }
3778             }
3779
3780         } else {
3781             /* Frame or first field in a potentially complementary pair */
3782             assert(!s0->current_picture_ptr);
3783             s0->first_field = FIELD_PICTURE;
3784         }
3785
3786         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3787             s0->first_field = 0;
3788             return -1;
3789         }
3790     }
3791     if(h != h0)
3792         clone_slice(h, h0);
3793
3794     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3795
3796     assert(s->mb_num == s->mb_width * s->mb_height);
3797     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3798        first_mb_in_slice                    >= s->mb_num){
3799         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3800         return -1;
3801     }
3802     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3803     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3804     if (s->picture_structure == PICT_BOTTOM_FIELD)
3805         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3806     assert(s->mb_y < s->mb_height);
3807
3808     if(s->picture_structure==PICT_FRAME){
3809         h->curr_pic_num=   h->frame_num;
3810         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3811     }else{
3812         h->curr_pic_num= 2*h->frame_num + 1;
3813         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3814     }
3815
3816     if(h->nal_unit_type == NAL_IDR_SLICE){
3817         get_ue_golomb(&s->gb); /* idr_pic_id */
3818     }
3819
3820     if(h->sps.poc_type==0){
3821         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3822
3823         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3824             h->delta_poc_bottom= get_se_golomb(&s->gb);
3825         }
3826     }
3827
3828     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3829         h->delta_poc[0]= get_se_golomb(&s->gb);
3830
3831         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3832             h->delta_poc[1]= get_se_golomb(&s->gb);
3833     }
3834
3835     init_poc(h);
3836
3837     if(h->pps.redundant_pic_cnt_present){
3838         h->redundant_pic_count= get_ue_golomb(&s->gb);
3839     }
3840
3841     //set defaults, might be overridden a few lines later
3842     h->ref_count[0]= h->pps.ref_count[0];
3843     h->ref_count[1]= h->pps.ref_count[1];
3844
3845     if(h->slice_type_nos != FF_I_TYPE){
3846         if(h->slice_type_nos == FF_B_TYPE){
3847             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3848         }
3849         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3850
3851         if(num_ref_idx_active_override_flag){
3852             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3853             if(h->slice_type_nos==FF_B_TYPE)
3854                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3855
3856             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3857                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3858                 h->ref_count[0]= h->ref_count[1]= 1;
3859                 return -1;
3860             }
3861         }
3862         if(h->slice_type_nos == FF_B_TYPE)
3863             h->list_count= 2;
3864         else
3865             h->list_count= 1;
3866     }else
3867         h->list_count= 0;
3868
3869     if(!default_ref_list_done){
3870         fill_default_ref_list(h);
3871     }
3872
3873     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3874         return -1;
3875
3876     if(h->slice_type_nos!=FF_I_TYPE){
3877         s->last_picture_ptr= &h->ref_list[0][0];
3878         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3879     }
3880     if(h->slice_type_nos==FF_B_TYPE){
3881         s->next_picture_ptr= &h->ref_list[1][0];
3882         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3883     }
3884
3885     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3886        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3887         pred_weight_table(h);
3888     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3889         implicit_weight_table(h);
3890     else
3891         h->use_weight = 0;
3892
3893     if(h->nal_ref_idc)
3894         decode_ref_pic_marking(h0, &s->gb);
3895
3896     if(FRAME_MBAFF)
3897         fill_mbaff_ref_list(h);
3898
3899     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3900         direct_dist_scale_factor(h);
3901     direct_ref_list_init(h);
3902
3903     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3904         tmp = get_ue_golomb(&s->gb);
3905         if(tmp > 2){
3906             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3907             return -1;
3908         }
3909         h->cabac_init_idc= tmp;
3910     }
3911
3912     h->last_qscale_diff = 0;
3913     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3914     if(tmp>51){
3915         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3916         return -1;
3917     }
3918     s->qscale= tmp;
3919     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3920     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3921     //FIXME qscale / qp ... stuff
3922     if(h->slice_type == FF_SP_TYPE){
3923         get_bits1(&s->gb); /* sp_for_switch_flag */
3924     }
3925     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3926         get_se_golomb(&s->gb); /* slice_qs_delta */
3927     }
3928
3929     h->deblocking_filter = 1;
3930     h->slice_alpha_c0_offset = 0;
3931     h->slice_beta_offset = 0;
3932     if( h->pps.deblocking_filter_parameters_present ) {
3933         tmp= get_ue_golomb(&s->gb);
3934         if(tmp > 2){
3935             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3936             return -1;
3937         }
3938         h->deblocking_filter= tmp;
3939         if(h->deblocking_filter < 2)
3940             h->deblocking_filter^= 1; // 1<->0
3941
3942         if( h->deblocking_filter ) {
3943             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3944             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3945         }
3946     }
3947
3948     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3949        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3950        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3951        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3952         h->deblocking_filter= 0;
3953
3954     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3955         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3956             /* Cheat slightly for speed:
3957                Do not bother to deblock across slices. */
3958             h->deblocking_filter = 2;
3959         } else {
3960             h0->max_contexts = 1;
3961             if(!h0->single_decode_warning) {
3962                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3963                 h0->single_decode_warning = 1;
3964             }
3965             if(h != h0)
3966                 return 1; // deblocking switched inside frame
3967         }
3968     }
3969
3970 #if 0 //FMO
3971     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3972         slice_group_change_cycle= get_bits(&s->gb, ?);
3973 #endif
3974
3975     h0->last_slice_type = slice_type;
3976     h->slice_num = ++h0->current_slice;
3977     if(h->slice_num >= MAX_SLICES){
3978         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3979     }
3980
3981     for(j=0; j<2; j++){
3982         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3983         ref2frm[0]=
3984         ref2frm[1]= -1;
3985         for(i=0; i<16; i++)
3986             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3987                           +(h->ref_list[j][i].reference&3);
3988         ref2frm[18+0]=
3989         ref2frm[18+1]= -1;
3990         for(i=16; i<48; i++)
3991             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3992                           +(h->ref_list[j][i].reference&3);
3993     }
3994
3995     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3996     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3997
3998     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3999         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4000                h->slice_num,
4001                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4002                first_mb_in_slice,
4003                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4004                pps_id, h->frame_num,
4005                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4006                h->ref_count[0], h->ref_count[1],
4007                s->qscale,
4008                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4009                h->use_weight,
4010                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4011                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4012                );
4013     }
4014
4015     return 0;
4016 }
4017
4018 /**
4019  *
4020  */
4021 static inline int get_level_prefix(GetBitContext *gb){
4022     unsigned int buf;
4023     int log;
4024
4025     OPEN_READER(re, gb);
4026     UPDATE_CACHE(re, gb);
4027     buf=GET_CACHE(re, gb);
4028
4029     log= 32 - av_log2(buf);
4030 #ifdef TRACE
4031     print_bin(buf>>(32-log), log);
4032     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4033 #endif
4034
4035     LAST_SKIP_BITS(re, gb, log);
4036     CLOSE_READER(re, gb);
4037
4038     return log-1;
4039 }
4040
4041 static inline int get_dct8x8_allowed(H264Context *h){
4042     int i;
4043     for(i=0; i<4; i++){
4044         if(!IS_SUB_8X8(h->sub_mb_type[i])
4045            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4046             return 0;
4047     }
4048     return 1;
4049 }
4050
4051 /**
4052  * decodes a residual block.
4053  * @param n block index
4054  * @param scantable scantable
4055  * @param max_coeff number of coefficients in the block
4056  * @return <0 if an error occurred
4057  */
4058 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4059     MpegEncContext * const s = &h->s;
4060     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4061     int level[16];
4062     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4063
4064     //FIXME put trailing_onex into the context
4065
4066     if(n == CHROMA_DC_BLOCK_INDEX){
4067         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4068         total_coeff= coeff_token>>2;
4069     }else{
4070         if(n == LUMA_DC_BLOCK_INDEX){
4071             total_coeff= pred_non_zero_count(h, 0);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074         }else{
4075             total_coeff= pred_non_zero_count(h, n);
4076             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4077             total_coeff= coeff_token>>2;
4078             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4079         }
4080     }
4081
4082     //FIXME set last_non_zero?
4083
4084     if(total_coeff==0)
4085         return 0;
4086     if(total_coeff > (unsigned)max_coeff) {
4087         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4088         return -1;
4089     }
4090
4091     trailing_ones= coeff_token&3;
4092     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4093     assert(total_coeff<=16);
4094
4095     i = show_bits(gb, 3);
4096     skip_bits(gb, trailing_ones);
4097     level[0] = 1-((i&4)>>1);
4098     level[1] = 1-((i&2)   );
4099     level[2] = 1-((i&1)<<1);
4100
4101     if(trailing_ones<total_coeff) {
4102         int level_code, mask;
4103         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4104         int prefix= get_level_prefix(gb);
4105
4106         //first coefficient has suffix_length equal to 0 or 1
4107         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4108             if(suffix_length)
4109                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4110             else
4111                 level_code= (prefix<<suffix_length); //part
4112         }else if(prefix==14){
4113             if(suffix_length)
4114                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4115             else
4116                 level_code= prefix + get_bits(gb, 4); //part
4117         }else{
4118             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4119             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4120             if(prefix>=16)
4121                 level_code += (1<<(prefix-3))-4096;
4122         }
4123
4124         if(trailing_ones < 3) level_code += 2;
4125
4126         suffix_length = 1;
4127         if(level_code > 5)
4128             suffix_length++;
4129         mask= -(level_code&1);
4130         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4131
4132         //remaining coefficients have suffix_length > 0
4133         for(i=trailing_ones+1;i<total_coeff;i++) {
4134             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4135             prefix = get_level_prefix(gb);
4136             if(prefix<15){
4137                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4138             }else{
4139                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4140                 if(prefix>=16)
4141                     level_code += (1<<(prefix-3))-4096;
4142             }
4143             mask= -(level_code&1);
4144             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4145             if(level_code > suffix_limit[suffix_length])
4146                 suffix_length++;
4147         }
4148     }
4149
4150     if(total_coeff == max_coeff)
4151         zeros_left=0;
4152     else{
4153         if(n == CHROMA_DC_BLOCK_INDEX)
4154             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4155         else
4156             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4157     }
4158
4159     coeff_num = zeros_left + total_coeff - 1;
4160     j = scantable[coeff_num];
4161     if(n > 24){
4162         block[j] = level[0];
4163         for(i=1;i<total_coeff;i++) {
4164             if(zeros_left <= 0)
4165                 run_before = 0;
4166             else if(zeros_left < 7){
4167                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4168             }else{
4169                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4170             }
4171             zeros_left -= run_before;
4172             coeff_num -= 1 + run_before;
4173             j= scantable[ coeff_num ];
4174
4175             block[j]= level[i];
4176         }
4177     }else{
4178         block[j] = (level[0] * qmul[j] + 32)>>6;
4179         for(i=1;i<total_coeff;i++) {
4180             if(zeros_left <= 0)
4181                 run_before = 0;
4182             else if(zeros_left < 7){
4183                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4184             }else{
4185                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4186             }
4187             zeros_left -= run_before;
4188             coeff_num -= 1 + run_before;
4189             j= scantable[ coeff_num ];
4190
4191             block[j]= (level[i] * qmul[j] + 32)>>6;
4192         }
4193     }
4194
4195     if(zeros_left<0){
4196         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4197         return -1;
4198     }
4199
4200     return 0;
4201 }
4202
4203 static void predict_field_decoding_flag(H264Context *h){
4204     MpegEncContext * const s = &h->s;
4205     const int mb_xy= h->mb_xy;
4206     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4207                 ? s->current_picture.mb_type[mb_xy-1]
4208                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4209                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4210                 : 0;
4211     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4212 }
4213
4214 /**
4215  * decodes a P_SKIP or B_SKIP macroblock
4216  */
4217 static void decode_mb_skip(H264Context *h){
4218     MpegEncContext * const s = &h->s;
4219     const int mb_xy= h->mb_xy;
4220     int mb_type=0;
4221
4222     memset(h->non_zero_count[mb_xy], 0, 16);
4223     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4224
4225     if(MB_FIELD)
4226         mb_type|= MB_TYPE_INTERLACED;
4227
4228     if( h->slice_type_nos == FF_B_TYPE )
4229     {
4230         // just for fill_caches. pred_direct_motion will set the real mb_type
4231         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4232
4233         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4234         pred_direct_motion(h, &mb_type);
4235         mb_type|= MB_TYPE_SKIP;
4236     }
4237     else
4238     {
4239         int mx, my;
4240         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4241
4242         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4243         pred_pskip_motion(h, &mx, &my);
4244         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4245         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4246     }
4247
4248     write_back_motion(h, mb_type);
4249     s->current_picture.mb_type[mb_xy]= mb_type;
4250     s->current_picture.qscale_table[mb_xy]= s->qscale;
4251     h->slice_table[ mb_xy ]= h->slice_num;
4252     h->prev_mb_skipped= 1;
4253 }
4254
4255 /**
 * Decodes one macroblock of a CAVLC coded slice.
 *
 * Reads the skip run and mb_type from s->gb, then, depending on the
 * macroblock type, the intra prediction modes or the reference indices and
 * motion vector differences, the coded block pattern, the quantizer delta and
 * the residual coefficients. Results go into the caches of the H264Context
 * and are written back to the per-picture tables (mb_type, qscale_table) and
 * the per-context tables (slice_table, non_zero_count, ...).
 *
 * @param h decoder context; s->mb_x and s->mb_y select the macroblock
 * @returns 0 if OK (this includes skipped and PCM macroblocks), -1 if an
 *          out of range syntax element or a residual decoding error is noticed
 *
 * NOTE(review): for MB_MBAFF macroblocks h->ref_count[] is doubled before the
 * prediction data is parsed and only halved again on the success path at the
 * very end; the "return -1" paths in between leave it doubled. Confirm that
 * the caller re-initializes ref_count after an error.
 */
4259 static int decode_mb_cavlc(H264Context *h){
4260     MpegEncContext * const s = &h->s;
4261     int mb_xy;
4262     int partition_count;
4263     unsigned int mb_type, cbp;
4264     int dct8x8_allowed= h->pps.transform_8x8_mode;
4265
4266     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4267
4268     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4269
4270     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4271     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4272                 down the code */
4273     if(h->slice_type_nos != FF_I_TYPE){
        // mb_skip_run == -1 marks that a new run length has to be read
4274         if(s->mb_skip_run==-1)
4275             s->mb_skip_run= get_ue_golomb(&s->gb);
4276
        // nonzero run: this macroblock is skipped; the counter is decremented either way
4277         if (s->mb_skip_run--) {
4278             if(FRAME_MBAFF && (s->mb_y&1) == 0){
                // last skipped MB of the run: the field flag is read, otherwise it is predicted
4279                 if(s->mb_skip_run==0)
4280                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4281                 else
4282                     predict_field_decoding_flag(h);
4283             }
4284             decode_mb_skip(h);
4285             return 0;
4286         }
4287     }
4288     if(FRAME_MBAFF){
        // the field decoding flag is read only on even macroblock rows
4289         if( (s->mb_y&1) == 0 )
4290             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4291     }
4292
4293     h->prev_mb_skipped= 0;
4294
    // raw mb_type code; mapped through the per slice type tables below
4295     mb_type= get_ue_golomb(&s->gb);
4296     if(h->slice_type_nos == FF_B_TYPE){
4297         if(mb_type < 23){
4298             partition_count= b_mb_type_info[mb_type].partition_count;
4299             mb_type=         b_mb_type_info[mb_type].type;
4300         }else{
            // codes >= 23 in a B slice select an intra type
4301             mb_type -= 23;
4302             goto decode_intra_mb;
4303         }
4304     }else if(h->slice_type_nos == FF_P_TYPE){
4305         if(mb_type < 5){
4306             partition_count= p_mb_type_info[mb_type].partition_count;
4307             mb_type=         p_mb_type_info[mb_type].type;
4308         }else{
            // codes >= 5 in a P slice select an intra type
4309             mb_type -= 5;
4310             goto decode_intra_mb;
4311         }
4312     }else{
4313        assert(h->slice_type_nos == FF_I_TYPE);
        // in SI slices a nonzero code is shifted down by one before the table lookup
4314         if(h->slice_type == FF_SI_TYPE && mb_type)
4315             mb_type--;
4316 decode_intra_mb:
        // reject codes beyond 25 before indexing i_mb_type_info
4317         if(mb_type > 25){
4318             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4319             return -1;
4320         }
4321         partition_count=0;
4322         cbp= i_mb_type_info[mb_type].cbp;
4323         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4324         mb_type= i_mb_type_info[mb_type].type;
4325     }
4326
4327     if(MB_FIELD)
4328         mb_type |= MB_TYPE_INTERLACED;
4329
4330     h->slice_table[ mb_xy ]= h->slice_num;
4331
4332     if(IS_INTRA_PCM(mb_type)){
4333         unsigned int x;
4334
4335         // We assume these blocks are very rare so we do not optimize it.
4336         align_get_bits(&s->gb);
4337
4338         // The pixels are stored in the same order as levels in h->mb array.
4339         for(x=0; x < (CHROMA ? 384 : 256); x++){
4340             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4341         }
4342
4343         // In deblocking, the quantizer is 0
4344         s->current_picture.qscale_table[mb_xy]= 0;
4345         // All coeffs are present
4346         memset(h->non_zero_count[mb_xy], 16, 16);
4347
4348         s->current_picture.mb_type[mb_xy]= mb_type;
4349         return 0;
4350     }
4351
    // doubled here, halved again just before the final return 0
4352     if(MB_MBAFF){
4353         h->ref_count[0] <<= 1;
4354         h->ref_count[1] <<= 1;
4355     }
4356
4357     fill_caches(h, mb_type, 0);
4358
4359     //mb_pred
4360     if(IS_INTRA(mb_type)){
4361         int pred_mode;
4362 //            init_top_left_availability(h);
4363         if(IS_INTRA4x4(mb_type)){
4364             int i;
4365             int di = 1;
            // with the 8x8 transform one mode is read per 8x8 block (i advances by 4)
4366             if(dct8x8_allowed && get_bits1(&s->gb)){
4367                 mb_type |= MB_TYPE_8x8DCT;
4368                 di = 4;
4369             }
4370
4371 //                fill_intra4x4_pred_table(h);
4372             for(i=0; i<16; i+=di){
4373                 int mode= pred_intra_mode(h, i);
4374
                // flag bit 0: a 3 bit remaining mode follows, which skips over the predicted mode
4375                 if(!get_bits1(&s->gb)){
4376                     const int rem_mode= get_bits(&s->gb, 3);
4377                     mode = rem_mode + (rem_mode >= mode);
4378                 }
4379
4380                 if(di==4)
4381                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4382                 else
4383                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4384             }
4385             write_back_intra_pred_mode(h);
4386             if( check_intra4x4_pred_mode(h) < 0)
4387                 return -1;
4388         }else{
4389             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4390             if(h->intra16x16_pred_mode < 0)
4391                 return -1;
4392         }
4393         if(CHROMA){
4394             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4395             if(pred_mode < 0)
4396                 return -1;
4397             h->chroma_pred_mode= pred_mode;
4398         }
4399     }else if(partition_count==4){
        // four 8x8 sub macroblocks, each with its own sub_mb_type
4400         int i, j, sub_partition_count[4], list, ref[2][4];
4401
4402         if(h->slice_type_nos == FF_B_TYPE){
4403             for(i=0; i<4; i++){
4404                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4405                 if(h->sub_mb_type[i] >=13){
4406                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4407                     return -1;
4408                 }
4409                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4410                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4411             }
4412             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4413                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4414                 pred_direct_motion(h, &mb_type);
4415                 h->ref_cache[0][scan8[4]] =
4416                 h->ref_cache[1][scan8[4]] =
4417                 h->ref_cache[0][scan8[12]] =
4418                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4419             }
4420         }else{
4421             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4422             for(i=0; i<4; i++){
4423                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4424                 if(h->sub_mb_type[i] >=4){
4425                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4426                     return -1;
4427                 }
4428                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4429                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4430             }
4431         }
4432
        // first pass: all reference indices (kept in ref[][] until the motion vector pass)
4433         for(list=0; list<h->list_count; list++){
4434             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4435             for(i=0; i<4; i++){
4436                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4437                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4438                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4439                     if(tmp>=ref_count){
4440                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4441                         return -1;
4442                     }
4443                     ref[list][i]= tmp;
4444                 }else{
4445                  //FIXME
4446                     ref[list][i] = -1;
4447                 }
4448             }
4449         }
4450
4451         if(dct8x8_allowed)
4452             dct8x8_allowed = get_dct8x8_allowed(h);
4453
        // second pass: store the references in ref_cache and read the motion vector differences
4454         for(list=0; list<h->list_count; list++){
4455             for(i=0; i<4; i++){
4456                 if(IS_DIRECT(h->sub_mb_type[i])) {
4457                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4458                     continue;
4459                 }
4460                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4461                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4462
4463                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4464                     const int sub_mb_type= h->sub_mb_type[i];
4465                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4466                     for(j=0; j<sub_partition_count[i]; j++){
4467                         int mx, my;
4468                         const int index= 4*i + block_width*j;
4469                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4470                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4471                         mx += get_se_golomb(&s->gb);
4472                         my += get_se_golomb(&s->gb);
4473                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4474
                        // replicate the vector over the cache entries covered by this sub partition
4475                         if(IS_SUB_8X8(sub_mb_type)){
4476                             mv_cache[ 1 ][0]=
4477                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4478                             mv_cache[ 1 ][1]=
4479                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4480                         }else if(IS_SUB_8X4(sub_mb_type)){
4481                             mv_cache[ 1 ][0]= mx;
4482                             mv_cache[ 1 ][1]= my;
4483                         }else if(IS_SUB_4X8(sub_mb_type)){
4484                             mv_cache[ 8 ][0]= mx;
4485                             mv_cache[ 8 ][1]= my;
4486                         }
4487                         mv_cache[ 0 ][0]= mx;
4488                         mv_cache[ 0 ][1]= my;
4489                     }
4490                 }else{
                    // list not used by this sub macroblock: zero its four cached vectors
4491                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4492                     p[0] = p[1]=
4493                     p[8] = p[9]= 0;
4494                 }
4495             }
4496         }
4497     }else if(IS_DIRECT(mb_type)){
4498         pred_direct_motion(h, &mb_type);
4499         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4500     }else{
        // 16x16, 16x8 or 8x16: for each shape all reference indices are read first, then all vectors
4501         int list, mx, my, i;
4502          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4503         if(IS_16X16(mb_type)){
4504             for(list=0; list<h->list_count; list++){
4505                     unsigned int val;
4506                     if(IS_DIR(mb_type, 0, list)){
4507                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4508                         if(val >= h->ref_count[list]){
4509                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4510                             return -1;
4511                         }
4512                     }else
4513                         val= LIST_NOT_USED&0xFF;
4514                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4515             }
4516             for(list=0; list<h->list_count; list++){
4517                 unsigned int val;
4518                 if(IS_DIR(mb_type, 0, list)){
4519                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4520                     mx += get_se_golomb(&s->gb);
4521                     my += get_se_golomb(&s->gb);
4522                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4523
4524                     val= pack16to32(mx,my);
4525                 }else
4526                     val=0;
4527                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4528             }
4529         }
4530         else if(IS_16X8(mb_type)){
4531             for(list=0; list<h->list_count; list++){
4532                     for(i=0; i<2; i++){
4533                         unsigned int val;
4534                         if(IS_DIR(mb_type, i, list)){
4535                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4536                             if(val >= h->ref_count[list]){
4537                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4538                                 return -1;
4539                             }
4540                         }else
4541                             val= LIST_NOT_USED&0xFF;
4542                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4543                     }
4544             }
4545             for(list=0; list<h->list_count; list++){
4546                 for(i=0; i<2; i++){
4547                     unsigned int val;
4548                     if(IS_DIR(mb_type, i, list)){
4549                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4550                         mx += get_se_golomb(&s->gb);
4551                         my += get_se_golomb(&s->gb);
4552                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4553
4554                         val= pack16to32(mx,my);
4555                     }else
4556                         val=0;
4557                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4558                 }
4559             }
4560         }else{
4561             assert(IS_8X16(mb_type));
4562             for(list=0; list<h->list_count; list++){
4563                     for(i=0; i<2; i++){
4564                         unsigned int val;
4565                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4566                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4567                             if(val >= h->ref_count[list]){
4568                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4569                                 return -1;
4570                             }
4571                         }else
4572                             val= LIST_NOT_USED&0xFF;
4573                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4574                     }
4575             }
4576             for(list=0; list<h->list_count; list++){
4577                 for(i=0; i<2; i++){
4578                     unsigned int val;
4579                     if(IS_DIR(mb_type, i, list)){
4580                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4581                         mx += get_se_golomb(&s->gb);
4582                         my += get_se_golomb(&s->gb);
4583                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4584
4585                         val= pack16to32(mx,my);
4586                     }else
4587                         val=0;
4588                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4589                 }
4590             }
4591         }
4592     }
4593
4594     if(IS_INTER(mb_type))
4595         write_back_motion(h, mb_type);
4596
    // the coded block pattern is read unless the type is intra 16x16, where it
    // was already taken from i_mb_type_info above
4597     if(!IS_INTRA16x16(mb_type)){
4598         cbp= get_ue_golomb(&s->gb);
4599         if(cbp > 47){
4600             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4601             return -1;
4602         }
4603
4604         if(CHROMA){
4605             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4606             else                     cbp= golomb_to_inter_cbp   [cbp];
4607         }else{
4608             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4609             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4610         }
4611     }
4612     h->cbp = cbp;
4613
    // for non intra macroblocks the 8x8 transform flag is read only when some luma block is coded
4614     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4615         if(get_bits1(&s->gb)){
4616             mb_type |= MB_TYPE_8x8DCT;
4617             h->cbp_table[mb_xy]= cbp;
4618         }
4619     }
4620     s->current_picture.mb_type[mb_xy]= mb_type;
4621
4622     if(cbp || IS_INTRA16x16(mb_type)){
4623         int i8x8, i4x4, chroma_idx;
4624         int dquant;
4625         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4626         const uint8_t *scan, *scan8x8, *dc_scan;
4627
4628 //        fill_non_zero_count_cache(h);
4629
        // scan tables depend on field/frame coding; the _q0 variants are used when qscale is 0
4630         if(IS_INTERLACED(mb_type)){
4631             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4632             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4633             dc_scan= luma_dc_field_scan;
4634         }else{
4635             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4636             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4637             dc_scan= luma_dc_zigzag_scan;
4638         }
4639
4640         dquant= get_se_golomb(&s->gb);
4641
4642         if( dquant > 25 || dquant < -26 ){
4643             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4644             return -1;
4645         }
4646
        // apply the delta and wrap the result back into 0..51
4647         s->qscale += dquant;
4648         if(((unsigned)s->qscale) > 51){
4649             if(s->qscale<0) s->qscale+= 52;
4650             else            s->qscale-= 52;
4651         }
4652
4653         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4654         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4655         if(IS_INTRA16x16(mb_type)){
            // luma DC block first, then (if cbp says so) 15 AC coefficients per 4x4 block
4656             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4657                 return -1; //FIXME continue if partitioned and other return -1 too
4658             }
4659
4660             assert((cbp&15) == 0 || (cbp&15) == 15);
4661
4662             if(cbp&15){
4663                 for(i8x8=0; i8x8<4; i8x8++){
4664                     for(i4x4=0; i4x4<4; i4x4++){
4665                         const int index= i4x4 + 4*i8x8;
4666                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4667                             return -1;
4668                         }
4669                     }
4670                 }
4671             }else{
4672                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4673             }
4674         }else{
            // one cbp bit per 8x8 luma block
4675             for(i8x8=0; i8x8<4; i8x8++){
4676                 if(cbp & (1<<i8x8)){
4677                     if(IS_8x8DCT(mb_type)){
4678                         DCTELEM *buf = &h->mb[64*i8x8];
4679                         uint8_t *nnz;
4680                         for(i4x4=0; i4x4<4; i4x4++){
4681                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4682                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4683                                 return -1;
4684                         }
                        // fold the four 4x4 counts of this 8x8 block into its first cache entry
4685                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4686                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4687                     }else{
4688                         for(i4x4=0; i4x4<4; i4x4++){
4689                             const int index= i4x4 + 4*i8x8;
4690
4691                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4692                                 return -1;
4693                             }
4694                         }
4695                     }
4696                 }else{
4697                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4698                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4699                 }
4700             }
4701         }
4702
        // cbp bits 0x30: chroma DC blocks are coded
4703         if(cbp&0x30){
4704             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4705                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4706                     return -1;
4707                 }
4708         }
4709
        // cbp bit 0x20: chroma AC blocks are coded as well
4710         if(cbp&0x20){
4711             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4712                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4713                 for(i4x4=0; i4x4<4; i4x4++){
4714                     const int index= 16 + 4*chroma_idx + i4x4;
4715                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4716                         return -1;
4717                     }
4718                 }
4719             }
4720         }else{
4721             uint8_t * const nnz= &h->non_zero_count_cache[0];
4722             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4723             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4724         }
4725     }else{
        // nothing coded: clear the luma and chroma non zero counts in the cache
4726         uint8_t * const nnz= &h->non_zero_count_cache[0];
4727         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4728         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4729         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4730     }
4731     s->current_picture.qscale_table[mb_xy]= s->qscale;
4732     write_back_non_zero_count(h);
4733
    // undo the doubling done before fill_caches
4734     if(MB_MBAFF){
4735         h->ref_count[0] >>= 1;
4736         h->ref_count[1] >>= 1;
4737     }
4738
4739     return 0;
4740 }
4741
4742 static int decode_cabac_field_decoding_flag(H264Context *h) {
4743     MpegEncContext * const s = &h->s;
4744     const int mb_x = s->mb_x;
4745     const int mb_y = s->mb_y & ~1;
4746     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4747     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4748
4749     unsigned int ctx = 0;
4750
4751     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4752         ctx += 1;
4753     }
4754     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4755         ctx += 1;
4756     }
4757
4758     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4759 }
4760
4761 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4762     uint8_t *state= &h->cabac_state[ctx_base];
4763     int mb_type;
4764
4765     if(intra_slice){
4766         MpegEncContext * const s = &h->s;
4767         const int mba_xy = h->left_mb_xy[0];
4768         const int mbb_xy = h->top_mb_xy;
4769         int ctx=0;
4770         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4771             ctx++;
4772         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4773             ctx++;
4774         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4775             return 0;   /* I4x4 */
4776         state += 2;
4777     }else{
4778         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4779             return 0;   /* I4x4 */
4780     }
4781
4782     if( get_cabac_terminate( &h->cabac ) )
4783         return 25;  /* PCM */
4784
4785     mb_type = 1; /* I16x16 */
4786     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4787     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4788         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4789     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4790     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4791     return mb_type;
4792 }
4793
4794 static int decode_cabac_mb_type( H264Context *h ) {
4795     MpegEncContext * const s = &h->s;
4796
4797     if( h->slice_type_nos == FF_I_TYPE ) {
4798         return decode_cabac_intra_mb_type(h, 3, 1);
4799     } else if( h->slice_type_nos == FF_P_TYPE ) {
4800         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4801             /* P-type */
4802             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4803                 /* P_L0_D16x16, P_8x8 */
4804                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4805             } else {
4806                 /* P_L0_D8x16, P_L0_D16x8 */
4807                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4808             }
4809         } else {
4810             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4811         }
4812     } else {
4813         const int mba_xy = h->left_mb_xy[0];
4814         const int mbb_xy = h->top_mb_xy;
4815         int ctx = 0;
4816         int bits;
4817         assert(h->slice_type_nos == FF_B_TYPE);
4818
4819         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4820             ctx++;
4821         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4822             ctx++;
4823
4824         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4825             return 0; /* B_Direct_16x16 */
4826
4827         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4828             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4829         }
4830
4831         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4832         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4833         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4834         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4835         if( bits < 8 )
4836             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4837         else if( bits == 13 ) {
4838             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4839         } else if( bits == 14 )
4840             return 11; /* B_L1_L0_8x16 */
4841         else if( bits == 15 )
4842             return 22; /* B_8x8 */
4843
4844         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4845         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4846     }
4847 }
4848
4849 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4850     MpegEncContext * const s = &h->s;
4851     int mba_xy, mbb_xy;
4852     int ctx = 0;
4853
4854     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4855         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4856         mba_xy = mb_xy - 1;
4857         if( (mb_y&1)
4858             && h->slice_table[mba_xy] == h->slice_num
4859             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4860             mba_xy += s->mb_stride;
4861         if( MB_FIELD ){
4862             mbb_xy = mb_xy - s->mb_stride;
4863             if( !(mb_y&1)
4864                 && h->slice_table[mbb_xy] == h->slice_num
4865                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4866                 mbb_xy -= s->mb_stride;
4867         }else
4868             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4869     }else{
4870         int mb_xy = h->mb_xy;
4871         mba_xy = mb_xy - 1;
4872         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4873     }
4874
4875     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4876         ctx++;
4877     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4878         ctx++;
4879
4880     if( h->slice_type_nos == FF_B_TYPE )
4881         ctx += 13;
4882     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4883 }
4884
4885 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4886     int mode = 0;
4887
4888     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4889         return pred_mode;
4890
4891     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4892     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4893     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4894
4895     if( mode >= pred_mode )
4896         return mode + 1;
4897     else
4898         return mode;
4899 }
4900
4901 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4902     const int mba_xy = h->left_mb_xy[0];
4903     const int mbb_xy = h->top_mb_xy;
4904
4905     int ctx = 0;
4906
4907     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4908     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4909         ctx++;
4910
4911     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4912         ctx++;
4913
4914     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4915         return 0;
4916
4917     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4918         return 1;
4919     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4920         return 2;
4921     else
4922         return 3;
4923 }
4924
4925 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4926     int cbp_b, cbp_a, ctx, cbp = 0;
4927
4928     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4929     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4930
4931     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4933     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4935     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4936     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4937     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4938     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4939     return cbp;
4940 }
4941 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4942     int ctx;
4943     int cbp_a, cbp_b;
4944
4945     cbp_a = (h->left_cbp>>4)&0x03;
4946     cbp_b = (h-> top_cbp>>4)&0x03;
4947
4948     ctx = 0;
4949     if( cbp_a > 0 ) ctx++;
4950     if( cbp_b > 0 ) ctx += 2;
4951     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4952         return 0;
4953
4954     ctx = 4;
4955     if( cbp_a == 2 ) ctx++;
4956     if( cbp_b == 2 ) ctx += 2;
4957     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4958 }
4959 static int decode_cabac_mb_dqp( H264Context *h) {
4960     int   ctx= h->last_qscale_diff != 0;
4961     int   val = 0;
4962
4963     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4964         ctx= 2+(ctx>>1);
4965         val++;
4966         if(val > 102) //prevent infinite loop
4967             return INT_MIN;
4968     }
4969
4970     if( val&0x01 )
4971         return   (val + 1)>>1 ;
4972     else
4973         return -((val + 1)>>1);
4974 }
4975 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4976     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4977         return 0;   /* 8x8 */
4978     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4979         return 1;   /* 8x4 */
4980     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4981         return 2;   /* 4x8 */
4982     return 3;       /* 4x4 */
4983 }
4984 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4985     int type;
4986     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4987         return 0;   /* B_Direct_8x8 */
4988     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4989         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4990     type = 3;
4991     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4992         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4993             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4994         type += 4;
4995     }
4996     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4997     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4998     return type;
4999 }
5000
5001 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5002     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5003 }
5004
5005 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5006     int refa = h->ref_cache[list][scan8[n] - 1];
5007     int refb = h->ref_cache[list][scan8[n] - 8];
5008     int ref  = 0;
5009     int ctx  = 0;
5010
5011     if( h->slice_type_nos == FF_B_TYPE) {
5012         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5013             ctx++;
5014         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5015             ctx += 2;
5016     } else {
5017         if( refa > 0 )
5018             ctx++;
5019         if( refb > 0 )
5020             ctx += 2;
5021     }
5022
5023     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5024         ref++;
5025         if( ctx < 4 )
5026             ctx = 4;
5027         else
5028             ctx = 5;
5029         if(ref >= 32 /*h->ref_list[list]*/){
5030             return -1;
5031         }
5032     }
5033     return ref;
5034 }
5035
5036 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5037     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5038                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5039     int ctxbase = (l == 0) ? 40 : 47;
5040     int mvd;
5041     int ctx = (amvd>2) + (amvd>32);
5042
5043     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5044         return 0;
5045
5046     mvd= 1;
5047     ctx= 3;
5048     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5049         mvd++;
5050         if( ctx < 6 )
5051             ctx++;
5052     }
5053
5054     if( mvd >= 9 ) {
5055         int k = 3;
5056         while( get_cabac_bypass( &h->cabac ) ) {
5057             mvd += 1 << k;
5058             k++;
5059             if(k>24){
5060                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5061                 return INT_MIN;
5062             }
5063         }
5064         while( k-- ) {
5065             if( get_cabac_bypass( &h->cabac ) )
5066                 mvd += 1 << k;
5067         }
5068     }
5069     return get_cabac_bypass_sign( &h->cabac, -mvd );
5070 }
5071
5072 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5073     int nza, nzb;
5074     int ctx = 0;
5075
5076     if( is_dc ) {
5077         if( cat == 0 ) {
5078             nza = h->left_cbp&0x100;
5079             nzb = h-> top_cbp&0x100;
5080         } else {
5081             nza = (h->left_cbp>>(6+idx))&0x01;
5082             nzb = (h-> top_cbp>>(6+idx))&0x01;
5083         }
5084     } else {
5085         assert(cat == 1 || cat == 2 || cat == 4);
5086         nza = h->non_zero_count_cache[scan8[idx] - 1];
5087         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5088     }
5089
5090     if( nza > 0 )
5091         ctx++;
5092
5093     if( nzb > 0 )
5094         ctx += 2;
5095
5096     return ctx + 4 * cat;
5097 }
5098
5099 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5100     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5101     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5102     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5103     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5104 };
5105
5106 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5107     static const int significant_coeff_flag_offset[2][6] = {
5108       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5109       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5110     };
5111     static const int last_coeff_flag_offset[2][6] = {
5112       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5113       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5114     };
5115     static const int coeff_abs_level_m1_offset[6] = {
5116         227+0, 227+10, 227+20, 227+30, 227+39, 426
5117     };
5118     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5119       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5120         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5121         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5122        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5123       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5124         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5125         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5126         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5127     };
5128     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5129      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5130      * map node ctx => cabac ctx for level=1 */
5131     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5132     /* map node ctx => cabac ctx for level>1 */
5133     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5134     static const uint8_t coeff_abs_level_transition[2][8] = {
5135     /* update node ctx after decoding a level=1 */
5136         { 1, 2, 3, 3, 4, 5, 6, 7 },
5137     /* update node ctx after decoding a level>1 */
5138         { 4, 4, 4, 4, 5, 6, 7, 7 }
5139     };
5140
5141     int index[64];
5142
5143     int av_unused last;
5144     int coeff_count = 0;
5145     int node_ctx = 0;
5146
5147     uint8_t *significant_coeff_ctx_base;
5148     uint8_t *last_coeff_ctx_base;
5149     uint8_t *abs_level_m1_ctx_base;
5150
5151 #ifndef ARCH_X86
5152 #define CABAC_ON_STACK
5153 #endif
5154 #ifdef CABAC_ON_STACK
5155 #define CC &cc
5156     CABACContext cc;
5157     cc.range     = h->cabac.range;
5158     cc.low       = h->cabac.low;
5159     cc.bytestream= h->cabac.bytestream;
5160 #else
5161 #define CC &h->cabac
5162 #endif
5163
5164
5165     /* cat: 0-> DC 16x16  n = 0
5166      *      1-> AC 16x16  n = luma4x4idx
5167      *      2-> Luma4x4   n = luma4x4idx
5168      *      3-> DC Chroma n = iCbCr
5169      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5170      *      5-> Luma8x8   n = 4 * luma8x8idx
5171      */
5172
5173     /* read coded block flag */
5174     if( is_dc || cat != 5 ) {
5175         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5176             if( !is_dc )
5177                 h->non_zero_count_cache[scan8[n]] = 0;
5178
5179 #ifdef CABAC_ON_STACK
5180             h->cabac.range     = cc.range     ;
5181             h->cabac.low       = cc.low       ;
5182             h->cabac.bytestream= cc.bytestream;
5183 #endif
5184             return;
5185         }
5186     }
5187
5188     significant_coeff_ctx_base = h->cabac_state
5189         + significant_coeff_flag_offset[MB_FIELD][cat];
5190     last_coeff_ctx_base = h->cabac_state
5191         + last_coeff_flag_offset[MB_FIELD][cat];
5192     abs_level_m1_ctx_base = h->cabac_state
5193         + coeff_abs_level_m1_offset[cat];
5194
5195     if( !is_dc && cat == 5 ) {
5196 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5197         for(last= 0; last < coefs; last++) { \
5198             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5199             if( get_cabac( CC, sig_ctx )) { \
5200                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5201                 index[coeff_count++] = last; \
5202                 if( get_cabac( CC, last_ctx ) ) { \
5203                     last= max_coeff; \
5204                     break; \
5205                 } \
5206             } \
5207         }\
5208         if( last == max_coeff -1 ) {\
5209             index[coeff_count++] = last;\
5210         }
5211         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5212 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5213         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5214     } else {
5215         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5216 #else
5217         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5218     } else {
5219         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5220 #endif
5221     }
5222     assert(coeff_count > 0);
5223
5224     if( is_dc ) {
5225         if( cat == 0 )
5226             h->cbp_table[h->mb_xy] |= 0x100;
5227         else
5228             h->cbp_table[h->mb_xy] |= 0x40 << n;
5229     } else {
5230         if( cat == 5 )
5231             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5232         else {
5233             assert( cat == 1 || cat == 2 || cat == 4 );
5234             h->non_zero_count_cache[scan8[n]] = coeff_count;
5235         }
5236     }
5237
5238     do {
5239         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5240
5241         int j= scantable[index[--coeff_count]];
5242
5243         if( get_cabac( CC, ctx ) == 0 ) {
5244             node_ctx = coeff_abs_level_transition[0][node_ctx];
5245             if( is_dc ) {
5246                 block[j] = get_cabac_bypass_sign( CC, -1);
5247             }else{
5248                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5249             }
5250         } else {
5251             int coeff_abs = 2;
5252             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5253             node_ctx = coeff_abs_level_transition[1][node_ctx];
5254
5255             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5256                 coeff_abs++;
5257             }
5258
5259             if( coeff_abs >= 15 ) {
5260                 int j = 0;
5261                 while( get_cabac_bypass( CC ) ) {
5262                     j++;
5263                 }
5264
5265                 coeff_abs=1;
5266                 while( j-- ) {
5267                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5268                 }
5269                 coeff_abs+= 14;
5270             }
5271
5272             if( is_dc ) {
5273                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5274             }else{
5275                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5276             }
5277         }
5278     } while( coeff_count );
5279 #ifdef CABAC_ON_STACK
5280             h->cabac.range     = cc.range     ;
5281             h->cabac.low       = cc.low       ;
5282             h->cabac.bytestream= cc.bytestream;
5283 #endif
5284
5285 }
5286
5287 #ifndef CONFIG_SMALL
5288 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5289     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5290 }
5291
5292 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5293     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5294 }
5295 #endif
5296
5297 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5298 #ifdef CONFIG_SMALL
5299     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5300 #else
5301     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5302     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5303 #endif
5304 }
5305
5306 static inline void compute_mb_neighbors(H264Context *h)
5307 {
5308     MpegEncContext * const s = &h->s;
5309     const int mb_xy  = h->mb_xy;
5310     h->top_mb_xy     = mb_xy - s->mb_stride;
5311     h->left_mb_xy[0] = mb_xy - 1;
5312     if(FRAME_MBAFF){
5313         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5314         const int top_pair_xy      = pair_xy     - s->mb_stride;
5315         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5316         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5317         const int curr_mb_frame_flag = !MB_FIELD;
5318         const int bottom = (s->mb_y & 1);
5319         if (bottom
5320                 ? !curr_mb_frame_flag // bottom macroblock
5321                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5322                 ) {
5323             h->top_mb_xy -= s->mb_stride;
5324         }
5325         if (left_mb_frame_flag != curr_mb_frame_flag) {
5326             h->left_mb_xy[0] = pair_xy - 1;
5327         }
5328     } else if (FIELD_PICTURE) {
5329         h->top_mb_xy -= s->mb_stride;
5330     }
5331     return;
5332 }
5333
5334 /**
5335  * decodes a macroblock
5336  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5337  */
5338 static int decode_mb_cabac(H264Context *h) {
5339     MpegEncContext * const s = &h->s;
5340     int mb_xy;
5341     int mb_type, partition_count, cbp = 0;
5342     int dct8x8_allowed= h->pps.transform_8x8_mode;
5343
5344     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5345
5346     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5347
5348     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5349     if( h->slice_type_nos != FF_I_TYPE ) {
5350         int skip;
5351         /* a skipped mb needs the aff flag from the following mb */
5352         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5353             predict_field_decoding_flag(h);
5354         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5355             skip = h->next_mb_skipped;
5356         else
5357             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5358         /* read skip flags */
5359         if( skip ) {
5360             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5361                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5362                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5363                 if(h->next_mb_skipped)
5364                     predict_field_decoding_flag(h);
5365                 else
5366                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5367             }
5368
5369             decode_mb_skip(h);
5370
5371             h->cbp_table[mb_xy] = 0;
5372             h->chroma_pred_mode_table[mb_xy] = 0;
5373             h->last_qscale_diff = 0;
5374
5375             return 0;
5376
5377         }
5378     }
5379     if(FRAME_MBAFF){
5380         if( (s->mb_y&1) == 0 )
5381             h->mb_mbaff =
5382             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5383     }
5384
5385     h->prev_mb_skipped = 0;
5386
5387     compute_mb_neighbors(h);
5388     mb_type = decode_cabac_mb_type( h );
5389     assert(mb_type >= 0);
5390
5391     if( h->slice_type_nos == FF_B_TYPE ) {
5392         if( mb_type < 23 ){
5393             partition_count= b_mb_type_info[mb_type].partition_count;
5394             mb_type=         b_mb_type_info[mb_type].type;
5395         }else{
5396             mb_type -= 23;
5397             goto decode_intra_mb;
5398         }
5399     } else if( h->slice_type_nos == FF_P_TYPE ) {
5400         if( mb_type < 5) {
5401             partition_count= p_mb_type_info[mb_type].partition_count;
5402             mb_type=         p_mb_type_info[mb_type].type;
5403         } else {
5404             mb_type -= 5;
5405             goto decode_intra_mb;
5406         }
5407     } else {
5408         if(h->slice_type == FF_SI_TYPE && mb_type)
5409             mb_type--;
5410         assert(h->slice_type_nos == FF_I_TYPE);
5411 decode_intra_mb:
5412         partition_count = 0;
5413         cbp= i_mb_type_info[mb_type].cbp;
5414         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5415         mb_type= i_mb_type_info[mb_type].type;
5416     }
5417     if(MB_FIELD)
5418         mb_type |= MB_TYPE_INTERLACED;
5419
5420     h->slice_table[ mb_xy ]= h->slice_num;
5421
5422     if(IS_INTRA_PCM(mb_type)) {
5423         const uint8_t *ptr;
5424
5425         // We assume these blocks are very rare so we do not optimize it.
5426         // FIXME The two following lines get the bitstream position in the cabac
5427         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5428         ptr= h->cabac.bytestream;
5429         if(h->cabac.low&0x1) ptr--;
5430         if(CABAC_BITS==16){
5431             if(h->cabac.low&0x1FF) ptr--;
5432         }
5433
5434         // The pixels are stored in the same order as levels in h->mb array.
5435         memcpy(h->mb, ptr, 256); ptr+=256;
5436         if(CHROMA){
5437             memcpy(h->mb+128, ptr, 128); ptr+=128;
5438         }
5439
5440         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5441
5442         // All blocks are present
5443         h->cbp_table[mb_xy] = 0x1ef;
5444         h->chroma_pred_mode_table[mb_xy] = 0;
5445         // In deblocking, the quantizer is 0
5446         s->current_picture.qscale_table[mb_xy]= 0;
5447         // All coeffs are present
5448         memset(h->non_zero_count[mb_xy], 16, 16);
5449         s->current_picture.mb_type[mb_xy]= mb_type;
5450         h->last_qscale_diff = 0;
5451         return 0;
5452     }
5453
5454     if(MB_MBAFF){
5455         h->ref_count[0] <<= 1;
5456         h->ref_count[1] <<= 1;
5457     }
5458
5459     fill_caches(h, mb_type, 0);
5460
5461     if( IS_INTRA( mb_type ) ) {
5462         int i, pred_mode;
5463         if( IS_INTRA4x4( mb_type ) ) {
5464             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5465                 mb_type |= MB_TYPE_8x8DCT;
5466                 for( i = 0; i < 16; i+=4 ) {
5467                     int pred = pred_intra_mode( h, i );
5468                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5469                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5470                 }
5471             } else {
5472                 for( i = 0; i < 16; i++ ) {
5473                     int pred = pred_intra_mode( h, i );
5474                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5475
5476                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5477                 }
5478             }
5479             write_back_intra_pred_mode(h);
5480             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5481         } else {
5482             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5483             if( h->intra16x16_pred_mode < 0 ) return -1;
5484         }
5485         if(CHROMA){
5486             h->chroma_pred_mode_table[mb_xy] =
5487             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5488
5489             pred_mode= check_intra_pred_mode( h, pred_mode );
5490             if( pred_mode < 0 ) return -1;
5491             h->chroma_pred_mode= pred_mode;
5492         }
5493     } else if( partition_count == 4 ) {
5494         int i, j, sub_partition_count[4], list, ref[2][4];
5495
5496         if( h->slice_type_nos == FF_B_TYPE ) {
5497             for( i = 0; i < 4; i++ ) {
5498                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5499                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5500                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5501             }
5502             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5503                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5504                 pred_direct_motion(h, &mb_type);
5505                 h->ref_cache[0][scan8[4]] =
5506                 h->ref_cache[1][scan8[4]] =
5507                 h->ref_cache[0][scan8[12]] =
5508                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5509                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5510                     for( i = 0; i < 4; i++ )
5511                         if( IS_DIRECT(h->sub_mb_type[i]) )
5512                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5513                 }
5514             }
5515         } else {
5516             for( i = 0; i < 4; i++ ) {
5517                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5518                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5519                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5520             }
5521         }
5522
5523         for( list = 0; list < h->list_count; list++ ) {
5524                 for( i = 0; i < 4; i++ ) {
5525                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5526                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5527                         if( h->ref_count[list] > 1 ){
5528                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5529                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5530                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5531                                 return -1;
5532                             }
5533                         }else
5534                             ref[list][i] = 0;
5535                     } else {
5536                         ref[list][i] = -1;
5537                     }
5538                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5539                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5540                 }
5541         }
5542
5543         if(dct8x8_allowed)
5544             dct8x8_allowed = get_dct8x8_allowed(h);
5545
5546         for(list=0; list<h->list_count; list++){
5547             for(i=0; i<4; i++){
5548                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5549                 if(IS_DIRECT(h->sub_mb_type[i])){
5550                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5551                     continue;
5552                 }
5553
5554                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5555                     const int sub_mb_type= h->sub_mb_type[i];
5556                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5557                     for(j=0; j<sub_partition_count[i]; j++){
5558                         int mpx, mpy;
5559                         int mx, my;
5560                         const int index= 4*i + block_width*j;
5561                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5562                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5563                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5564
5565                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5566                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5567                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5568
5569                         if(IS_SUB_8X8(sub_mb_type)){
5570                             mv_cache[ 1 ][0]=
5571                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5572                             mv_cache[ 1 ][1]=
5573                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5574
5575                             mvd_cache[ 1 ][0]=
5576                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5577                             mvd_cache[ 1 ][1]=
5578                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5579                         }else if(IS_SUB_8X4(sub_mb_type)){
5580                             mv_cache[ 1 ][0]= mx;
5581                             mv_cache[ 1 ][1]= my;
5582
5583                             mvd_cache[ 1 ][0]= mx - mpx;
5584                             mvd_cache[ 1 ][1]= my - mpy;
5585                         }else if(IS_SUB_4X8(sub_mb_type)){
5586                             mv_cache[ 8 ][0]= mx;
5587                             mv_cache[ 8 ][1]= my;
5588
5589                             mvd_cache[ 8 ][0]= mx - mpx;
5590                             mvd_cache[ 8 ][1]= my - mpy;
5591                         }
5592                         mv_cache[ 0 ][0]= mx;
5593                         mv_cache[ 0 ][1]= my;
5594
5595                         mvd_cache[ 0 ][0]= mx - mpx;
5596                         mvd_cache[ 0 ][1]= my - mpy;
5597                     }
5598                 }else{
5599                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5600                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5601                     p[0] = p[1] = p[8] = p[9] = 0;
5602                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5603                 }
5604             }
5605         }
5606     } else if( IS_DIRECT(mb_type) ) {
5607         pred_direct_motion(h, &mb_type);
5608         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5609         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5610         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5611     } else {
5612         int list, mx, my, i, mpx, mpy;
5613         if(IS_16X16(mb_type)){
5614             for(list=0; list<h->list_count; list++){
5615                 if(IS_DIR(mb_type, 0, list)){
5616                     int ref;
5617                     if(h->ref_count[list] > 1){
5618                         ref= decode_cabac_mb_ref(h, list, 0);
5619                         if(ref >= (unsigned)h->ref_count[list]){
5620                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5621                             return -1;
5622                         }
5623                     }else
5624                         ref=0;
5625                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5626                 }else
5627                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5628             }
5629             for(list=0; list<h->list_count; list++){
5630                 if(IS_DIR(mb_type, 0, list)){
5631                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5632
5633                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5634                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5635                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5636
5637                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5638                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5639                 }else
5640                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5641             }
5642         }
5643         else if(IS_16X8(mb_type)){
5644             for(list=0; list<h->list_count; list++){
5645                     for(i=0; i<2; i++){
5646                         if(IS_DIR(mb_type, i, list)){
5647                             int ref;
5648                             if(h->ref_count[list] > 1){
5649                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5650                                 if(ref >= (unsigned)h->ref_count[list]){
5651                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5652                                     return -1;
5653                                 }
5654                             }else
5655                                 ref=0;
5656                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5657                         }else
5658                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5659                     }
5660             }
5661             for(list=0; list<h->list_count; list++){
5662                 for(i=0; i<2; i++){
5663                     if(IS_DIR(mb_type, i, list)){
5664                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5665                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5666                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5667                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5668
5669                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5670                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5671                     }else{
5672                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5673                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5674                     }
5675                 }
5676             }
5677         }else{
5678             assert(IS_8X16(mb_type));
5679             for(list=0; list<h->list_count; list++){
5680                     for(i=0; i<2; i++){
5681                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5682                             int ref;
5683                             if(h->ref_count[list] > 1){
5684                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5685                                 if(ref >= (unsigned)h->ref_count[list]){
5686                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5687                                     return -1;
5688                                 }
5689                             }else
5690                                 ref=0;
5691                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5692                         }else
5693                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5694                     }
5695             }
5696             for(list=0; list<h->list_count; list++){
5697                 for(i=0; i<2; i++){
5698                     if(IS_DIR(mb_type, i, list)){
5699                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5700                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5701                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5702
5703                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5704                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5705                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5706                     }else{
5707                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5708                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5709                     }
5710                 }
5711             }
5712         }
5713     }
5714
5715    if( IS_INTER( mb_type ) ) {
5716         h->chroma_pred_mode_table[mb_xy] = 0;
5717         write_back_motion( h, mb_type );
5718    }
5719
5720     if( !IS_INTRA16x16( mb_type ) ) {
5721         cbp  = decode_cabac_mb_cbp_luma( h );
5722         if(CHROMA)
5723             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5724     }
5725
5726     h->cbp_table[mb_xy] = h->cbp = cbp;
5727
5728     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5729         if( decode_cabac_mb_transform_size( h ) )
5730             mb_type |= MB_TYPE_8x8DCT;
5731     }
5732     s->current_picture.mb_type[mb_xy]= mb_type;
5733
5734     if( cbp || IS_INTRA16x16( mb_type ) ) {
5735         const uint8_t *scan, *scan8x8, *dc_scan;
5736         const uint32_t *qmul;
5737         int dqp;
5738
5739         if(IS_INTERLACED(mb_type)){
5740             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5741             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5742             dc_scan= luma_dc_field_scan;
5743         }else{
5744             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5745             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5746             dc_scan= luma_dc_zigzag_scan;
5747         }
5748
5749         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5750         if( dqp == INT_MIN ){
5751             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5752             return -1;
5753         }
5754         s->qscale += dqp;
5755         if(((unsigned)s->qscale) > 51){
5756             if(s->qscale<0) s->qscale+= 52;
5757             else            s->qscale-= 52;
5758         }
5759         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5760         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5761
5762         if( IS_INTRA16x16( mb_type ) ) {
5763             int i;
5764             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5765             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5766
5767             if( cbp&15 ) {
5768                 qmul = h->dequant4_coeff[0][s->qscale];
5769                 for( i = 0; i < 16; i++ ) {
5770                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5771                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5772                 }
5773             } else {
5774                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5775             }
5776         } else {
5777             int i8x8, i4x4;
5778             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5779                 if( cbp & (1<<i8x8) ) {
5780                     if( IS_8x8DCT(mb_type) ) {
5781                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5782                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5783                     } else {
5784                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5785                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5786                             const int index = 4*i8x8 + i4x4;
5787                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5788 //START_TIMER
5789                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5790 //STOP_TIMER("decode_residual")
5791                         }
5792                     }
5793                 } else {
5794                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5795                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5796                 }
5797             }
5798         }
5799
5800         if( cbp&0x30 ){
5801             int c;
5802             for( c = 0; c < 2; c++ ) {
5803                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5804                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5805             }
5806         }
5807
5808         if( cbp&0x20 ) {
5809             int c, i;
5810             for( c = 0; c < 2; c++ ) {
5811                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5812                 for( i = 0; i < 4; i++ ) {
5813                     const int index = 16 + 4 * c + i;
5814                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5815                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5816                 }
5817             }
5818         } else {
5819             uint8_t * const nnz= &h->non_zero_count_cache[0];
5820             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5821             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5822         }
5823     } else {
5824         uint8_t * const nnz= &h->non_zero_count_cache[0];
5825         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5826         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5827         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5828         h->last_qscale_diff = 0;
5829     }
5830
5831     s->current_picture.qscale_table[mb_xy]= s->qscale;
5832     write_back_non_zero_count(h);
5833
5834     if(MB_MBAFF){
5835         h->ref_count[0] >>= 1;
5836         h->ref_count[1] >>= 1;
5837     }
5838
5839     return 0;
5840 }
5841
5842
5843 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5844     int i, d;
5845     const int index_a = qp + h->slice_alpha_c0_offset;
5846     const int alpha = (alpha_table+52)[index_a];
5847     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5848
5849     if( bS[0] < 4 ) {
5850         int8_t tc[4];
5851         tc[0] = (tc0_table+52)[index_a][bS[0]];
5852         tc[1] = (tc0_table+52)[index_a][bS[1]];
5853         tc[2] = (tc0_table+52)[index_a][bS[2]];
5854         tc[3] = (tc0_table+52)[index_a][bS[3]];
5855         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5856     } else {
5857         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5858     }
5859 }
5860 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5861     int i;
5862     const int index_a = qp + h->slice_alpha_c0_offset;
5863     const int alpha = (alpha_table+52)[index_a];
5864     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5865
5866     if( bS[0] < 4 ) {
5867         int8_t tc[4];
5868         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5869         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5870         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5871         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5872         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5873     } else {
5874         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5875     }
5876 }
5877
5878 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5879     int i;
5880     for( i = 0; i < 16; i++, pix += stride) {
5881         int index_a;
5882         int alpha;
5883         int beta;
5884
5885         int qp_index;
5886         int bS_index = (i >> 1);
5887         if (!MB_FIELD) {
5888             bS_index &= ~1;
5889             bS_index |= (i & 1);
5890         }
5891
5892         if( bS[bS_index] == 0 ) {
5893             continue;
5894         }
5895
5896         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5897         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5898         alpha = (alpha_table+52)[index_a];
5899         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5900
5901         if( bS[bS_index] < 4 ) {
5902             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5903             const int p0 = pix[-1];
5904             const int p1 = pix[-2];
5905             const int p2 = pix[-3];
5906             const int q0 = pix[0];
5907             const int q1 = pix[1];
5908             const int q2 = pix[2];
5909
5910             if( FFABS( p0 - q0 ) < alpha &&
5911                 FFABS( p1 - p0 ) < beta &&
5912                 FFABS( q1 - q0 ) < beta ) {
5913                 int tc = tc0;
5914                 int i_delta;
5915
5916                 if( FFABS( p2 - p0 ) < beta ) {
5917                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5918                     tc++;
5919                 }
5920                 if( FFABS( q2 - q0 ) < beta ) {
5921                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5922                     tc++;
5923                 }
5924
5925                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5926                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5927                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5928                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5929             }
5930         }else{
5931             const int p0 = pix[-1];
5932             const int p1 = pix[-2];
5933             const int p2 = pix[-3];
5934
5935             const int q0 = pix[0];
5936             const int q1 = pix[1];
5937             const int q2 = pix[2];
5938
5939             if( FFABS( p0 - q0 ) < alpha &&
5940                 FFABS( p1 - p0 ) < beta &&
5941                 FFABS( q1 - q0 ) < beta ) {
5942
5943                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5944                     if( FFABS( p2 - p0 ) < beta)
5945                     {
5946                         const int p3 = pix[-4];
5947                         /* p0', p1', p2' */
5948                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5949                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5950                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5951                     } else {
5952                         /* p0' */
5953                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5954                     }
5955                     if( FFABS( q2 - q0 ) < beta)
5956                     {
5957                         const int q3 = pix[3];
5958                         /* q0', q1', q2' */
5959                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5960                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5961                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5962                     } else {
5963                         /* q0' */
5964                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5965                     }
5966                 }else{
5967                     /* p0', q0' */
5968                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5969                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5970                 }
5971                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5972             }
5973         }
5974     }
5975 }
5976 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5977     int i;
5978     for( i = 0; i < 8; i++, pix += stride) {
5979         int index_a;
5980         int alpha;
5981         int beta;
5982
5983         int qp_index;
5984         int bS_index = i;
5985
5986         if( bS[bS_index] == 0 ) {
5987             continue;
5988         }
5989
5990         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5991         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5992         alpha = (alpha_table+52)[index_a];
5993         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5994
5995         if( bS[bS_index] < 4 ) {
5996             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
5997             const int p0 = pix[-1];
5998             const int p1 = pix[-2];
5999             const int q0 = pix[0];
6000             const int q1 = pix[1];
6001
6002             if( FFABS( p0 - q0 ) < alpha &&
6003                 FFABS( p1 - p0 ) < beta &&
6004                 FFABS( q1 - q0 ) < beta ) {
6005                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6006
6007                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6008                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6009                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6010             }
6011         }else{
6012             const int p0 = pix[-1];
6013             const int p1 = pix[-2];
6014             const int q0 = pix[0];
6015             const int q1 = pix[1];
6016
6017             if( FFABS( p0 - q0 ) < alpha &&
6018                 FFABS( p1 - p0 ) < beta &&
6019                 FFABS( q1 - q0 ) < beta ) {
6020
6021                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6022                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6023                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6024             }
6025         }
6026     }
6027 }
6028
6029 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6030     int i, d;
6031     const int index_a = qp + h->slice_alpha_c0_offset;
6032     const int alpha = (alpha_table+52)[index_a];
6033     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6034     const int pix_next  = stride;
6035
6036     if( bS[0] < 4 ) {
6037         int8_t tc[4];
6038         tc[0] = (tc0_table+52)[index_a][bS[0]];
6039         tc[1] = (tc0_table+52)[index_a][bS[1]];
6040         tc[2] = (tc0_table+52)[index_a][bS[2]];
6041         tc[3] = (tc0_table+52)[index_a][bS[3]];
6042         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6043     } else {
6044         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6045     }
6046 }
6047
6048 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6049     int i;
6050     const int index_a = qp + h->slice_alpha_c0_offset;
6051     const int alpha = (alpha_table+52)[index_a];
6052     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6053
6054     if( bS[0] < 4 ) {
6055         int8_t tc[4];
6056         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6057         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6058         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6059         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6060         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6061     } else {
6062         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6063     }
6064 }
6065
6066 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6067     MpegEncContext * const s = &h->s;
6068     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6069     int mb_xy, mb_type;
6070     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6071
6072     mb_xy = h->mb_xy;
6073
6074     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6075         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6076        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6077                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6078         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6079         return;
6080     }
6081     assert(!FRAME_MBAFF);
6082
6083     mb_type = s->current_picture.mb_type[mb_xy];
6084     qp = s->current_picture.qscale_table[mb_xy];
6085     qp0 = s->current_picture.qscale_table[mb_xy-1];
6086     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6087     qpc = get_chroma_qp( h, 0, qp );
6088     qpc0 = get_chroma_qp( h, 0, qp0 );
6089     qpc1 = get_chroma_qp( h, 0, qp1 );
6090     qp0 = (qp + qp0 + 1) >> 1;
6091     qp1 = (qp + qp1 + 1) >> 1;
6092     qpc0 = (qpc + qpc0 + 1) >> 1;
6093     qpc1 = (qpc + qpc1 + 1) >> 1;
6094     qp_thresh = 15 - h->slice_alpha_c0_offset;
6095     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6096        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6097         return;
6098
6099     if( IS_INTRA(mb_type) ) {
6100         int16_t bS4[4] = {4,4,4,4};
6101         int16_t bS3[4] = {3,3,3,3};
6102         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6103         if( IS_8x8DCT(mb_type) ) {
6104             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6105             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6106             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6107             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6108         } else {
6109             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6110             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6111             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6112             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6113             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6114             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6115             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6116             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6117         }
6118         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6119         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6120         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6121         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6122         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6123         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6124         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6125         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6126         return;
6127     } else {
6128         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6129         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6130         int edges;
6131         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6132             edges = 4;
6133             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6134         } else {
6135             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6136                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6137             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6138                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6139                              ? 3 : 0;
6140             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6141             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6142             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6143                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6144         }
6145         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6146             bSv[0][0] = 0x0004000400040004ULL;
6147         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6148             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6149
6150 #define FILTER(hv,dir,edge)\
6151         if(bSv[dir][edge]) {\
6152             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6153             if(!(edge&1)) {\
6154                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6155                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6156             }\
6157         }
6158         if( edges == 1 ) {
6159             FILTER(v,0,0);
6160             FILTER(h,1,0);
6161         } else if( IS_8x8DCT(mb_type) ) {
6162             FILTER(v,0,0);
6163             FILTER(v,0,2);
6164             FILTER(h,1,0);
6165             FILTER(h,1,2);
6166         } else {
6167             FILTER(v,0,0);
6168             FILTER(v,0,1);
6169             FILTER(v,0,2);
6170             FILTER(v,0,3);
6171             FILTER(h,1,0);
6172             FILTER(h,1,1);
6173             FILTER(h,1,2);
6174             FILTER(h,1,3);
6175         }
6176 #undef FILTER
6177     }
6178 }
6179
6180
6181 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6182     MpegEncContext * const s = &h->s;
6183     int edge;
6184     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6185     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6186     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6187     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6188     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6189
6190     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6191                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6192     // how often to recheck mv-based bS when iterating between edges
6193     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6194                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6195     // how often to recheck mv-based bS when iterating along each edge
6196     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6197
6198     if (first_vertical_edge_done) {
6199         start = 1;
6200     }
6201
6202     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6203         start = 1;
6204
6205     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6206         && !IS_INTERLACED(mb_type)
6207         && IS_INTERLACED(mbm_type)
6208         ) {
6209         // This is a special case in the norm where the filtering must
6210         // be done twice (one each of the field) even if we are in a
6211         // frame macroblock.
6212         //
6213         static const int nnz_idx[4] = {4,5,6,3};
6214         unsigned int tmp_linesize   = 2 *   linesize;
6215         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6216         int mbn_xy = mb_xy - 2 * s->mb_stride;
6217         int qp;
6218         int i, j;
6219         int16_t bS[4];
6220
6221         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6222             if( IS_INTRA(mb_type) ||
6223                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6224                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6225             } else {
6226                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6227                 for( i = 0; i < 4; i++ ) {
6228                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6229                         mbn_nnz[nnz_idx[i]] != 0 )
6230                         bS[i] = 2;
6231                     else
6232                         bS[i] = 1;
6233                 }
6234             }
6235             // Do not use s->qscale as luma quantizer because it has not the same
6236             // value in IPCM macroblocks.
6237             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6238             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6239             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6240             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6241             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6242                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6243             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6244                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6245         }
6246
6247         start = 1;
6248     }
6249
6250     /* Calculate bS */
6251     for( edge = start; edge < edges; edge++ ) {
6252         /* mbn_xy: neighbor macroblock */
6253         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6254         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6255         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6256         int16_t bS[4];
6257         int qp;
6258
6259         if( (edge&1) && IS_8x8DCT(mb_type) )
6260             continue;
6261
6262         if( IS_INTRA(mb_type) ||
6263             IS_INTRA(mbn_type) ) {
6264             int value;
6265             if (edge == 0) {
6266                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6267                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6268                 ) {
6269                     value = 4;
6270                 } else {
6271                     value = 3;
6272                 }
6273             } else {
6274                 value = 3;
6275             }
6276             bS[0] = bS[1] = bS[2] = bS[3] = value;
6277         } else {
6278             int i, l;
6279             int mv_done;
6280
6281             if( edge & mask_edge ) {
6282                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6283                 mv_done = 1;
6284             }
6285             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6286                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6287                 mv_done = 1;
6288             }
6289             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6290                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6291                 int bn_idx= b_idx - (dir ? 8:1);
6292                 int v = 0;
6293
6294                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6295                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6296                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6297                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6298                 }
6299
6300                 if(h->slice_type_nos == FF_B_TYPE && v){
6301                     v=0;
6302                     for( l = 0; !v && l < 2; l++ ) {
6303                         int ln= 1-l;
6304                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6305                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6306                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6307                     }
6308                 }
6309
6310                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6311                 mv_done = 1;
6312             }
6313             else
6314                 mv_done = 0;
6315
6316             for( i = 0; i < 4; i++ ) {
6317                 int x = dir == 0 ? edge : i;
6318                 int y = dir == 0 ? i    : edge;
6319                 int b_idx= 8 + 4 + x + 8*y;
6320                 int bn_idx= b_idx - (dir ? 8:1);
6321
6322                 if( h->non_zero_count_cache[b_idx] |
6323                     h->non_zero_count_cache[bn_idx] ) {
6324                     bS[i] = 2;
6325                 }
6326                 else if(!mv_done)
6327                 {
6328                     bS[i] = 0;
6329                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6330                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6331                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6332                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6333                             bS[i] = 1;
6334                             break;
6335                         }
6336                     }
6337
6338                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6339                         bS[i] = 0;
6340                         for( l = 0; l < 2; l++ ) {
6341                             int ln= 1-l;
6342                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6343                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6344                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6345                                 bS[i] = 1;
6346                                 break;
6347                             }
6348                         }
6349                     }
6350                 }
6351             }
6352
6353             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6354                 continue;
6355         }
6356
6357         /* Filter edge */
6358         // Do not use s->qscale as luma quantizer because it has not the same
6359         // value in IPCM macroblocks.
6360         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6361         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6362         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6363         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6364         if( dir == 0 ) {
6365             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6366             if( (edge&1) == 0 ) {
6367                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6368                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6369                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6370                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6371             }
6372         } else {
6373             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6374             if( (edge&1) == 0 ) {
6375                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6376                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6377                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6378                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6379             }
6380         }
6381     }
6382 }
6383
6384 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6385     MpegEncContext * const s = &h->s;
6386     const int mb_xy= mb_x + mb_y*s->mb_stride;
6387     const int mb_type = s->current_picture.mb_type[mb_xy];
6388     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6389     int first_vertical_edge_done = 0;
6390     int dir;
6391
6392     //for sufficiently low qp, filtering wouldn't do anything
6393     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6394     if(!FRAME_MBAFF){
6395         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6396         int qp = s->current_picture.qscale_table[mb_xy];
6397         if(qp <= qp_thresh
6398            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6399            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6400             return;
6401         }
6402     }
6403
6404     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6405     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6406         int top_type, left_type[2];
6407         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6408         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6409         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6410
6411         if(IS_8x8DCT(top_type)){
6412             h->non_zero_count_cache[4+8*0]=
6413             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6414             h->non_zero_count_cache[6+8*0]=
6415             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6416         }
6417         if(IS_8x8DCT(left_type[0])){
6418             h->non_zero_count_cache[3+8*1]=
6419             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6420         }
6421         if(IS_8x8DCT(left_type[1])){
6422             h->non_zero_count_cache[3+8*3]=
6423             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6424         }
6425
6426         if(IS_8x8DCT(mb_type)){
6427             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6428             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6429
6430             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6431             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6432
6433             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6434             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6435
6436             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6437             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6438         }
6439     }
6440
6441     if (FRAME_MBAFF
6442             // left mb is in picture
6443             && h->slice_table[mb_xy-1] != 0xFFFF
6444             // and current and left pair do not have the same interlaced type
6445             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6446             // and left mb is in the same slice if deblocking_filter == 2
6447             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6448         /* First vertical edge is different in MBAFF frames
6449          * There are 8 different bS to compute and 2 different Qp
6450          */
6451         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6452         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6453         int16_t bS[8];
6454         int qp[2];
6455         int bqp[2];
6456         int rqp[2];
6457         int mb_qp, mbn0_qp, mbn1_qp;
6458         int i;
6459         first_vertical_edge_done = 1;
6460
6461         if( IS_INTRA(mb_type) )
6462             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6463         else {
6464             for( i = 0; i < 8; i++ ) {
6465                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6466
6467                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6468                     bS[i] = 4;
6469                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6470                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6471                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6472                                                                        :
6473                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6474                     bS[i] = 2;
6475                 else
6476                     bS[i] = 1;
6477             }
6478         }
6479
6480         mb_qp = s->current_picture.qscale_table[mb_xy];
6481         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6482         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6483         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6484         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6485                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6486         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6487                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6488         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6489         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6490                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6491         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6492                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6493
6494         /* Filter edge */
6495         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6496         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6497         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6498         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6499         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6500     }
6501
6502 #ifdef CONFIG_SMALL
6503     for( dir = 0; dir < 2; dir++ )
6504         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6505 #else
6506     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6507     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6508 #endif
6509 }
6510
6511 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6512     H264Context *h = *(void**)arg;
6513     MpegEncContext * const s = &h->s;
6514     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6515
6516     s->mb_skip_run= -1;
6517
6518     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6519                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6520
6521     if( h->pps.cabac ) {
6522         int i;
6523
6524         /* realign */
6525         align_get_bits( &s->gb );
6526
6527         /* init cabac */
6528         ff_init_cabac_states( &h->cabac);
6529         ff_init_cabac_decoder( &h->cabac,
6530                                s->gb.buffer + get_bits_count(&s->gb)/8,
6531                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6532         /* calculate pre-state */
6533         for( i= 0; i < 460; i++ ) {
6534             int pre;
6535             if( h->slice_type_nos == FF_I_TYPE )
6536                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6537             else
6538                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6539
6540             if( pre <= 63 )
6541                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6542             else
6543                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6544         }
6545
6546         for(;;){
6547 //START_TIMER
6548             int ret = decode_mb_cabac(h);
6549             int eos;
6550 //STOP_TIMER("decode_mb_cabac")
6551
6552             if(ret>=0) hl_decode_mb(h);
6553
6554             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6555                 s->mb_y++;
6556
6557                 if(ret>=0) ret = decode_mb_cabac(h);
6558
6559                 if(ret>=0) hl_decode_mb(h);
6560                 s->mb_y--;
6561             }
6562             eos = get_cabac_terminate( &h->cabac );
6563
6564             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6565                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6566                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6567                 return -1;
6568             }
6569
6570             if( ++s->mb_x >= s->mb_width ) {
6571                 s->mb_x = 0;
6572                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6573                 ++s->mb_y;
6574                 if(FIELD_OR_MBAFF_PICTURE) {
6575                     ++s->mb_y;
6576                 }
6577             }
6578
6579             if( eos || s->mb_y >= s->mb_height ) {
6580                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6581                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6582                 return 0;
6583             }
6584         }
6585
6586     } else {
6587         for(;;){
6588             int ret = decode_mb_cavlc(h);
6589
6590             if(ret>=0) hl_decode_mb(h);
6591
6592             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6593                 s->mb_y++;
6594                 ret = decode_mb_cavlc(h);
6595
6596                 if(ret>=0) hl_decode_mb(h);
6597                 s->mb_y--;
6598             }
6599
6600             if(ret<0){
6601                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6602                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6603
6604                 return -1;
6605             }
6606
6607             if(++s->mb_x >= s->mb_width){
6608                 s->mb_x=0;
6609                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6610                 ++s->mb_y;
6611                 if(FIELD_OR_MBAFF_PICTURE) {
6612                     ++s->mb_y;
6613                 }
6614                 if(s->mb_y >= s->mb_height){
6615                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6616
6617                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6618                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6619
6620                         return 0;
6621                     }else{
6622                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6623
6624                         return -1;
6625                     }
6626                 }
6627             }
6628
6629             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6630                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6631                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6632                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6633
6634                     return 0;
6635                 }else{
6636                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6637
6638                     return -1;
6639                 }
6640             }
6641         }
6642     }
6643
6644 #if 0
6645     for(;s->mb_y < s->mb_height; s->mb_y++){
6646         for(;s->mb_x < s->mb_width; s->mb_x++){
6647             int ret= decode_mb(h);
6648
6649             hl_decode_mb(h);
6650
6651             if(ret<0){
6652                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6653                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6654
6655                 return -1;
6656             }
6657
6658             if(++s->mb_x >= s->mb_width){
6659                 s->mb_x=0;
6660                 if(++s->mb_y >= s->mb_height){
6661                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6662                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6663
6664                         return 0;
6665                     }else{
6666                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6667
6668                         return -1;
6669                     }
6670                 }
6671             }
6672
6673             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6674                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6675                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6676
6677                     return 0;
6678                 }else{
6679                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6680
6681                     return -1;
6682                 }
6683             }
6684         }
6685         s->mb_x=0;
6686         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6687     }
6688 #endif
6689     return -1; //not reached
6690 }
6691
6692 static int decode_picture_timing(H264Context *h){
6693     MpegEncContext * const s = &h->s;
6694     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6695         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6696         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6697     }
6698     if(h->sps.pic_struct_present_flag){
6699         unsigned int i, num_clock_ts;
6700         h->sei_pic_struct = get_bits(&s->gb, 4);
6701
6702         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6703             return -1;
6704
6705         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6706
6707         for (i = 0 ; i < num_clock_ts ; i++){
6708             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6709                 unsigned int full_timestamp_flag;
6710                 skip_bits(&s->gb, 2);                 /* ct_type */
6711                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6712                 skip_bits(&s->gb, 5);                 /* counting_type */
6713                 full_timestamp_flag = get_bits(&s->gb, 1);
6714                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6715                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6716                 skip_bits(&s->gb, 8);                 /* n_frames */
6717                 if(full_timestamp_flag){
6718                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6719                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6720                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6721                 }else{
6722                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6723                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6724                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6725                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6726                             if(get_bits(&s->gb, 1))   /* hours_flag */
6727                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6728                         }
6729                     }
6730                 }
6731                 if(h->sps.time_offset_length > 0)
6732                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6733             }
6734         }
6735     }
6736     return 0;
6737 }
6738
6739 static int decode_unregistered_user_data(H264Context *h, int size){
6740     MpegEncContext * const s = &h->s;
6741     uint8_t user_data[16+256];
6742     int e, build, i;
6743
6744     if(size<16)
6745         return -1;
6746
6747     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6748         user_data[i]= get_bits(&s->gb, 8);
6749     }
6750
6751     user_data[i]= 0;
6752     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6753     if(e==1 && build>=0)
6754         h->x264_build= build;
6755
6756     if(s->avctx->debug & FF_DEBUG_BUGS)
6757         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6758
6759     for(; i<size; i++)
6760         skip_bits(&s->gb, 8);
6761
6762     return 0;
6763 }
6764
6765 static int decode_sei(H264Context *h){
6766     MpegEncContext * const s = &h->s;
6767
6768     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6769         int size, type;
6770
6771         type=0;
6772         do{
6773             type+= show_bits(&s->gb, 8);
6774         }while(get_bits(&s->gb, 8) == 255);
6775
6776         size=0;
6777         do{
6778             size+= show_bits(&s->gb, 8);
6779         }while(get_bits(&s->gb, 8) == 255);
6780
6781         switch(type){
6782         case 1: // Picture timing SEI
6783             if(decode_picture_timing(h) < 0)
6784                 return -1;
6785             break;
6786         case 5:
6787             if(decode_unregistered_user_data(h, size) < 0)
6788                 return -1;
6789             break;
6790         default:
6791             skip_bits(&s->gb, 8*size);
6792         }
6793
6794         //FIXME check bits here
6795         align_get_bits(&s->gb);
6796     }
6797
6798     return 0;
6799 }
6800
6801 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6802     MpegEncContext * const s = &h->s;
6803     int cpb_count, i;
6804     cpb_count = get_ue_golomb(&s->gb) + 1;
6805
6806     if(cpb_count > 32U){
6807         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6808         return -1;
6809     }
6810
6811     get_bits(&s->gb, 4); /* bit_rate_scale */
6812     get_bits(&s->gb, 4); /* cpb_size_scale */
6813     for(i=0; i<cpb_count; i++){
6814         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6815         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6816         get_bits1(&s->gb);     /* cbr_flag */
6817     }
6818     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6819     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6820     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6821     sps->time_offset_length = get_bits(&s->gb, 5);
6822     return 0;
6823 }
6824
6825 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6826     MpegEncContext * const s = &h->s;
6827     int aspect_ratio_info_present_flag;
6828     unsigned int aspect_ratio_idc;
6829
6830     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6831
6832     if( aspect_ratio_info_present_flag ) {
6833         aspect_ratio_idc= get_bits(&s->gb, 8);
6834         if( aspect_ratio_idc == EXTENDED_SAR ) {
6835             sps->sar.num= get_bits(&s->gb, 16);
6836             sps->sar.den= get_bits(&s->gb, 16);
6837         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6838             sps->sar=  pixel_aspect[aspect_ratio_idc];
6839         }else{
6840             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6841             return -1;
6842         }
6843     }else{
6844         sps->sar.num=
6845         sps->sar.den= 0;
6846     }
6847 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6848
6849     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6850         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6851     }
6852
6853     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6854         get_bits(&s->gb, 3);    /* video_format */
6855         get_bits1(&s->gb);      /* video_full_range_flag */
6856         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6857             get_bits(&s->gb, 8); /* colour_primaries */
6858             get_bits(&s->gb, 8); /* transfer_characteristics */
6859             get_bits(&s->gb, 8); /* matrix_coefficients */
6860         }
6861     }
6862
6863     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6864         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6865         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6866     }
6867
6868     sps->timing_info_present_flag = get_bits1(&s->gb);
6869     if(sps->timing_info_present_flag){
6870         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6871         sps->time_scale = get_bits_long(&s->gb, 32);
6872         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6873     }
6874
6875     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6876     if(sps->nal_hrd_parameters_present_flag)
6877         if(decode_hrd_parameters(h, sps) < 0)
6878             return -1;
6879     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6880     if(sps->vcl_hrd_parameters_present_flag)
6881         if(decode_hrd_parameters(h, sps) < 0)
6882             return -1;
6883     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6884         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6885     sps->pic_struct_present_flag = get_bits1(&s->gb);
6886
6887     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6888     if(sps->bitstream_restriction_flag){
6889         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6890         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6891         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6892         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6893         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6894         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6895         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6896
6897         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6898             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6899             return -1;
6900         }
6901     }
6902
6903     return 0;
6904 }
6905
6906 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6907                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6908     MpegEncContext * const s = &h->s;
6909     int i, last = 8, next = 8;
6910     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6911     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6912         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6913     else
6914     for(i=0;i<size;i++){
6915         if(next)
6916             next = (last + get_se_golomb(&s->gb)) & 0xff;
6917         if(!i && !next){ /* matrix not written, we use the preset one */
6918             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6919             break;
6920         }
6921         last = factors[scan[i]] = next ? next : last;
6922     }
6923 }
6924
6925 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6926                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6927     MpegEncContext * const s = &h->s;
6928     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6929     const uint8_t *fallback[4] = {
6930         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6931         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6932         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6933         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6934     };
6935     if(get_bits1(&s->gb)){
6936         sps->scaling_matrix_present |= is_sps;
6937         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6938         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6939         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6940         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6941         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6942         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6943         if(is_sps || pps->transform_8x8_mode){
6944             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6945             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6946         }
6947     }
6948 }
6949
6950 static inline int decode_seq_parameter_set(H264Context *h){
6951     MpegEncContext * const s = &h->s;
6952     int profile_idc, level_idc;
6953     unsigned int sps_id;
6954     int i;
6955     SPS *sps;
6956
6957     profile_idc= get_bits(&s->gb, 8);
6958     get_bits1(&s->gb);   //constraint_set0_flag
6959     get_bits1(&s->gb);   //constraint_set1_flag
6960     get_bits1(&s->gb);   //constraint_set2_flag
6961     get_bits1(&s->gb);   //constraint_set3_flag
6962     get_bits(&s->gb, 4); // reserved
6963     level_idc= get_bits(&s->gb, 8);
6964     sps_id= get_ue_golomb(&s->gb);
6965
6966     if(sps_id >= MAX_SPS_COUNT) {
6967         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
6968         return -1;
6969     }
6970     sps= av_mallocz(sizeof(SPS));
6971     if(sps == NULL)
6972         return -1;
6973
6974     sps->profile_idc= profile_idc;
6975     sps->level_idc= level_idc;
6976
6977     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
6978     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
6979     sps->scaling_matrix_present = 0;
6980
6981     if(sps->profile_idc >= 100){ //high profile
6982         sps->chroma_format_idc= get_ue_golomb(&s->gb);
6983         if(sps->chroma_format_idc == 3)
6984             get_bits1(&s->gb);  //residual_color_transform_flag
6985         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6986         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6987         sps->transform_bypass = get_bits1(&s->gb);
6988         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6989     }else{
6990         sps->chroma_format_idc= 1;
6991     }
6992
6993     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6994     sps->poc_type= get_ue_golomb(&s->gb);
6995
6996     if(sps->poc_type == 0){ //FIXME #define
6997         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6998     } else if(sps->poc_type == 1){//FIXME #define
6999         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7000         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7001         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7002         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7003
7004         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7005             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7006             goto fail;
7007         }
7008
7009         for(i=0; i<sps->poc_cycle_length; i++)
7010             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7011     }else if(sps->poc_type != 2){
7012         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7013         goto fail;
7014     }
7015
7016     sps->ref_frame_count= get_ue_golomb(&s->gb);
7017     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7018         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7019         goto fail;
7020     }
7021     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7022     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7023     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7024     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7025        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7026         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7027         goto fail;
7028     }
7029
7030     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7031     if(!sps->frame_mbs_only_flag)
7032         sps->mb_aff= get_bits1(&s->gb);
7033     else
7034         sps->mb_aff= 0;
7035
7036     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7037
7038 #ifndef ALLOW_INTERLACE
7039     if(sps->mb_aff)
7040         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7041 #endif
7042     sps->crop= get_bits1(&s->gb);
7043     if(sps->crop){
7044         sps->crop_left  = get_ue_golomb(&s->gb);
7045         sps->crop_right = get_ue_golomb(&s->gb);
7046         sps->crop_top   = get_ue_golomb(&s->gb);
7047         sps->crop_bottom= get_ue_golomb(&s->gb);
7048         if(sps->crop_left || sps->crop_top){
7049             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7050         }
7051         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7052             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7053         }
7054     }else{
7055         sps->crop_left  =
7056         sps->crop_right =
7057         sps->crop_top   =
7058         sps->crop_bottom= 0;
7059     }
7060
7061     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7062     if( sps->vui_parameters_present_flag )
7063         decode_vui_parameters(h, sps);
7064
7065     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7066         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7067                sps_id, sps->profile_idc, sps->level_idc,
7068                sps->poc_type,
7069                sps->ref_frame_count,
7070                sps->mb_width, sps->mb_height,
7071                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7072                sps->direct_8x8_inference_flag ? "8B8" : "",
7073                sps->crop_left, sps->crop_right,
7074                sps->crop_top, sps->crop_bottom,
7075                sps->vui_parameters_present_flag ? "VUI" : "",
7076                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7077                );
7078     }
7079     av_free(h->sps_buffers[sps_id]);
7080     h->sps_buffers[sps_id]= sps;
7081     return 0;
7082 fail:
7083     av_free(sps);
7084     return -1;
7085 }
7086
7087 static void
7088 build_qp_table(PPS *pps, int t, int index)
7089 {
7090     int i;
7091     for(i = 0; i < 52; i++)
7092         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7093 }
7094
7095 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7096     MpegEncContext * const s = &h->s;
7097     unsigned int pps_id= get_ue_golomb(&s->gb);
7098     PPS *pps;
7099
7100     if(pps_id >= MAX_PPS_COUNT) {
7101         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7102         return -1;
7103     }
7104
7105     pps= av_mallocz(sizeof(PPS));
7106     if(pps == NULL)
7107         return -1;
7108     pps->sps_id= get_ue_golomb(&s->gb);
7109     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7110         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7111         goto fail;
7112     }
7113
7114     pps->cabac= get_bits1(&s->gb);
7115     pps->pic_order_present= get_bits1(&s->gb);
7116     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7117     if(pps->slice_group_count > 1 ){
7118         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7119         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7120         switch(pps->mb_slice_group_map_type){
7121         case 0:
7122 #if 0
7123 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7124 |    run_length[ i ]                                |1  |ue(v)   |
7125 #endif
7126             break;
7127         case 2:
7128 #if 0
7129 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7130 |{                                                  |   |        |
7131 |    top_left_mb[ i ]                               |1  |ue(v)   |
7132 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7133 |   }                                               |   |        |
7134 #endif
7135             break;
7136         case 3:
7137         case 4:
7138         case 5:
7139 #if 0
7140 |   slice_group_change_direction_flag               |1  |u(1)    |
7141 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7142 #endif
7143             break;
7144         case 6:
7145 #if 0
7146 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7147 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7148 |)                                                  |   |        |
7149 |    slice_group_id[ i ]                            |1  |u(v)    |
7150 #endif
7151             break;
7152         }
7153     }
7154     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7155     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7156     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7157         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7158         goto fail;
7159     }
7160
7161     pps->weighted_pred= get_bits1(&s->gb);
7162     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7163     pps->init_qp= get_se_golomb(&s->gb) + 26;
7164     pps->init_qs= get_se_golomb(&s->gb) + 26;
7165     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7166     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7167     pps->constrained_intra_pred= get_bits1(&s->gb);
7168     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7169
7170     pps->transform_8x8_mode= 0;
7171     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7172     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7173     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7174
7175     if(get_bits_count(&s->gb) < bit_length){
7176         pps->transform_8x8_mode= get_bits1(&s->gb);
7177         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7178         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7179     } else {
7180         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7181     }
7182
7183     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7184     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7185     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7186         h->pps.chroma_qp_diff= 1;
7187
7188     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7189         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7190                pps_id, pps->sps_id,
7191                pps->cabac ? "CABAC" : "CAVLC",
7192                pps->slice_group_count,
7193                pps->ref_count[0], pps->ref_count[1],
7194                pps->weighted_pred ? "weighted" : "",
7195                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7196                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7197                pps->constrained_intra_pred ? "CONSTR" : "",
7198                pps->redundant_pic_cnt_present ? "REDU" : "",
7199                pps->transform_8x8_mode ? "8x8DCT" : ""
7200                );
7201     }
7202
7203     av_free(h->pps_buffers[pps_id]);
7204     h->pps_buffers[pps_id]= pps;
7205     return 0;
7206 fail:
7207     av_free(pps);
7208     return -1;
7209 }
7210
7211 /**
7212  * Call decode_slice() for each context.
7213  *
7214  * @param h h264 master context
7215  * @param context_count number of contexts to execute
7216  */
7217 static void execute_decode_slices(H264Context *h, int context_count){
7218     MpegEncContext * const s = &h->s;
7219     AVCodecContext * const avctx= s->avctx;
7220     H264Context *hx;
7221     int i;
7222
7223     if(context_count == 1) {
7224         decode_slice(avctx, &h);
7225     } else {
7226         for(i = 1; i < context_count; i++) {
7227             hx = h->thread_context[i];
7228             hx->s.error_recognition = avctx->error_recognition;
7229             hx->s.error_count = 0;
7230         }
7231
7232         avctx->execute(avctx, (void *)decode_slice,
7233                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7234
7235         /* pull back stuff from slices to master context */
7236         hx = h->thread_context[context_count - 1];
7237         s->mb_x = hx->s.mb_x;
7238         s->mb_y = hx->s.mb_y;
7239         s->dropable = hx->s.dropable;
7240         s->picture_structure = hx->s.picture_structure;
7241         for(i = 1; i < context_count; i++)
7242             h->s.error_count += h->thread_context[i]->s.error_count;
7243     }
7244 }
7245
7246
7247 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7248     MpegEncContext * const s = &h->s;
7249     AVCodecContext * const avctx= s->avctx;
7250     int buf_index=0;
7251     H264Context *hx; ///< thread context
7252     int context_count = 0;
7253
7254     h->max_contexts = avctx->thread_count;
7255 #if 0
7256     int i;
7257     for(i=0; i<50; i++){
7258         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7259     }
7260 #endif
7261     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7262         h->current_slice = 0;
7263         if (!s->first_field)
7264             s->current_picture_ptr= NULL;
7265     }
7266
7267     for(;;){
7268         int consumed;
7269         int dst_length;
7270         int bit_length;
7271         const uint8_t *ptr;
7272         int i, nalsize = 0;
7273         int err;
7274
7275         if(h->is_avc) {
7276             if(buf_index >= buf_size) break;
7277             nalsize = 0;
7278             for(i = 0; i < h->nal_length_size; i++)
7279                 nalsize = (nalsize << 8) | buf[buf_index++];
7280             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7281                 if(nalsize == 1){
7282                     buf_index++;
7283                     continue;
7284                 }else{
7285                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7286                     break;
7287                 }
7288             }
7289         } else {
7290             // start code prefix search
7291             for(; buf_index + 3 < buf_size; buf_index++){
7292                 // This should always succeed in the first iteration.
7293                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7294                     break;
7295             }
7296
7297             if(buf_index+3 >= buf_size) break;
7298
7299             buf_index+=3;
7300         }
7301
7302         hx = h->thread_context[context_count];
7303
7304         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7305         if (ptr==NULL || dst_length < 0){
7306             return -1;
7307         }
7308         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7309             dst_length--;
7310         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7311
7312         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7313             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7314         }
7315
7316         if (h->is_avc && (nalsize != consumed)){
7317             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7318             consumed= nalsize;
7319         }
7320
7321         buf_index += consumed;
7322
7323         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7324            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7325             continue;
7326
7327       again:
7328         err = 0;
7329         switch(hx->nal_unit_type){
7330         case NAL_IDR_SLICE:
7331             if (h->nal_unit_type != NAL_IDR_SLICE) {
7332                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7333                 return -1;
7334             }
7335             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7336         case NAL_SLICE:
7337             init_get_bits(&hx->s.gb, ptr, bit_length);
7338             hx->intra_gb_ptr=
7339             hx->inter_gb_ptr= &hx->s.gb;
7340             hx->s.data_partitioning = 0;
7341
7342             if((err = decode_slice_header(hx, h)))
7343                break;
7344
7345             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7346             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7347                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7348                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7349                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7350                && avctx->skip_frame < AVDISCARD_ALL)
7351                 context_count++;
7352             break;
7353         case NAL_DPA:
7354             init_get_bits(&hx->s.gb, ptr, bit_length);
7355             hx->intra_gb_ptr=
7356             hx->inter_gb_ptr= NULL;
7357             hx->s.data_partitioning = 1;
7358
7359             err = decode_slice_header(hx, h);
7360             break;
7361         case NAL_DPB:
7362             init_get_bits(&hx->intra_gb, ptr, bit_length);
7363             hx->intra_gb_ptr= &hx->intra_gb;
7364             break;
7365         case NAL_DPC:
7366             init_get_bits(&hx->inter_gb, ptr, bit_length);
7367             hx->inter_gb_ptr= &hx->inter_gb;
7368
7369             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7370                && s->context_initialized
7371                && s->hurry_up < 5
7372                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7373                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7374                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7375                && avctx->skip_frame < AVDISCARD_ALL)
7376                 context_count++;
7377             break;
7378         case NAL_SEI:
7379             init_get_bits(&s->gb, ptr, bit_length);
7380             decode_sei(h);
7381             break;
7382         case NAL_SPS:
7383             init_get_bits(&s->gb, ptr, bit_length);
7384             decode_seq_parameter_set(h);
7385
7386             if(s->flags& CODEC_FLAG_LOW_DELAY)
7387                 s->low_delay=1;
7388
7389             if(avctx->has_b_frames < 2)
7390                 avctx->has_b_frames= !s->low_delay;
7391             break;
7392         case NAL_PPS:
7393             init_get_bits(&s->gb, ptr, bit_length);
7394
7395             decode_picture_parameter_set(h, bit_length);
7396
7397             break;
7398         case NAL_AUD:
7399         case NAL_END_SEQUENCE:
7400         case NAL_END_STREAM:
7401         case NAL_FILLER_DATA:
7402         case NAL_SPS_EXT:
7403         case NAL_AUXILIARY_SLICE:
7404             break;
7405         default:
7406             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7407         }
7408
7409         if(context_count == h->max_contexts) {
7410             execute_decode_slices(h, context_count);
7411             context_count = 0;
7412         }
7413
7414         if (err < 0)
7415             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7416         else if(err == 1) {
7417             /* Slice could not be decoded in parallel mode, copy down
7418              * NAL unit stuff to context 0 and restart. Note that
7419              * rbsp_buffer is not transferred, but since we no longer
7420              * run in parallel mode this should not be an issue. */
7421             h->nal_unit_type = hx->nal_unit_type;
7422             h->nal_ref_idc   = hx->nal_ref_idc;
7423             hx = h;
7424             goto again;
7425         }
7426     }
7427     if(context_count)
7428         execute_decode_slices(h, context_count);
7429     return buf_index;
7430 }
7431
7432 /**
7433  * returns the number of bytes consumed for building the current frame
7434  */
7435 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7436         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7437         if(pos+10>buf_size) pos=buf_size; // oops ;)
7438
7439         return pos;
7440 }
7441
7442 static int decode_frame(AVCodecContext *avctx,
7443                              void *data, int *data_size,
7444                              const uint8_t *buf, int buf_size)
7445 {
7446     H264Context *h = avctx->priv_data;
7447     MpegEncContext *s = &h->s;
7448     AVFrame *pict = data;
7449     int buf_index;
7450
7451     s->flags= avctx->flags;
7452     s->flags2= avctx->flags2;
7453
7454    /* end of stream, output what is still in the buffers */
7455     if (buf_size == 0) {
7456         Picture *out;
7457         int i, out_idx;
7458
7459 //FIXME factorize this with the output code below
7460         out = h->delayed_pic[0];
7461         out_idx = 0;
7462         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7463             if(h->delayed_pic[i]->poc < out->poc){
7464                 out = h->delayed_pic[i];
7465                 out_idx = i;
7466             }
7467
7468         for(i=out_idx; h->delayed_pic[i]; i++)
7469             h->delayed_pic[i] = h->delayed_pic[i+1];
7470
7471         if(out){
7472             *data_size = sizeof(AVFrame);
7473             *pict= *(AVFrame*)out;
7474         }
7475
7476         return 0;
7477     }
7478
7479     if(h->is_avc && !h->got_avcC) {
7480         int i, cnt, nalsize;
7481         unsigned char *p = avctx->extradata;
7482         if(avctx->extradata_size < 7) {
7483             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7484             return -1;
7485         }
7486         if(*p != 1) {
7487             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7488             return -1;
7489         }
7490         /* sps and pps in the avcC always have length coded with 2 bytes,
7491            so put a fake nal_length_size = 2 while parsing them */
7492         h->nal_length_size = 2;
7493         // Decode sps from avcC
7494         cnt = *(p+5) & 0x1f; // Number of sps
7495         p += 6;
7496         for (i = 0; i < cnt; i++) {
7497             nalsize = AV_RB16(p) + 2;
7498             if(decode_nal_units(h, p, nalsize) < 0) {
7499                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7500                 return -1;
7501             }
7502             p += nalsize;
7503         }
7504         // Decode pps from avcC
7505         cnt = *(p++); // Number of pps
7506         for (i = 0; i < cnt; i++) {
7507             nalsize = AV_RB16(p) + 2;
7508             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7509                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7510                 return -1;
7511             }
7512             p += nalsize;
7513         }
7514         // Now store right nal length size, that will be use to parse all other nals
7515         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7516         // Do not reparse avcC
7517         h->got_avcC = 1;
7518     }
7519
7520     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7521         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7522             return -1;
7523         h->got_avcC = 1;
7524     }
7525
7526     buf_index=decode_nal_units(h, buf, buf_size);
7527     if(buf_index < 0)
7528         return -1;
7529
7530     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7531         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7532         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7533         return -1;
7534     }
7535
7536     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7537         Picture *out = s->current_picture_ptr;
7538         Picture *cur = s->current_picture_ptr;
7539         int i, pics, cross_idr, out_of_order, out_idx;
7540
7541         s->mb_y= 0;
7542
7543         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7544         s->current_picture_ptr->pict_type= s->pict_type;
7545
7546         if(!s->dropable) {
7547             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7548             h->prev_poc_msb= h->poc_msb;
7549             h->prev_poc_lsb= h->poc_lsb;
7550         }
7551         h->prev_frame_num_offset= h->frame_num_offset;
7552         h->prev_frame_num= h->frame_num;
7553
7554         /*
7555          * FIXME: Error handling code does not seem to support interlaced
7556          * when slices span multiple rows
7557          * The ff_er_add_slice calls don't work right for bottom
7558          * fields; they cause massive erroneous error concealing
7559          * Error marking covers both fields (top and bottom).
7560          * This causes a mismatched s->error_count
7561          * and a bad error table. Further, the error count goes to
7562          * INT_MAX when called for bottom field, because mb_y is
7563          * past end by one (callers fault) and resync_mb_y != 0
7564          * causes problems for the first MB line, too.
7565          */
7566         if (!FIELD_PICTURE)
7567             ff_er_frame_end(s);
7568
7569         MPV_frame_end(s);
7570
7571         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7572             /* Wait for second field. */
7573             *data_size = 0;
7574
7575         } else {
7576             cur->repeat_pict = 0;
7577
7578             /* Signal interlacing information externally. */
7579             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7580             if(h->sps.pic_struct_present_flag){
7581                 switch (h->sei_pic_struct)
7582                 {
7583                 case SEI_PIC_STRUCT_FRAME:
7584                     cur->interlaced_frame = 0;
7585                     break;
7586                 case SEI_PIC_STRUCT_TOP_FIELD:
7587                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7588                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7589                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7590                     cur->interlaced_frame = 1;
7591                     break;
7592                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7593                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7594                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7595                     // From these hints, let the applications decide if they apply deinterlacing.
7596                     cur->repeat_pict = 1;
7597                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7598                     break;
7599                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7600                     // Force progressive here, as doubling interlaced frame is a bad idea.
7601                     cur->interlaced_frame = 0;
7602                     cur->repeat_pict = 2;
7603                     break;
7604                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7605                     cur->interlaced_frame = 0;
7606                     cur->repeat_pict = 4;
7607                     break;
7608                 }
7609             }else{
7610                 /* Derive interlacing flag from used decoding process. */
7611                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7612             }
7613
7614             if (cur->field_poc[0] != cur->field_poc[1]){
7615                 /* Derive top_field_first from field pocs. */
7616                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7617             }else{
7618                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7619                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7620                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7621                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7622                         cur->top_field_first = 1;
7623                     else
7624                         cur->top_field_first = 0;
7625                 }else{
7626                     /* Most likely progressive */
7627                     cur->top_field_first = 0;
7628                 }
7629             }
7630
7631         //FIXME do something with unavailable reference frames
7632
7633             /* Sort B-frames into display order */
7634
7635             if(h->sps.bitstream_restriction_flag
7636                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7637                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7638                 s->low_delay = 0;
7639             }
7640
7641             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7642                && !h->sps.bitstream_restriction_flag){
7643                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7644                 s->low_delay= 0;
7645             }
7646
7647             pics = 0;
7648             while(h->delayed_pic[pics]) pics++;
7649
7650             assert(pics <= MAX_DELAYED_PIC_COUNT);
7651
7652             h->delayed_pic[pics++] = cur;
7653             if(cur->reference == 0)
7654                 cur->reference = DELAYED_PIC_REF;
7655
7656             out = h->delayed_pic[0];
7657             out_idx = 0;
7658             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7659                 if(h->delayed_pic[i]->poc < out->poc){
7660                     out = h->delayed_pic[i];
7661                     out_idx = i;
7662                 }
7663             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7664
7665             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7666
7667             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7668                 { }
7669             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7670                || (s->low_delay &&
7671                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7672                  || cur->pict_type == FF_B_TYPE)))
7673             {
7674                 s->low_delay = 0;
7675                 s->avctx->has_b_frames++;
7676             }
7677
7678             if(out_of_order || pics > s->avctx->has_b_frames){
7679                 out->reference &= ~DELAYED_PIC_REF;
7680                 for(i=out_idx; h->delayed_pic[i]; i++)
7681                     h->delayed_pic[i] = h->delayed_pic[i+1];
7682             }
7683             if(!out_of_order && pics > s->avctx->has_b_frames){
7684                 *data_size = sizeof(AVFrame);
7685
7686                 h->outputed_poc = out->poc;
7687                 *pict= *(AVFrame*)out;
7688             }else{
7689                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7690             }
7691         }
7692     }
7693
7694     assert(pict->data[0] || !*data_size);
7695     ff_print_debug_info(s, pict);
7696 //printf("out %d\n", (int)pict->data[0]);
7697 #if 0 //?
7698
7699     /* Return the Picture timestamp as the frame number */
7700     /* we subtract 1 because it is added on utils.c     */
7701     avctx->frame_number = s->picture_number - 1;
7702 #endif
7703     return get_consumed_bytes(s, buf_index, buf_size);
7704 }
#if 0
/**
 * Disabled code: fill h->mb_avail[] for the current macroblock.
 *
 * Entries 0..2 and 3 are set when the macroblock at the corresponding
 * offset in h->slice_table (row above: -1, 0, +1; same row: -1) carries
 * h->slice_num; entries 4 and 5 are constant.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        const int above_xy= mb_xy - s->mb_stride;

        h->mb_avail[0]= s->mb_x                 && h->slice_table[above_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[above_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[above_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7724
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and COUNT signed exp-golomb codes into a buffer,
 * reads them back and prints every mismatch. The 4x4 DCT, quantizer and
 * NAL layer tests are disabled with #if 0.
 *
 * @return 0
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written was i - COUNT/2, so report that as expected */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7899
7900
7901 static av_cold int decode_end(AVCodecContext *avctx)
7902 {
7903     H264Context *h = avctx->priv_data;
7904     MpegEncContext *s = &h->s;
7905     int i;
7906
7907     av_freep(&h->rbsp_buffer[0]);
7908     av_freep(&h->rbsp_buffer[1]);
7909     free_tables(h); //FIXME cleanup init stuff perhaps
7910
7911     for(i = 0; i < MAX_SPS_COUNT; i++)
7912         av_freep(h->sps_buffers + i);
7913
7914     for(i = 0; i < MAX_PPS_COUNT; i++)
7915         av_freep(h->pps_buffers + i);
7916
7917     MPV_common_end(s);
7918
7919 //    memset(h, 0, sizeof(H264Context));
7920
7921     return 0;
7922 }
7923
7924
7925 AVCodec h264_decoder = {
7926     "h264",
7927     CODEC_TYPE_VIDEO,
7928     CODEC_ID_H264,
7929     sizeof(H264Context),
7930     decode_init,
7931     NULL,
7932     decode_end,
7933     decode_frame,
7934     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7935     .flush= flush_dpb,
7936     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7937 };
7938
7939 #include "svq3.c"