git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /*
      * File-scope VLC handles. Each handle (or array of handles) is followed by
      * a statically sized VLC_TYPE array and a constant holding that array's
      * element count. The four coeff_token tables share a single array whose
      * length is the sum of the sizes listed in coeff_token_vlc_tables_size.
      * NOTE(review): presumably these arrays are the backing storage handed to
      * the VLC initialisation code, which is not visible in this part of the
      * file - confirm.
      */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
     /* Forward declarations; the definitions are not visible in this part of the file. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 /**
  * Fills the neighbour caches of h for the macroblock at h->mb_xy.
  *
  * The function first resolves the addresses of the top, top-left, top-right
  * and left neighbours (adjusted for frame/field macroblock pairs when
  * FRAME_MBAFF is set) and stores the top and left ones in h->top_mb_xy and
  * h->left_mb_xy[]. It then loads, as applicable, intra sample availability,
  * intra4x4 prediction modes, non-zero counts, coded block patterns, motion
  * vectors, reference indices, mvd values and direct flags from those
  * neighbours into the corresponding *_cache members of h.
  *
  * @param h           decoder context; its cache members are overwritten
  * @param mb_type     type of the current macroblock
  * @param for_deblock non-zero when called for the loop filter: neighbours of
  *                    other slices are then accepted, the intra availability
  *                    part is skipped, only top/left motion data is loaded, and
  *                    the call may return early without touching the caches
  */
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
         // Top neighbour: step back by the MB stride shifted by FIELD_PICTURE
         // (presumably 0 or 1, i.e. one or two MB rows - confirm).
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
         // Deblocking shortcut: outside MBAFF frames nothing is refreshed when
         // h->slice_num is 1 or the top neighbour lies in the same slice.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
         // Default neighbour addresses; the MBAFF branch below adjusts them.
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
         // MBAFF: neighbours are addressed through macroblock pairs, and the row
         // to use depends on whether the current MB and each neighbouring pair
         // are frame or field coded.
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             // Left pair coded differently from the current MB: start from the
             // pair's top MB and pick the matching row-remapping table.
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
         // Publish the resolved top/left neighbour addresses in the context.
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
             // Loop filter: only top/left are looked at, and a neighbour is
             // accepted whenever its slice_table entry is below 0xFFFF, whichever
             // slice it belongs to.
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values were changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
             // Decoding: a neighbour is usable only if it belongs to the current slice.
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
         // Intra MB: derive the *_samples_available bitmasks. With
         // constrained_intra_pred the neighbour type is masked with IS_INTRA(-1)
         // (presumably leaving only the intra type bits - confirm); otherwise any
         // non-zero neighbour type counts as available.
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
             // Intra4x4: import the neighbours' prediction modes. A neighbour that
             // is not intra4x4 contributes 2 when it passes type_mask and -1 when
             // it does not.
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // Non-zero counts of the top neighbour go into cache rows 0 and 3. If
         // there is no top neighbour the entries are set to 0 for non-intra MBs
         // under CABAC and to 64 otherwise.
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
         // Same for the two left neighbours; left_block selects which of the
         // neighbour's entries feed each cache position.
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
         // CABAC only: cache the neighbours' coded block patterns. A missing
         // neighbour counts as 0x1C0 for intra MBs and as 0 otherwise.
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
         // Inter / direct MB: load the neighbours' motion vectors and reference
         // indices for every reference list in use.
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
                 // Top neighbour: its last row of motion vectors (3*b_stride) and
                 // of reference indices (b8_stride) fills the cache row above the MB.
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
                 // Left neighbours: their last column (+3 / +1), with the rows
                 // chosen through left_block, fills the cache column left of the MB.
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
                 // The loop filter, and (outside MBAFF frames) direct MBs that do not
                 // use spatial direct prediction, need nothing beyond top and left.
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
                 // Skipped and direct MBs outside MBAFF frames need no further setup.
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
                 // Preset a few fixed cache positions as unavailable with zero motion
                 // (the exact positions follow from scan8, which is defined elsewhere).
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
                     // B slices: clear the MB's own direct flags, then cache those of
                     // the top and left neighbours.
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
                 // MBAFF: rescale neighbour data whose frame/field coding differs
                 // from the current MB. MAP_MVS applies MAP_F2F to every neighbour
                 // cache slot loaded above.
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
                         // Field MB: for non-interlaced neighbours double the reference
                         // index and halve the vertical mv/mvd component.
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
                         // Frame MB: for interlaced neighbours halve the reference index
                         // and double the vertical mv/mvd component.
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
         // Number (0..2) of the top and first left neighbours that use the 8x8 transform.
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
/**
 * Writes the motion data of the current macroblock from the per-macroblock
 * caches back into the picture-wide tables.
 * Copies h->mv_cache / h->ref_cache into current_picture.motion_val /
 * ref_index, h->mvd_cache into h->mvd_table when pps.cabac is set, and
 * updates h->direct_table for 8x8 macroblocks of B slices when pps.cabac is set.
 * @param mb_type type of the current macroblock, tested with USES_LIST(),
 *                IS_SKIP() and IS_8X8()
 */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;    // index of this macroblock in the 4x4-block tables
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;   // index of this macroblock in the 8x8-block tables
    int list;

    /* list 0 unused: mark all four ref_index entries of this macroblock as LIST_NOT_USED */
    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* copy 4 rows; each row is written with two 64-bit stores (columns 0-1 and 2-3) */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            /* skipped macroblocks get an all-zero 4x4 mvd area, others get the cached values */
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
            for(y=0; y<4; y++){
                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
            }
        }

        {
            /* one reference index per 8x8 block, taken from blocks 0, 4, 8 and 12 of the cache */
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
        }
    }

    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            /* only the entries of sub-macroblocks 1..3 are written here; entry 0 is left untouched */
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled: everything between this #if 0 and the matching #endif is not compiled.
 * The body relies on the `stride` macro that is defined inside
 * h264_luma_dc_dequant_idct_c() and only #undef'ed after this block.
 * @param qp quantization parameter ??? FIXME (commented out in the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies on the input values, results go to temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: butterflies across temp[], results are halved and stored back */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Disabled: everything between this #if 0 and the matching #endif is not compiled.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    /* load the four dc values */
    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* butterflies on each row; a and c are reused for the sums, e and b hold the differences */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
/**
 * Motion compensates one partition from a single reference picture.
 * The luma block is produced by qpix_op[], the two chroma blocks by chroma_op.
 * @param pic           reference picture to predict from
 * @param n             block index; scan8[n] selects the vector in h->mv_cache[list]
 * @param square        if zero, the luma function is called a second time at offset delta
 * @param chroma_height height passed to chroma_op
 * @param delta         offset of the second luma call in dest_y and in the source
 * @param list          reference list whose motion vector is used
 * @param dest_y        luma destination
 * @param dest_cb       Cb destination
 * @param dest_cr       Cr destination
 * @param src_x_offset  horizontal block position; multiplied by 8 and added to the vector
 * @param src_y_offset  vertical block position; multiplied by 8 and added to the vector
 * @param qpix_op       luma functions, indexed by the fractional part of the vector
 * @param chroma_op     chroma function, receives the fractional part as mx&7 / my&7
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);   // fractional x in bits 0-1, fractional y in bits 2-3
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;   // integer part of the luma position
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* reduce the tolerated margin when the vector has a fractional part
     * (presumably because interpolation reads pixels around the block) */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    /* block reaches outside picture plus margin: build the source in edge_emu_buffer */
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        /* non square partition: second call covers the other half */
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* gray-only decoding: chroma is not predicted at all */
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* Cb and Cr share edge_emu_buffer, so Cb has to be consumed by chroma_op
     * before the buffer is refilled for Cr */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1647
1648 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1649                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1650                            int x_offset, int y_offset,
1651                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1652                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1653                            int list0, int list1){
1654     MpegEncContext * const s = &h->s;
1655     qpel_mc_func *qpix_op=  qpix_put;
1656     h264_chroma_mc_func chroma_op= chroma_put;
1657
1658     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1659     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1660     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1661     x_offset += 8*s->mb_x;
1662     y_offset += 8*(s->mb_y >> MB_FIELD);
1663
1664     if(list0){
1665         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1666         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1667                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1668                            qpix_op, chroma_op);
1669
1670         qpix_op=  qpix_avg;
1671         chroma_op= chroma_avg;
1672     }
1673
1674     if(list1){
1675         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679     }
1680 }
1681
/**
 * Weighted motion compensation of one partition.
 * With both lists in use, list 0 is predicted into the destination, list 1
 * into s->obmc_scratchpad, and the two are combined by the biweight functions.
 * With a single list, the prediction is written to the destination and then
 * weighted in place.
 * @param luma_weight_op    in-place weighting of the luma block (single list)
 * @param chroma_weight_op  in-place weighting of a chroma block (single list)
 * @param luma_weight_avg   weighted combination of two luma blocks (both lists)
 * @param chroma_weight_avg weighted combination of two chroma blocks (both lists)
 * @param list0             non-zero if the partition is predicted from list 0
 * @param list1             non-zero if the partition is predicted from list 1
 */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    /* move the destinations to the partition and make the offsets picture relative */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        /* scratch layout: Cb at the start, Cr 8 bytes later, luma after 8 chroma lines */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* use_weight 2: weights come from h->implicit_weight and sum to 64,
             * the log2 denominator passed is 5 and the offset is 0 */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            /* otherwise: per reference weights and offsets from the h->luma_* / h->chroma_* tables;
             * the offsets of both references are added */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        }
    }else{
        /* single list: predict, then weight the result in place */
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        /* chroma is only weighted when h->use_weight_chroma is set */
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
        }
    }
}
1748
1749 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1750                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1751                            int x_offset, int y_offset,
1752                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1753                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1754                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1755                            int list0, int list1){
1756     if((h->use_weight==2 && list0 && list1
1757         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1758        || h->use_weight==1)
1759         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1760                          x_offset, y_offset, qpix_put, chroma_put,
1761                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1762     else
1763         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1764                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1765 }
1766
1767 static inline void prefetch_motion(H264Context *h, int list){
1768     /* fetch pixels for estimated mv 4 macroblocks ahead
1769      * optimized for 64byte cache lines */
1770     MpegEncContext * const s = &h->s;
1771     const int refn = h->ref_cache[list][scan8[0]];
1772     if(refn >= 0){
1773         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1774         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1775         uint8_t **src= h->ref_list[list][refn].data;
1776         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1777         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1778         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1779         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1780     }
1781 }
1782
/**
 * Performs the inter prediction of the current macroblock.
 * Dispatches on the partitioning of the macroblock (16x16, 16x8, 8x16 or
 * 8x8 with sub partitions) and calls mc_part() once per partition.
 * The mc_part() arguments after h are: block index n, square flag,
 * chroma height, delta, the three destinations and the x/y offset of the
 * partition inside the macroblock.
 * @param qpix_put   luma put functions, first index selects the block size
 * @param chroma_put chroma put functions, index selects the block size
 * @param qpix_avg   luma avg functions, indexed like qpix_put
 * @param chroma_avg chroma avg functions, indexed like chroma_put
 * @param weight_op  weight functions, index selects the partition shape
 * @param weight_avg biweight functions, indexed like weight_op
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one partition covering the whole macroblock */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions: block 0 at y offset 0 and block 8 at y offset 4 */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions: block 0 at x offset 0 and block 4 at x offset 4 */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 blocks, each with its own sub partitioning */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;               // first 4x4 block of this 8x8 block
            int x_offset= (i&1)<<2;         // 0 or 4
            int y_offset= (i&2)<<1;         // 0 or 4

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                /* blocks n and n+2, the second one 2 units further down */
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                /* blocks n and n+1, the second one 2 units further right */
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks n..n+3 */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1869
1870 static av_cold void decode_init_vlc(void){
1871     static int done = 0;
1872
1873     if (!done) {
1874         int i;
1875         int offset;
1876         done = 1;
1877
1878         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1879         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1880         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1881                  &chroma_dc_coeff_token_len [0], 1, 1,
1882                  &chroma_dc_coeff_token_bits[0], 1, 1,
1883                  INIT_VLC_USE_NEW_STATIC);
1884
1885         offset = 0;
1886         for(i=0; i<4; i++){
1887             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1888             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1889             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1890                      &coeff_token_len [i][0], 1, 1,
1891                      &coeff_token_bits[i][0], 1, 1,
1892                      INIT_VLC_USE_NEW_STATIC);
1893             offset += coeff_token_vlc_tables_size[i];
1894         }
1895         /*
1896          * This is a one time safety check to make sure that
1897          * the packed static coeff_token_vlc table sizes
1898          * were initialized correctly.
1899          */
1900         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1901
1902         for(i=0; i<3; i++){
1903             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1904             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1905             init_vlc(&chroma_dc_total_zeros_vlc[i],
1906                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1907                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1908                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1909                      INIT_VLC_USE_NEW_STATIC);
1910         }
1911         for(i=0; i<15; i++){
1912             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1913             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1914             init_vlc(&total_zeros_vlc[i],
1915                      TOTAL_ZEROS_VLC_BITS, 16,
1916                      &total_zeros_len [i][0], 1, 1,
1917                      &total_zeros_bits[i][0], 1, 1,
1918                      INIT_VLC_USE_NEW_STATIC);
1919         }
1920
1921         for(i=0; i<6; i++){
1922             run_vlc[i].table = run_vlc_tables[i];
1923             run_vlc[i].table_allocated = run_vlc_tables_size;
1924             init_vlc(&run_vlc[i],
1925                      RUN_VLC_BITS, 7,
1926                      &run_len [i][0], 1, 1,
1927                      &run_bits[i][0], 1, 1,
1928                      INIT_VLC_USE_NEW_STATIC);
1929         }
1930         run7_vlc.table = run7_vlc_table,
1931         run7_vlc.table_allocated = run7_vlc_table_size;
1932         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1933                  &run_len [6][0], 1, 1,
1934                  &run_bits[6][0], 1, 1,
1935                  INIT_VLC_USE_NEW_STATIC);
1936     }
1937 }
1938
1939 static void free_tables(H264Context *h){
1940     int i;
1941     H264Context *hx;
1942     av_freep(&h->intra4x4_pred_mode);
1943     av_freep(&h->chroma_pred_mode_table);
1944     av_freep(&h->cbp_table);
1945     av_freep(&h->mvd_table[0]);
1946     av_freep(&h->mvd_table[1]);
1947     av_freep(&h->direct_table);
1948     av_freep(&h->non_zero_count);
1949     av_freep(&h->slice_table_base);
1950     h->slice_table= NULL;
1951
1952     av_freep(&h->mb2b_xy);
1953     av_freep(&h->mb2b8_xy);
1954
1955     for(i = 0; i < h->s.avctx->thread_count; i++) {
1956         hx = h->thread_context[i];
1957         if(!hx) continue;
1958         av_freep(&hx->top_borders[1]);
1959         av_freep(&hx->top_borders[0]);
1960         av_freep(&hx->s.obmc_scratchpad);
1961     }
1962 }
1963
1964 static void init_dequant8_coeff_table(H264Context *h){
1965     int i,q,x;
1966     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1967     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1968     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1969
1970     for(i=0; i<2; i++ ){
1971         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1972             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1973             break;
1974         }
1975
1976         for(q=0; q<52; q++){
1977             int shift = div6[q];
1978             int idx = rem6[q];
1979             for(x=0; x<64; x++)
1980                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1981                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1982                     h->pps.scaling_matrix8[i][x]) << shift;
1983         }
1984     }
1985 }
1986
1987 static void init_dequant4_coeff_table(H264Context *h){
1988     int i,j,q,x;
1989     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1990     for(i=0; i<6; i++ ){
1991         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1992         for(j=0; j<i; j++){
1993             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1994                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1995                 break;
1996             }
1997         }
1998         if(j<i)
1999             continue;
2000
2001         for(q=0; q<52; q++){
2002             int shift = div6[q] + 2;
2003             int idx = rem6[q];
2004             for(x=0; x<16; x++)
2005                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2006                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2007                     h->pps.scaling_matrix4[i][x]) << shift;
2008         }
2009     }
2010 }
2011
2012 static void init_dequant_tables(H264Context *h){
2013     int i,x;
2014     init_dequant4_coeff_table(h);
2015     if(h->pps.transform_8x8_mode)
2016         init_dequant8_coeff_table(h);
2017     if(h->sps.transform_bypass){
2018         for(i=0; i<6; i++)
2019             for(x=0; x<16; x++)
2020                 h->dequant4_coeff[i][0][x] = 1<<6;
2021         if(h->pps.transform_8x8_mode)
2022             for(i=0; i<2; i++)
2023                 for(x=0; x<64; x++)
2024                     h->dequant8_coeff[i][0][x] = 1<<6;
2025     }
2026 }
2027
2028
/**
 * Allocates the per-stream tables of the context.
 * Needs width/height: the sizes are derived from s->mb_stride, s->mb_height,
 * s->mb_width and the block strides h->b_stride / h->b8_stride.
 * On failure everything allocated so far is released through free_tables().
 * NOTE(review): CHECKED_ALLOCZ is defined outside this function; presumably it
 * allocates zeroed memory and jumps to the `fail` label on error - confirm.
 * @return 0 on success, -1 on failure
 */
static int alloc_tables(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int big_mb_num= s->mb_stride * (s->mb_height+1);   // one macroblock row more than the picture has
    int x,y;

    CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))

    CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
    CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))

    CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
    CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
    CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));

    /* set every byte of the slice table to 0xFF; slice_table itself starts
     * 2*mb_stride+1 entries into the allocation */
    memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
    h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;

    CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
    CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
    /* precompute, for every macroblock, the index of its first 4x4 block
     * (mb2b_xy) and of its first 8x8 block (mb2b8_xy) */
    for(y=0; y<s->mb_height; y++){
        for(x=0; x<s->mb_width; x++){
            const int mb_xy= x + y*s->mb_stride;
            const int b_xy = 4*x + 4*y*h->b_stride;
            const int b8_xy= 2*x + 2*y*h->b8_stride;

            h->mb2b_xy [mb_xy]= b_xy;
            h->mb2b8_xy[mb_xy]= b8_xy;
        }
    }

    s->obmc_scratchpad = NULL;

    /* the dequant tables are only built if they have not been set up yet */
    if(!h->dequant4_coeff[0])
        init_dequant_tables(h);

    return 0;
fail:
    free_tables(h);
    return -1;
}
2075
2076 /**
2077  * Mimic alloc_tables(), but for every context thread.
2078  */
2079 static void clone_tables(H264Context *dst, H264Context *src){
2080     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2081     dst->non_zero_count           = src->non_zero_count;
2082     dst->slice_table              = src->slice_table;
2083     dst->cbp_table                = src->cbp_table;
2084     dst->mb2b_xy                  = src->mb2b_xy;
2085     dst->mb2b8_xy                 = src->mb2b8_xy;
2086     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2087     dst->mvd_table[0]             = src->mvd_table[0];
2088     dst->mvd_table[1]             = src->mvd_table[1];
2089     dst->direct_table             = src->direct_table;
2090
2091     dst->s.obmc_scratchpad = NULL;
2092     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2093 }
2094
2095 /**
2096  * Init context
2097  * Allocate buffers which are not shared amongst multiple threads.
2098  */
2099 static int context_init(H264Context *h){
2100     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2102
2103     return 0;
2104 fail:
2105     return -1; // free_tables will clean up for us
2106 }
2107
2108 static av_cold void common_init(H264Context *h){
2109     MpegEncContext * const s = &h->s;
2110
2111     s->width = s->avctx->width;
2112     s->height = s->avctx->height;
2113     s->codec_id= s->avctx->codec->id;
2114
2115     ff_h264_pred_init(&h->hpc, s->codec_id);
2116
2117     h->dequant_coeff_pps= -1;
2118     s->unrestricted_mv=1;
2119     s->decode=1; //FIXME
2120
2121     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2122     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2123 }
2124
2125 static av_cold int decode_init(AVCodecContext *avctx){
2126     H264Context *h= avctx->priv_data;
2127     MpegEncContext * const s = &h->s;
2128
2129     MPV_decode_defaults(s);
2130
2131     s->avctx = avctx;
2132     common_init(h);
2133
2134     s->out_format = FMT_H264;
2135     s->workaround_bugs= avctx->workaround_bugs;
2136
2137     // set defaults
2138 //    s->decode_mb= ff_h263_decode_mb;
2139     s->quarter_sample = 1;
2140     s->low_delay= 1;
2141
2142     if(avctx->codec_id == CODEC_ID_SVQ3)
2143         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2144     else
2145         avctx->pix_fmt= PIX_FMT_YUV420P;
2146
2147     decode_init_vlc();
2148
2149     if(avctx->extradata_size > 0 && avctx->extradata &&
2150        *(char *)avctx->extradata == 1){
2151         h->is_avc = 1;
2152         h->got_avcC = 0;
2153     } else {
2154         h->is_avc = 0;
2155     }
2156
2157     h->thread_context[0] = h;
2158     h->outputed_poc = INT_MIN;
2159     h->prev_poc_msb= 1<<16;
2160     return 0;
2161 }
2162
2163 static int frame_start(H264Context *h){
2164     MpegEncContext * const s = &h->s;
2165     int i;
2166
2167     if(MPV_frame_start(s, s->avctx) < 0)
2168         return -1;
2169     ff_er_frame_start(s);
2170     /*
2171      * MPV_frame_start uses pict_type to derive key_frame.
2172      * This is incorrect for H.264; IDR markings must be used.
2173      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2174      * See decode_nal_units().
2175      */
2176     s->current_picture_ptr->key_frame= 0;
2177
2178     assert(s->linesize && s->uvlinesize);
2179
2180     for(i=0; i<16; i++){
2181         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2182         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2183     }
2184     for(i=0; i<4; i++){
2185         h->block_offset[16+i]=
2186         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2187         h->block_offset[24+16+i]=
2188         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2189     }
2190
2191     /* can't be in alloc_tables because linesize isn't known there.
2192      * FIXME: redo bipred weight to not require extra buffer? */
2193     for(i = 0; i < s->avctx->thread_count; i++)
2194         if(!h->thread_context[i]->s.obmc_scratchpad)
2195             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2196
2197     /* some macroblocks will be accessed before they're available */
2198     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2199         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2200
2201 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2202
2203     // We mark the current picture as non-reference after allocating it, so
2204     // that if we break out due to an error it can be released automatically
2205     // in the next MPV_frame_start().
2206     // SVQ3 as well as most other codecs have only last/next/current and thus
2207     // get released even with set reference, besides SVQ3 and others do not
2208     // mark frames as reference later "naturally".
2209     if(s->codec_id != CODEC_ID_SVQ3)
2210         s->current_picture_ptr->reference= 0;
2211
2212     s->current_picture_ptr->field_poc[0]=
2213     s->current_picture_ptr->field_poc[1]= INT_MAX;
2214     assert(s->current_picture_ptr->long_ref==0);
2215
2216     return 0;
2217 }
2218
/**
 * Saves border pixels of the macroblock at src_y/src_cb/src_cr into
 * h->left_border and h->top_borders.
 * Relative to the pointers passed in, luma column 15 (rows 0..15) and luma
 * row 15 are stored, plus chroma column 7 and chroma row 7 unless gray-only
 * decoding is active. The previous content of the top border (its last
 * pixel) is moved into the first left_border slot before being overwritten.
 * @param simple when nonzero the MBAFF specific handling is skipped
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;
    int step    = 1; // distance between consecutive left_border entries
    int offset  = 1; // first luma left_border entry to write
    int uvoffset= 1; // first chroma left_border entry to write (before the +34 / +34+18 bases)
    int top_idx = 1; // which of the two top border buffers is used
    int skiplast= 0; // when 1, the last row is not copied into left_border

    // step back one line so that row i of the macroblock is at index i+1 below
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
            if(!MB_MBAFF){
                // additionally store the second to last luma/chroma row in top_borders[0]
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                }
            }
        }else{
            if(!MB_MBAFF){
                h->left_border[0]= h->top_borders[0][s->mb_x][15];
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
                    h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                }
                skiplast= 1;
            }
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
    for(i=1; i<17 - skiplast; i++){
        h->left_border[offset+i*step]= src_y[15+i*  linesize];
    }

    // last luma row of the macroblock, copied as two 8 byte words
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        // Cb entries start at left_border[34], Cr entries at left_border[34+18]
        h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
        h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
        for(i=1; i<9 - skiplast; i++){
            h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
            h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
        }
        // last chroma rows
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
2281
/**
 * Exchanges the pixels left of and above the macroblock at
 * src_y/src_cb/src_cr with the values saved in h->left_border and
 * h->top_borders.
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction
 * and with xchg=0 afterwards.
 * @param xchg   when zero, most exchanges only copy the saved-buffer value
 *               into the picture and leave the saved buffer unchanged; the
 *               exchanges hard-coded with 1 below always swap
 * @param simple when nonzero the MBAFF specific handling is skipped
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left; // nonzero if the left border is to be exchanged
    int deblock_top;  // nonzero if the top border is to be exchanged
    int mb_xy;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;

    // same left_border/top_borders addressing as used by backup_mb_border()
    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
        }else{
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    if(h->deblocking_filter == 2) {
        // only neighbours with the same slice_table entry take part
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        deblock_left = (s->mb_x > 0);
        deblock_top =  (s->mb_y > !!MB_FIELD);
    }

    // move to the pixel diagonally above-left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// b always receives the old value of a; a receives b only if xchg is nonzero
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // start at the corner pixel only if the top border is exchanged too
        for(i = !deblock_top; i<16; i++){
            XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
        }
        // here i == 16: the last row is always exchanged both ways
        XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            // first 8 pixels above the macroblock to the right
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<8; i++){
                XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
            }
            // here i == 8: the last chroma row is always exchanged both ways
            XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
            XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2355
2356 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2357     MpegEncContext * const s = &h->s;
2358     const int mb_x= s->mb_x;
2359     const int mb_y= s->mb_y;
2360     const int mb_xy= h->mb_xy;
2361     const int mb_type= s->current_picture.mb_type[mb_xy];
2362     uint8_t  *dest_y, *dest_cb, *dest_cr;
2363     int linesize, uvlinesize /*dct_offset*/;
2364     int i;
2365     int *block_offset = &h->block_offset[0];
2366     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2367     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2368     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2369     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2370
2371     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2372     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2373     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2374
2375     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2376     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2377
2378     if (!simple && MB_FIELD) {
2379         linesize   = h->mb_linesize   = s->linesize * 2;
2380         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2381         block_offset = &h->block_offset[24];
2382         if(mb_y&1){ //FIXME move out of this function?
2383             dest_y -= s->linesize*15;
2384             dest_cb-= s->uvlinesize*7;
2385             dest_cr-= s->uvlinesize*7;
2386         }
2387         if(FRAME_MBAFF) {
2388             int list;
2389             for(list=0; list<h->list_count; list++){
2390                 if(!USES_LIST(mb_type, list))
2391                     continue;
2392                 if(IS_16X16(mb_type)){
2393                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2394                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2395                 }else{
2396                     for(i=0; i<16; i+=4){
2397                         int ref = h->ref_cache[list][scan8[i]];
2398                         if(ref >= 0)
2399                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2400                     }
2401                 }
2402             }
2403         }
2404     } else {
2405         linesize   = h->mb_linesize   = s->linesize;
2406         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2407 //        dct_offset = s->linesize * 16;
2408     }
2409
2410     if (!simple && IS_INTRA_PCM(mb_type)) {
2411         for (i=0; i<16; i++) {
2412             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2413         }
2414         for (i=0; i<8; i++) {
2415             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2416             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2417         }
2418     } else {
2419         if(IS_INTRA(mb_type)){
2420             if(h->deblocking_filter)
2421                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2422
2423             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2424                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2425                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2426             }
2427
2428             if(IS_INTRA4x4(mb_type)){
2429                 if(simple || !s->encoding){
2430                     if(IS_8x8DCT(mb_type)){
2431                         if(transform_bypass){
2432                             idct_dc_add =
2433                             idct_add    = s->dsp.add_pixels8;
2434                         }else{
2435                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2436                             idct_add    = s->dsp.h264_idct8_add;
2437                         }
2438                         for(i=0; i<16; i+=4){
2439                             uint8_t * const ptr= dest_y + block_offset[i];
2440                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2441                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2442                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2443                             }else{
2444                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2445                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2446                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2447                                 if(nnz){
2448                                     if(nnz == 1 && h->mb[i*16])
2449                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2450                                     else
2451                                         idct_add   (ptr, h->mb + i*16, linesize);
2452                                 }
2453                             }
2454                         }
2455                     }else{
2456                         if(transform_bypass){
2457                             idct_dc_add =
2458                             idct_add    = s->dsp.add_pixels4;
2459                         }else{
2460                             idct_dc_add = s->dsp.h264_idct_dc_add;
2461                             idct_add    = s->dsp.h264_idct_add;
2462                         }
2463                         for(i=0; i<16; i++){
2464                             uint8_t * const ptr= dest_y + block_offset[i];
2465                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2466
2467                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2468                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2469                             }else{
2470                                 uint8_t *topright;
2471                                 int nnz, tr;
2472                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2473                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2474                                     assert(mb_y || linesize <= block_offset[i]);
2475                                     if(!topright_avail){
2476                                         tr= ptr[3 - linesize]*0x01010101;
2477                                         topright= (uint8_t*) &tr;
2478                                     }else
2479                                         topright= ptr + 4 - linesize;
2480                                 }else
2481                                     topright= NULL;
2482
2483                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2484                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2485                                 if(nnz){
2486                                     if(is_h264){
2487                                         if(nnz == 1 && h->mb[i*16])
2488                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2489                                         else
2490                                             idct_add   (ptr, h->mb + i*16, linesize);
2491                                     }else
2492                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2493                                 }
2494                             }
2495                         }
2496                     }
2497                 }
2498             }else{
2499                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2500                 if(is_h264){
2501                     if(!transform_bypass)
2502                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2503                 }else
2504                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2505             }
2506             if(h->deblocking_filter)
2507                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2508         }else if(is_h264){
2509             hl_motion(h, dest_y, dest_cb, dest_cr,
2510                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2511                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2512                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2513         }
2514
2515
2516         if(!IS_INTRA4x4(mb_type)){
2517             if(is_h264){
2518                 if(IS_INTRA16x16(mb_type)){
2519                     if(transform_bypass){
2520                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2521                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2522                         }else{
2523                             for(i=0; i<16; i++){
2524                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2525                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2526                             }
2527                         }
2528                     }else{
2529                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2530                     }
2531                 }else if(h->cbp&15){
2532                     if(transform_bypass){
2533                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2534                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2535                         for(i=0; i<16; i+=di){
2536                             if(h->non_zero_count_cache[ scan8[i] ]){
2537                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2538                             }
2539                         }
2540                     }else{
2541                         if(IS_8x8DCT(mb_type)){
2542                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2543                         }else{
2544                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2545                         }
2546                     }
2547                 }
2548             }else{
2549                 for(i=0; i<16; i++){
2550                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2551                         uint8_t * const ptr= dest_y + block_offset[i];
2552                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2553                     }
2554                 }
2555             }
2556         }
2557
2558         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2559             uint8_t *dest[2] = {dest_cb, dest_cr};
2560             if(transform_bypass){
2561                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2562                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2563                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2564                 }else{
2565                     idct_add = s->dsp.add_pixels4;
2566                     for(i=16; i<16+8; i++){
2567                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2568                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2569                     }
2570                 }
2571             }else{
2572                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2573                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2574                 if(is_h264){
2575                     idct_add = s->dsp.h264_idct_add;
2576                     idct_dc_add = s->dsp.h264_idct_dc_add;
2577                     for(i=16; i<16+8; i++){
2578                         if(h->non_zero_count_cache[ scan8[i] ])
2579                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2580                         else if(h->mb[i*16])
2581                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2582                     }
2583                 }else{
2584                     for(i=16; i<16+8; i++){
2585                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2586                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2587                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2588                         }
2589                     }
2590                 }
2591             }
2592         }
2593     }
2594     if(h->deblocking_filter) {
2595         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2596         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2597         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2598         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2599         if (!simple && FRAME_MBAFF) {
2600             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2601         } else {
2602             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2603         }
2604     }
2605 }
2606
2607 /**
2608  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2609  */
2610 static void hl_decode_mb_simple(H264Context *h){
2611     hl_decode_mb_internal(h, 1);
2612 }
2613
2614 /**
2615  * Process a macroblock; this handles edge cases, such as interlacing.
2616  */
2617 static void av_noinline hl_decode_mb_complex(H264Context *h){
2618     hl_decode_mb_internal(h, 0);
2619 }
2620
2621 static void hl_decode_mb(H264Context *h){
2622     MpegEncContext * const s = &h->s;
2623     const int mb_xy= h->mb_xy;
2624     const int mb_type= s->current_picture.mb_type[mb_xy];
2625     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2626
2627     if(ENABLE_H264_ENCODER && !s->decode)
2628         return;
2629
2630     if (is_complex)
2631         hl_decode_mb_complex(h);
2632     else hl_decode_mb_simple(h);
2633 }
2634
2635 static void pic_as_field(Picture *pic, const int parity){
2636     int i;
2637     for (i = 0; i < 4; ++i) {
2638         if (parity == PICT_BOTTOM_FIELD)
2639             pic->data[i] += pic->linesize[i];
2640         pic->reference = parity;
2641         pic->linesize[i] *= 2;
2642     }
2643     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2644 }
2645
2646 static int split_field_copy(Picture *dest, Picture *src,
2647                             int parity, int id_add){
2648     int match = !!(src->reference & parity);
2649
2650     if (match) {
2651         *dest = *src;
2652         if(parity != PICT_FRAME){
2653             pic_as_field(dest, parity);
2654             dest->pic_id *= 2;
2655             dest->pic_id += id_add;
2656         }
2657     }
2658
2659     return match;
2660 }
2661
2662 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2663     int i[2]={0};
2664     int index=0;
2665
2666     while(i[0]<len || i[1]<len){
2667         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2668             i[0]++;
2669         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2670             i[1]++;
2671         if(i[0] < len){
2672             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2673             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2674         }
2675         if(i[1] < len){
2676             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2678         }
2679     }
2680
2681     return index;
2682 }
2683
2684 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2685     int i, best_poc;
2686     int out_i= 0;
2687
2688     for(;;){
2689         best_poc= dir ? INT_MIN : INT_MAX;
2690
2691         for(i=0; i<len; i++){
2692             const int poc= src[i]->poc;
2693             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2694                 best_poc= poc;
2695                 sorted[out_i]= src[i];
2696             }
2697         }
2698         if(best_poc == (dir ? INT_MIN : INT_MAX))
2699             break;
2700         limit= sorted[out_i++]->poc - dir;
2701     }
2702     return out_i;
2703 }
2704
2705 /**
2706  * fills the default_ref_list.
2707  */
2708 static int fill_default_ref_list(H264Context *h){
2709     MpegEncContext * const s = &h->s;
2710     int i, len;
2711
2712     if(h->slice_type_nos==FF_B_TYPE){
2713         Picture *sorted[32];
2714         int cur_poc, list;
2715         int lens[2];
2716
2717         if(FIELD_PICTURE)
2718             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2719         else
2720             cur_poc= s->current_picture_ptr->poc;
2721
2722         for(list= 0; list<2; list++){
2723             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2724             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2725             assert(len<=32);
2726             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2727             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2728             assert(len<=32);
2729
2730             if(len < h->ref_count[list])
2731                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2732             lens[list]= len;
2733         }
2734
2735         if(lens[0] == lens[1] && lens[1] > 1){
2736             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2737             if(i == lens[0])
2738                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2739         }
2740     }else{
2741         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2742         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2743         assert(len <= 32);
2744         if(len < h->ref_count[0])
2745             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2746     }
2747 #ifdef TRACE
2748     for (i=0; i<h->ref_count[0]; i++) {
2749         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2750     }
2751     if(h->slice_type_nos==FF_B_TYPE){
2752         for (i=0; i<h->ref_count[1]; i++) {
2753             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2754         }
2755     }
2756 #endif
2757     return 0;
2758 }
2759
2760 static void print_short_term(H264Context *h);
2761 static void print_long_term(H264Context *h);
2762
2763 /**
2764  * Extract structure information about the picture described by pic_num in
2765  * the current decoding context (frame or field). Note that pic_num is
2766  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2767  * @param pic_num picture number for which to extract structure information
2768  * @param structure one of PICT_XXX describing structure of picture
2769  *                      with pic_num
2770  * @return frame number (short term) or long term index of picture
2771  *         described by pic_num
2772  */
2773 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2774     MpegEncContext * const s = &h->s;
2775
2776     *structure = s->picture_structure;
2777     if(FIELD_PICTURE){
2778         if (!(pic_num & 1))
2779             /* opposite field */
2780             *structure ^= PICT_FRAME;
2781         pic_num >>= 1;
2782     }
2783
2784     return pic_num;
2785 }
2786
2787 static int decode_ref_pic_list_reordering(H264Context *h){
2788     MpegEncContext * const s = &h->s;
2789     int list, index, pic_structure;
2790
2791     print_short_term(h);
2792     print_long_term(h);
2793
2794     for(list=0; list<h->list_count; list++){
2795         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2796
2797         if(get_bits1(&s->gb)){
2798             int pred= h->curr_pic_num;
2799
2800             for(index=0; ; index++){
2801                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2802                 unsigned int pic_id;
2803                 int i;
2804                 Picture *ref = NULL;
2805
2806                 if(reordering_of_pic_nums_idc==3)
2807                     break;
2808
2809                 if(index >= h->ref_count[list]){
2810                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2811                     return -1;
2812                 }
2813
2814                 if(reordering_of_pic_nums_idc<3){
2815                     if(reordering_of_pic_nums_idc<2){
2816                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2817                         int frame_num;
2818
2819                         if(abs_diff_pic_num > h->max_pic_num){
2820                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2821                             return -1;
2822                         }
2823
2824                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2825                         else                                pred+= abs_diff_pic_num;
2826                         pred &= h->max_pic_num - 1;
2827
2828                         frame_num = pic_num_extract(h, pred, &pic_structure);
2829
2830                         for(i= h->short_ref_count-1; i>=0; i--){
2831                             ref = h->short_ref[i];
2832                             assert(ref->reference);
2833                             assert(!ref->long_ref);
2834                             if(
2835                                    ref->frame_num == frame_num &&
2836                                    (ref->reference & pic_structure)
2837                               )
2838                                 break;
2839                         }
2840                         if(i>=0)
2841                             ref->pic_id= pred;
2842                     }else{
2843                         int long_idx;
2844                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2845
2846                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2847
2848                         if(long_idx>31){
2849                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2850                             return -1;
2851                         }
2852                         ref = h->long_ref[long_idx];
2853                         assert(!(ref && !ref->reference));
2854                         if(ref && (ref->reference & pic_structure)){
2855                             ref->pic_id= pic_id;
2856                             assert(ref->long_ref);
2857                             i=0;
2858                         }else{
2859                             i=-1;
2860                         }
2861                     }
2862
2863                     if (i < 0) {
2864                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2865                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2866                     } else {
2867                         for(i=index; i+1<h->ref_count[list]; i++){
2868                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2869                                 break;
2870                         }
2871                         for(; i > index; i--){
2872                             h->ref_list[list][i]= h->ref_list[list][i-1];
2873                         }
2874                         h->ref_list[list][index]= *ref;
2875                         if (FIELD_PICTURE){
2876                             pic_as_field(&h->ref_list[list][index], pic_structure);
2877                         }
2878                     }
2879                 }else{
2880                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2881                     return -1;
2882                 }
2883             }
2884         }
2885     }
2886     for(list=0; list<h->list_count; list++){
2887         for(index= 0; index < h->ref_count[list]; index++){
2888             if(!h->ref_list[list][index].data[0]){
2889                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2890                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2891             }
2892         }
2893     }
2894
2895     return 0;
2896 }
2897
2898 static void fill_mbaff_ref_list(H264Context *h){
2899     int list, i, j;
2900     for(list=0; list<2; list++){ //FIXME try list_count
2901         for(i=0; i<h->ref_count[list]; i++){
2902             Picture *frame = &h->ref_list[list][i];
2903             Picture *field = &h->ref_list[list][16+2*i];
2904             field[0] = *frame;
2905             for(j=0; j<3; j++)
2906                 field[0].linesize[j] <<= 1;
2907             field[0].reference = PICT_TOP_FIELD;
2908             field[0].poc= field[0].field_poc[0];
2909             field[1] = field[0];
2910             for(j=0; j<3; j++)
2911                 field[1].data[j] += frame->linesize[j];
2912             field[1].reference = PICT_BOTTOM_FIELD;
2913             field[1].poc= field[1].field_poc[1];
2914
2915             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2916             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2917             for(j=0; j<2; j++){
2918                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2919                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2920             }
2921         }
2922     }
2923     for(j=0; j<h->ref_count[1]; j++){
2924         for(i=0; i<h->ref_count[0]; i++)
2925             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2926         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2927         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2928     }
2929 }
2930
2931 static int pred_weight_table(H264Context *h){
2932     MpegEncContext * const s = &h->s;
2933     int list, i;
2934     int luma_def, chroma_def;
2935
2936     h->use_weight= 0;
2937     h->use_weight_chroma= 0;
2938     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2939     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2940     luma_def = 1<<h->luma_log2_weight_denom;
2941     chroma_def = 1<<h->chroma_log2_weight_denom;
2942
2943     for(list=0; list<2; list++){
2944         for(i=0; i<h->ref_count[list]; i++){
2945             int luma_weight_flag, chroma_weight_flag;
2946
2947             luma_weight_flag= get_bits1(&s->gb);
2948             if(luma_weight_flag){
2949                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2950                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2951                 if(   h->luma_weight[list][i] != luma_def
2952                    || h->luma_offset[list][i] != 0)
2953                     h->use_weight= 1;
2954             }else{
2955                 h->luma_weight[list][i]= luma_def;
2956                 h->luma_offset[list][i]= 0;
2957             }
2958
2959             if(CHROMA){
2960                 chroma_weight_flag= get_bits1(&s->gb);
2961                 if(chroma_weight_flag){
2962                     int j;
2963                     for(j=0; j<2; j++){
2964                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2965                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2966                         if(   h->chroma_weight[list][i][j] != chroma_def
2967                         || h->chroma_offset[list][i][j] != 0)
2968                             h->use_weight_chroma= 1;
2969                     }
2970                 }else{
2971                     int j;
2972                     for(j=0; j<2; j++){
2973                         h->chroma_weight[list][i][j]= chroma_def;
2974                         h->chroma_offset[list][i][j]= 0;
2975                     }
2976                 }
2977             }
2978         }
2979         if(h->slice_type_nos != FF_B_TYPE) break;
2980     }
2981     h->use_weight= h->use_weight || h->use_weight_chroma;
2982     return 0;
2983 }
2984
2985 static void implicit_weight_table(H264Context *h){
2986     MpegEncContext * const s = &h->s;
2987     int ref0, ref1;
2988     int cur_poc = s->current_picture_ptr->poc;
2989
2990     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2991        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2992         h->use_weight= 0;
2993         h->use_weight_chroma= 0;
2994         return;
2995     }
2996
2997     h->use_weight= 2;
2998     h->use_weight_chroma= 2;
2999     h->luma_log2_weight_denom= 5;
3000     h->chroma_log2_weight_denom= 5;
3001
3002     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3003         int poc0 = h->ref_list[0][ref0].poc;
3004         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3005             int poc1 = h->ref_list[1][ref1].poc;
3006             int td = av_clip(poc1 - poc0, -128, 127);
3007             if(td){
3008                 int tb = av_clip(cur_poc - poc0, -128, 127);
3009                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3010                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3011                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3012                     h->implicit_weight[ref0][ref1] = 32;
3013                 else
3014                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3015             }else
3016                 h->implicit_weight[ref0][ref1] = 32;
3017         }
3018     }
3019 }
3020
3021 /**
3022  * Mark a picture as no longer needed for reference. The refmask
3023  * argument allows unreferencing of individual fields or the whole frame.
3024  * If the picture becomes entirely unreferenced, but is being held for
3025  * display purposes, it is marked as such.
3026  * @param refmask mask of fields to unreference; the mask is bitwise
3027  *                anded with the reference marking of pic
3028  * @return non-zero if pic becomes entirely unreferenced (except possibly
3029  *         for display purposes) zero if one of the fields remains in
3030  *         reference
3031  */
3032 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3033     int i;
3034     if (pic->reference &= refmask) {
3035         return 0;
3036     } else {
3037         for(i = 0; h->delayed_pic[i]; i++)
3038             if(pic == h->delayed_pic[i]){
3039                 pic->reference=DELAYED_PIC_REF;
3040                 break;
3041             }
3042         return 1;
3043     }
3044 }
3045
3046 /**
3047  * instantaneous decoder refresh.
3048  */
3049 static void idr(H264Context *h){
3050     int i;
3051
3052     for(i=0; i<16; i++){
3053         remove_long(h, i, 0);
3054     }
3055     assert(h->long_ref_count==0);
3056
3057     for(i=0; i<h->short_ref_count; i++){
3058         unreference_pic(h, h->short_ref[i], 0);
3059         h->short_ref[i]= NULL;
3060     }
3061     h->short_ref_count=0;
3062     h->prev_frame_num= 0;
3063     h->prev_frame_num_offset= 0;
3064     h->prev_poc_msb=
3065     h->prev_poc_lsb= 0;
3066 }
3067
3068 /* forget old pics after a seek */
3069 static void flush_dpb(AVCodecContext *avctx){
3070     H264Context *h= avctx->priv_data;
3071     int i;
3072     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3073         if(h->delayed_pic[i])
3074             h->delayed_pic[i]->reference= 0;
3075         h->delayed_pic[i]= NULL;
3076     }
3077     h->outputed_poc= INT_MIN;
3078     idr(h);
3079     if(h->s.current_picture_ptr)
3080         h->s.current_picture_ptr->reference= 0;
3081     h->s.first_field= 0;
3082     ff_mpeg_flush(avctx);
3083 }
3084
3085 /**
3086  * Find a Picture in the short term reference list by frame number.
3087  * @param frame_num frame number to search for
3088  * @param idx the index into h->short_ref where returned picture is found
3089  *            undefined if no picture found.
3090  * @return pointer to the found picture, or NULL if no pic with the provided
3091  *                 frame number is found
3092  */
3093 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3094     MpegEncContext * const s = &h->s;
3095     int i;
3096
3097     for(i=0; i<h->short_ref_count; i++){
3098         Picture *pic= h->short_ref[i];
3099         if(s->avctx->debug&FF_DEBUG_MMCO)
3100             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3101         if(pic->frame_num == frame_num) {
3102             *idx = i;
3103             return pic;
3104         }
3105     }
3106     return NULL;
3107 }
3108
3109 /**
3110  * Remove a picture from the short term reference list by its index in
3111  * that list.  This does no checking on the provided index; it is assumed
3112  * to be valid. Other list entries are shifted down.
3113  * @param i index into h->short_ref of picture to remove.
3114  */
3115 static void remove_short_at_index(H264Context *h, int i){
3116     assert(i >= 0 && i < h->short_ref_count);
3117     h->short_ref[i]= NULL;
3118     if (--h->short_ref_count)
3119         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3120 }
3121
3122 /**
3123  *
3124  * @return the removed picture or NULL if an error occurs
3125  */
3126 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3127     MpegEncContext * const s = &h->s;
3128     Picture *pic;
3129     int i;
3130
3131     if(s->avctx->debug&FF_DEBUG_MMCO)
3132         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3133
3134     pic = find_short(h, frame_num, &i);
3135     if (pic){
3136         if(unreference_pic(h, pic, ref_mask))
3137         remove_short_at_index(h, i);
3138     }
3139
3140     return pic;
3141 }
3142
3143 /**
3144  * Remove a picture from the long term reference list by its index in
3145  * that list.
3146  * @return the removed picture or NULL if an error occurs
3147  */
3148 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3149     Picture *pic;
3150
3151     pic= h->long_ref[i];
3152     if (pic){
3153         if(unreference_pic(h, pic, ref_mask)){
3154             assert(h->long_ref[i]->long_ref == 1);
3155             h->long_ref[i]->long_ref= 0;
3156             h->long_ref[i]= NULL;
3157             h->long_ref_count--;
3158         }
3159     }
3160
3161     return pic;
3162 }
3163
3164 /**
3165  * print short term list
3166  */
3167 static void print_short_term(H264Context *h) {
3168     uint32_t i;
3169     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3170         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3171         for(i=0; i<h->short_ref_count; i++){
3172             Picture *pic= h->short_ref[i];
3173             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3174         }
3175     }
3176 }
3177
3178 /**
3179  * print long term list
3180  */
3181 static void print_long_term(H264Context *h) {
3182     uint32_t i;
3183     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3184         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3185         for(i = 0; i < 16; i++){
3186             Picture *pic= h->long_ref[i];
3187             if (pic) {
3188                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3189             }
3190         }
3191     }
3192 }
3193
3194 /**
3195  * Executes the reference picture marking (memory management control operations).
3196  */
3197 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3198     MpegEncContext * const s = &h->s;
3199     int i, j;
3200     int current_ref_assigned=0;
3201     Picture *pic;
3202
3203     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3204         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3205
3206     for(i=0; i<mmco_count; i++){
3207         int structure, frame_num;
3208         if(s->avctx->debug&FF_DEBUG_MMCO)
3209             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3210
3211         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3212            || mmco[i].opcode == MMCO_SHORT2LONG){
3213             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3214             pic = find_short(h, frame_num, &j);
3215             if(!pic){
3216                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3217                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3218                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3219                 continue;
3220             }
3221         }
3222
3223         switch(mmco[i].opcode){
3224         case MMCO_SHORT2UNUSED:
3225             if(s->avctx->debug&FF_DEBUG_MMCO)
3226                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3227             remove_short(h, frame_num, structure ^ PICT_FRAME);
3228             break;
3229         case MMCO_SHORT2LONG:
3230                 if (h->long_ref[mmco[i].long_arg] != pic)
3231                     remove_long(h, mmco[i].long_arg, 0);
3232
3233                 remove_short_at_index(h, j);
3234                 h->long_ref[ mmco[i].long_arg ]= pic;
3235                 if (h->long_ref[ mmco[i].long_arg ]){
3236                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3237                     h->long_ref_count++;
3238                 }
3239             break;
3240         case MMCO_LONG2UNUSED:
3241             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3242             pic = h->long_ref[j];
3243             if (pic) {
3244                 remove_long(h, j, structure ^ PICT_FRAME);
3245             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3246                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3247             break;
3248         case MMCO_LONG:
3249                     // Comment below left from previous code as it is an interresting note.
3250                     /* First field in pair is in short term list or
3251                      * at a different long term index.
3252                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3253                      * Report the problem and keep the pair where it is,
3254                      * and mark this field valid.
3255                      */
3256
3257             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3258                 remove_long(h, mmco[i].long_arg, 0);
3259
3260                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3261                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3262                 h->long_ref_count++;
3263             }
3264
3265             s->current_picture_ptr->reference |= s->picture_structure;
3266             current_ref_assigned=1;
3267             break;
3268         case MMCO_SET_MAX_LONG:
3269             assert(mmco[i].long_arg <= 16);
3270             // just remove the long term which index is greater than new max
3271             for(j = mmco[i].long_arg; j<16; j++){
3272                 remove_long(h, j, 0);
3273             }
3274             break;
3275         case MMCO_RESET:
3276             while(h->short_ref_count){
3277                 remove_short(h, h->short_ref[0]->frame_num, 0);
3278             }
3279             for(j = 0; j < 16; j++) {
3280                 remove_long(h, j, 0);
3281             }
3282             s->current_picture_ptr->poc=
3283             s->current_picture_ptr->field_poc[0]=
3284             s->current_picture_ptr->field_poc[1]=
3285             h->poc_lsb=
3286             h->poc_msb=
3287             h->frame_num=
3288             s->current_picture_ptr->frame_num= 0;
3289             break;
3290         default: assert(0);
3291         }
3292     }
3293
3294     if (!current_ref_assigned) {
3295         /* Second field of complementary field pair; the first field of
3296          * which is already referenced. If short referenced, it
3297          * should be first entry in short_ref. If not, it must exist
3298          * in long_ref; trying to put it on the short list here is an
3299          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3300          */
3301         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3302             /* Just mark the second field valid */
3303             s->current_picture_ptr->reference = PICT_FRAME;
3304         } else if (s->current_picture_ptr->long_ref) {
3305             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3306                                              "assignment for second field "
3307                                              "in complementary field pair "
3308                                              "(first field is long term)\n");
3309         } else {
3310             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3311             if(pic){
3312                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3313             }
3314
3315             if(h->short_ref_count)
3316                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3317
3318             h->short_ref[0]= s->current_picture_ptr;
3319             h->short_ref_count++;
3320             s->current_picture_ptr->reference |= s->picture_structure;
3321         }
3322     }
3323
3324     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3325
3326         /* We have too many reference frames, probably due to corrupted
3327          * stream. Need to discard one frame. Prevents overrun of the
3328          * short_ref and long_ref buffers.
3329          */
3330         av_log(h->s.avctx, AV_LOG_ERROR,
3331                "number of reference frames exceeds max (probably "
3332                "corrupt input), discarding one\n");
3333
3334         if (h->long_ref_count && !h->short_ref_count) {
3335             for (i = 0; i < 16; ++i)
3336                 if (h->long_ref[i])
3337                     break;
3338
3339             assert(i < 16);
3340             remove_long(h, i, 0);
3341         } else {
3342             pic = h->short_ref[h->short_ref_count - 1];
3343             remove_short(h, pic->frame_num, 0);
3344         }
3345     }
3346
3347     print_short_term(h);
3348     print_long_term(h);
3349     return 0;
3350 }
3351
3352 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3353     MpegEncContext * const s = &h->s;
3354     int i;
3355
3356     h->mmco_index= 0;
3357     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3358         s->broken_link= get_bits1(gb) -1;
3359         if(get_bits1(gb)){
3360             h->mmco[0].opcode= MMCO_LONG;
3361             h->mmco[0].long_arg= 0;
3362             h->mmco_index= 1;
3363         }
3364     }else{
3365         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3366             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3367                 MMCOOpcode opcode= get_ue_golomb(gb);
3368
3369                 h->mmco[i].opcode= opcode;
3370                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3371                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3372 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3373                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3374                         return -1;
3375                     }*/
3376                 }
3377                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3378                     unsigned int long_arg= get_ue_golomb(gb);
3379                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3380                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3381                         return -1;
3382                     }
3383                     h->mmco[i].long_arg= long_arg;
3384                 }
3385
3386                 if(opcode > (unsigned)MMCO_LONG){
3387                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3388                     return -1;
3389                 }
3390                 if(opcode == MMCO_END)
3391                     break;
3392             }
3393             h->mmco_index= i;
3394         }else{
3395             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3396
3397             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3398                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3399                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3400                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3401                 h->mmco_index= 1;
3402                 if (FIELD_PICTURE) {
3403                     h->mmco[0].short_pic_num *= 2;
3404                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3405                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3406                     h->mmco_index= 2;
3407                 }
3408             }
3409         }
3410     }
3411
3412     return 0;
3413 }
3414
3415 static int init_poc(H264Context *h){
3416     MpegEncContext * const s = &h->s;
3417     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3418     int field_poc[2];
3419     Picture *cur = s->current_picture_ptr;
3420
3421     h->frame_num_offset= h->prev_frame_num_offset;
3422     if(h->frame_num < h->prev_frame_num)
3423         h->frame_num_offset += max_frame_num;
3424
3425     if(h->sps.poc_type==0){
3426         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3427
3428         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3429             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3430         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3431             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3432         else
3433             h->poc_msb = h->prev_poc_msb;
3434 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3435         field_poc[0] =
3436         field_poc[1] = h->poc_msb + h->poc_lsb;
3437         if(s->picture_structure == PICT_FRAME)
3438             field_poc[1] += h->delta_poc_bottom;
3439     }else if(h->sps.poc_type==1){
3440         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3441         int i;
3442
3443         if(h->sps.poc_cycle_length != 0)
3444             abs_frame_num = h->frame_num_offset + h->frame_num;
3445         else
3446             abs_frame_num = 0;
3447
3448         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3449             abs_frame_num--;
3450
3451         expected_delta_per_poc_cycle = 0;
3452         for(i=0; i < h->sps.poc_cycle_length; i++)
3453             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3454
3455         if(abs_frame_num > 0){
3456             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3457             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3458
3459             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3460             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3461                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3462         } else
3463             expectedpoc = 0;
3464
3465         if(h->nal_ref_idc == 0)
3466             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3467
3468         field_poc[0] = expectedpoc + h->delta_poc[0];
3469         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3470
3471         if(s->picture_structure == PICT_FRAME)
3472             field_poc[1] += h->delta_poc[1];
3473     }else{
3474         int poc= 2*(h->frame_num_offset + h->frame_num);
3475
3476         if(!h->nal_ref_idc)
3477             poc--;
3478
3479         field_poc[0]= poc;
3480         field_poc[1]= poc;
3481     }
3482
3483     if(s->picture_structure != PICT_BOTTOM_FIELD)
3484         s->current_picture_ptr->field_poc[0]= field_poc[0];
3485     if(s->picture_structure != PICT_TOP_FIELD)
3486         s->current_picture_ptr->field_poc[1]= field_poc[1];
3487     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3488
3489     return 0;
3490 }
3491
3492
3493 /**
3494  * initialize scan tables
3495  */
3496 static void init_scan_tables(H264Context *h){
3497     MpegEncContext * const s = &h->s;
3498     int i;
3499     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3500         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3501         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3502     }else{
3503         for(i=0; i<16; i++){
3504 #define T(x) (x>>2) | ((x<<2) & 0xF)
3505             h->zigzag_scan[i] = T(zigzag_scan[i]);
3506             h-> field_scan[i] = T( field_scan[i]);
3507 #undef T
3508         }
3509     }
3510     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3511         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3512         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3513         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3514         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3515     }else{
3516         for(i=0; i<64; i++){
3517 #define T(x) (x>>3) | ((x&7)<<3)
3518             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3519             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3520             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3521             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3522 #undef T
3523         }
3524     }
3525     if(h->sps.transform_bypass){ //FIXME same ugly
3526         h->zigzag_scan_q0          = zigzag_scan;
3527         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3528         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3529         h->field_scan_q0           = field_scan;
3530         h->field_scan8x8_q0        = field_scan8x8;
3531         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3532     }else{
3533         h->zigzag_scan_q0          = h->zigzag_scan;
3534         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3535         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3536         h->field_scan_q0           = h->field_scan;
3537         h->field_scan8x8_q0        = h->field_scan8x8;
3538         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3539     }
3540 }
3541
3542 /**
3543  * Replicates H264 "master" context to thread contexts.
3544  */
3545 static void clone_slice(H264Context *dst, H264Context *src)
3546 {
3547     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3548     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3549     dst->s.current_picture      = src->s.current_picture;
3550     dst->s.linesize             = src->s.linesize;
3551     dst->s.uvlinesize           = src->s.uvlinesize;
3552     dst->s.first_field          = src->s.first_field;
3553
3554     dst->prev_poc_msb           = src->prev_poc_msb;
3555     dst->prev_poc_lsb           = src->prev_poc_lsb;
3556     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3557     dst->prev_frame_num         = src->prev_frame_num;
3558     dst->short_ref_count        = src->short_ref_count;
3559
3560     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3561     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3562     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3563     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3564
3565     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3566     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3567 }
3568
3569 /**
3570  * decodes a slice header.
3571  * This will also call MPV_common_init() and frame_start() as needed.
3572  *
3573  * @param h h264context
3574  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3575  *
3576  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3577  */
3578 static int decode_slice_header(H264Context *h, H264Context *h0){
3579     MpegEncContext * const s = &h->s;
3580     MpegEncContext * const s0 = &h0->s;
3581     unsigned int first_mb_in_slice;
3582     unsigned int pps_id;
3583     int num_ref_idx_active_override_flag;
3584     unsigned int slice_type, tmp, i, j;
3585     int default_ref_list_done = 0;
3586     int last_pic_structure;
3587
3588     s->dropable= h->nal_ref_idc == 0;
3589
3590     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3591         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3592         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3593     }else{
3594         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3595         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3596     }
3597
3598     first_mb_in_slice= get_ue_golomb(&s->gb);
3599
3600     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3601         h0->current_slice = 0;
3602         if (!s0->first_field)
3603             s->current_picture_ptr= NULL;
3604     }
3605
3606     slice_type= get_ue_golomb(&s->gb);
3607     if(slice_type > 9){
3608         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3609         return -1;
3610     }
3611     if(slice_type > 4){
3612         slice_type -= 5;
3613         h->slice_type_fixed=1;
3614     }else
3615         h->slice_type_fixed=0;
3616
3617     slice_type= golomb_to_pict_type[ slice_type ];
3618     if (slice_type == FF_I_TYPE
3619         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3620         default_ref_list_done = 1;
3621     }
3622     h->slice_type= slice_type;
3623     h->slice_type_nos= slice_type & 3;
3624
3625     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3626     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3627         av_log(h->s.avctx, AV_LOG_ERROR,
3628                "B picture before any references, skipping\n");
3629         return -1;
3630     }
3631
3632     pps_id= get_ue_golomb(&s->gb);
3633     if(pps_id>=MAX_PPS_COUNT){
3634         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3635         return -1;
3636     }
3637     if(!h0->pps_buffers[pps_id]) {
3638         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3639         return -1;
3640     }
3641     h->pps= *h0->pps_buffers[pps_id];
3642
3643     if(!h0->sps_buffers[h->pps.sps_id]) {
3644         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3645         return -1;
3646     }
3647     h->sps = *h0->sps_buffers[h->pps.sps_id];
3648
3649     if(h == h0 && h->dequant_coeff_pps != pps_id){
3650         h->dequant_coeff_pps = pps_id;
3651         init_dequant_tables(h);
3652     }
3653
3654     s->mb_width= h->sps.mb_width;
3655     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3656
3657     h->b_stride=  s->mb_width*4;
3658     h->b8_stride= s->mb_width*2;
3659
3660     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3661     if(h->sps.frame_mbs_only_flag)
3662         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3663     else
3664         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3665
3666     if (s->context_initialized
3667         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3668         if(h != h0)
3669             return -1;   // width / height changed during parallelized decoding
3670         free_tables(h);
3671         flush_dpb(s->avctx);
3672         MPV_common_end(s);
3673     }
3674     if (!s->context_initialized) {
3675         if(h != h0)
3676             return -1;  // we cant (re-)initialize context during parallel decoding
3677         if (MPV_common_init(s) < 0)
3678             return -1;
3679         s->first_field = 0;
3680
3681         init_scan_tables(h);
3682         alloc_tables(h);
3683
3684         for(i = 1; i < s->avctx->thread_count; i++) {
3685             H264Context *c;
3686             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3687             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3688             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3689             c->sps = h->sps;
3690             c->pps = h->pps;
3691             init_scan_tables(c);
3692             clone_tables(c, h);
3693         }
3694
3695         for(i = 0; i < s->avctx->thread_count; i++)
3696             if(context_init(h->thread_context[i]) < 0)
3697                 return -1;
3698
3699         s->avctx->width = s->width;
3700         s->avctx->height = s->height;
3701         s->avctx->sample_aspect_ratio= h->sps.sar;
3702         if(!s->avctx->sample_aspect_ratio.den)
3703             s->avctx->sample_aspect_ratio.den = 1;
3704
3705         if(h->sps.timing_info_present_flag){
3706             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3707             if(h->x264_build > 0 && h->x264_build < 44)
3708                 s->avctx->time_base.den *= 2;
3709             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3710                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3711         }
3712     }
3713
3714     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3715
3716     h->mb_mbaff = 0;
3717     h->mb_aff_frame = 0;
3718     last_pic_structure = s0->picture_structure;
3719     if(h->sps.frame_mbs_only_flag){
3720         s->picture_structure= PICT_FRAME;
3721     }else{
3722         if(get_bits1(&s->gb)) { //field_pic_flag
3723             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3724         } else {
3725             s->picture_structure= PICT_FRAME;
3726             h->mb_aff_frame = h->sps.mb_aff;
3727         }
3728     }
3729     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3730
3731     if(h0->current_slice == 0){
3732         while(h->frame_num !=  h->prev_frame_num &&
3733               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3734             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3735             frame_start(h);
3736             h->prev_frame_num++;
3737             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3738             s->current_picture_ptr->frame_num= h->prev_frame_num;
3739             execute_ref_pic_marking(h, NULL, 0);
3740         }
3741
3742         /* See if we have a decoded first field looking for a pair... */
3743         if (s0->first_field) {
3744             assert(s0->current_picture_ptr);
3745             assert(s0->current_picture_ptr->data[0]);
3746             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3747
3748             /* figure out if we have a complementary field pair */
3749             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3750                 /*
3751                  * Previous field is unmatched. Don't display it, but let it
3752                  * remain for reference if marked as such.
3753                  */
3754                 s0->current_picture_ptr = NULL;
3755                 s0->first_field = FIELD_PICTURE;
3756
3757             } else {
3758                 if (h->nal_ref_idc &&
3759                         s0->current_picture_ptr->reference &&
3760                         s0->current_picture_ptr->frame_num != h->frame_num) {
3761                     /*
3762                      * This and previous field were reference, but had
3763                      * different frame_nums. Consider this field first in
3764                      * pair. Throw away previous field except for reference
3765                      * purposes.
3766                      */
3767                     s0->first_field = 1;
3768                     s0->current_picture_ptr = NULL;
3769
3770                 } else {
3771                     /* Second field in complementary pair */
3772                     s0->first_field = 0;
3773                 }
3774             }
3775
3776         } else {
3777             /* Frame or first field in a potentially complementary pair */
3778             assert(!s0->current_picture_ptr);
3779             s0->first_field = FIELD_PICTURE;
3780         }
3781
3782         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3783             s0->first_field = 0;
3784             return -1;
3785         }
3786     }
3787     if(h != h0)
3788         clone_slice(h, h0);
3789
3790     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3791
3792     assert(s->mb_num == s->mb_width * s->mb_height);
3793     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3794        first_mb_in_slice                    >= s->mb_num){
3795         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3796         return -1;
3797     }
3798     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3799     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3800     if (s->picture_structure == PICT_BOTTOM_FIELD)
3801         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3802     assert(s->mb_y < s->mb_height);
3803
3804     if(s->picture_structure==PICT_FRAME){
3805         h->curr_pic_num=   h->frame_num;
3806         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3807     }else{
3808         h->curr_pic_num= 2*h->frame_num + 1;
3809         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3810     }
3811
3812     if(h->nal_unit_type == NAL_IDR_SLICE){
3813         get_ue_golomb(&s->gb); /* idr_pic_id */
3814     }
3815
3816     if(h->sps.poc_type==0){
3817         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3818
3819         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3820             h->delta_poc_bottom= get_se_golomb(&s->gb);
3821         }
3822     }
3823
3824     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3825         h->delta_poc[0]= get_se_golomb(&s->gb);
3826
3827         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3828             h->delta_poc[1]= get_se_golomb(&s->gb);
3829     }
3830
3831     init_poc(h);
3832
3833     if(h->pps.redundant_pic_cnt_present){
3834         h->redundant_pic_count= get_ue_golomb(&s->gb);
3835     }
3836
3837     //set defaults, might be overridden a few lines later
3838     h->ref_count[0]= h->pps.ref_count[0];
3839     h->ref_count[1]= h->pps.ref_count[1];
3840
3841     if(h->slice_type_nos != FF_I_TYPE){
3842         if(h->slice_type_nos == FF_B_TYPE){
3843             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3844         }
3845         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3846
3847         if(num_ref_idx_active_override_flag){
3848             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3849             if(h->slice_type_nos==FF_B_TYPE)
3850                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3851
3852             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3853                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3854                 h->ref_count[0]= h->ref_count[1]= 1;
3855                 return -1;
3856             }
3857         }
3858         if(h->slice_type_nos == FF_B_TYPE)
3859             h->list_count= 2;
3860         else
3861             h->list_count= 1;
3862     }else
3863         h->list_count= 0;
3864
3865     if(!default_ref_list_done){
3866         fill_default_ref_list(h);
3867     }
3868
3869     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3870         return -1;
3871
3872     if(h->slice_type_nos!=FF_I_TYPE){
3873         s->last_picture_ptr= &h->ref_list[0][0];
3874         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3875     }
3876     if(h->slice_type_nos==FF_B_TYPE){
3877         s->next_picture_ptr= &h->ref_list[1][0];
3878         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3879     }
3880
3881     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3882        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3883         pred_weight_table(h);
3884     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3885         implicit_weight_table(h);
3886     else
3887         h->use_weight = 0;
3888
3889     if(h->nal_ref_idc)
3890         decode_ref_pic_marking(h0, &s->gb);
3891
3892     if(FRAME_MBAFF)
3893         fill_mbaff_ref_list(h);
3894
3895     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3896         direct_dist_scale_factor(h);
3897     direct_ref_list_init(h);
3898
3899     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3900         tmp = get_ue_golomb(&s->gb);
3901         if(tmp > 2){
3902             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3903             return -1;
3904         }
3905         h->cabac_init_idc= tmp;
3906     }
3907
3908     h->last_qscale_diff = 0;
3909     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3910     if(tmp>51){
3911         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3912         return -1;
3913     }
3914     s->qscale= tmp;
3915     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3916     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3917     //FIXME qscale / qp ... stuff
3918     if(h->slice_type == FF_SP_TYPE){
3919         get_bits1(&s->gb); /* sp_for_switch_flag */
3920     }
3921     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3922         get_se_golomb(&s->gb); /* slice_qs_delta */
3923     }
3924
3925     h->deblocking_filter = 1;
3926     h->slice_alpha_c0_offset = 0;
3927     h->slice_beta_offset = 0;
3928     if( h->pps.deblocking_filter_parameters_present ) {
3929         tmp= get_ue_golomb(&s->gb);
3930         if(tmp > 2){
3931             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3932             return -1;
3933         }
3934         h->deblocking_filter= tmp;
3935         if(h->deblocking_filter < 2)
3936             h->deblocking_filter^= 1; // 1<->0
3937
3938         if( h->deblocking_filter ) {
3939             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3940             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3941         }
3942     }
3943
3944     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3945        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3946        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3947        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3948         h->deblocking_filter= 0;
3949
3950     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3951         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3952             /* Cheat slightly for speed:
3953                Do not bother to deblock across slices. */
3954             h->deblocking_filter = 2;
3955         } else {
3956             h0->max_contexts = 1;
3957             if(!h0->single_decode_warning) {
3958                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3959                 h0->single_decode_warning = 1;
3960             }
3961             if(h != h0)
3962                 return 1; // deblocking switched inside frame
3963         }
3964     }
3965
3966 #if 0 //FMO
3967     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3968         slice_group_change_cycle= get_bits(&s->gb, ?);
3969 #endif
3970
3971     h0->last_slice_type = slice_type;
3972     h->slice_num = ++h0->current_slice;
3973     if(h->slice_num >= MAX_SLICES){
3974         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3975     }
3976
3977     for(j=0; j<2; j++){
3978         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3979         ref2frm[0]=
3980         ref2frm[1]= -1;
3981         for(i=0; i<16; i++)
3982             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3983                           +(h->ref_list[j][i].reference&3);
3984         ref2frm[18+0]=
3985         ref2frm[18+1]= -1;
3986         for(i=16; i<48; i++)
3987             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3988                           +(h->ref_list[j][i].reference&3);
3989     }
3990
3991     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3992     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3993
3994     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3995         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3996                h->slice_num,
3997                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3998                first_mb_in_slice,
3999                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4000                pps_id, h->frame_num,
4001                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4002                h->ref_count[0], h->ref_count[1],
4003                s->qscale,
4004                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4005                h->use_weight,
4006                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4007                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4008                );
4009     }
4010
4011     return 0;
4012 }
4013
4014 /**
4015  *
4016  */
4017 static inline int get_level_prefix(GetBitContext *gb){
4018     unsigned int buf;
4019     int log;
4020
4021     OPEN_READER(re, gb);
4022     UPDATE_CACHE(re, gb);
4023     buf=GET_CACHE(re, gb);
4024
4025     log= 32 - av_log2(buf);
4026 #ifdef TRACE
4027     print_bin(buf>>(32-log), log);
4028     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4029 #endif
4030
4031     LAST_SKIP_BITS(re, gb, log);
4032     CLOSE_READER(re, gb);
4033
4034     return log-1;
4035 }
4036
4037 static inline int get_dct8x8_allowed(H264Context *h){
4038     int i;
4039     for(i=0; i<4; i++){
4040         if(!IS_SUB_8X8(h->sub_mb_type[i])
4041            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4042             return 0;
4043     }
4044     return 1;
4045 }
4046
4047 /**
4048  * decodes a residual block.
4049  * @param n block index
4050  * @param scantable scantable
4051  * @param max_coeff number of coefficients in the block
4052  * @return <0 if an error occurred
4053  */
4054 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4055     MpegEncContext * const s = &h->s;
4056     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4057     int level[16];
4058     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4059
4060     //FIXME put trailing_onex into the context
4061
4062     if(n == CHROMA_DC_BLOCK_INDEX){
4063         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4064         total_coeff= coeff_token>>2;
4065     }else{
4066         if(n == LUMA_DC_BLOCK_INDEX){
4067             total_coeff= pred_non_zero_count(h, 0);
4068             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4069             total_coeff= coeff_token>>2;
4070         }else{
4071             total_coeff= pred_non_zero_count(h, n);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4075         }
4076     }
4077
4078     //FIXME set last_non_zero?
4079
4080     if(total_coeff==0)
4081         return 0;
4082     if(total_coeff > (unsigned)max_coeff) {
4083         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4084         return -1;
4085     }
4086
4087     trailing_ones= coeff_token&3;
4088     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4089     assert(total_coeff<=16);
4090
4091     i = show_bits(gb, 3);
4092     skip_bits(gb, trailing_ones);
4093     level[0] = 1-((i&4)>>1);
4094     level[1] = 1-((i&2)   );
4095     level[2] = 1-((i&1)<<1);
4096
4097     if(trailing_ones<total_coeff) {
4098         int level_code, mask;
4099         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4100         int prefix= get_level_prefix(gb);
4101
4102         //first coefficient has suffix_length equal to 0 or 1
4103         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4104             if(suffix_length)
4105                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4106             else
4107                 level_code= (prefix<<suffix_length); //part
4108         }else if(prefix==14){
4109             if(suffix_length)
4110                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4111             else
4112                 level_code= prefix + get_bits(gb, 4); //part
4113         }else{
4114             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4115             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4116             if(prefix>=16)
4117                 level_code += (1<<(prefix-3))-4096;
4118         }
4119
4120         if(trailing_ones < 3) level_code += 2;
4121
4122         suffix_length = 1;
4123         if(level_code > 5)
4124             suffix_length++;
4125         mask= -(level_code&1);
4126         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4127
4128         //remaining coefficients have suffix_length > 0
4129         for(i=trailing_ones+1;i<total_coeff;i++) {
4130             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4131             prefix = get_level_prefix(gb);
4132             if(prefix<15){
4133                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4134             }else{
4135                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4136                 if(prefix>=16)
4137                     level_code += (1<<(prefix-3))-4096;
4138             }
4139             mask= -(level_code&1);
4140             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4141             if(level_code > suffix_limit[suffix_length])
4142                 suffix_length++;
4143         }
4144     }
4145
4146     if(total_coeff == max_coeff)
4147         zeros_left=0;
4148     else{
4149         if(n == CHROMA_DC_BLOCK_INDEX)
4150             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4151         else
4152             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4153     }
4154
4155     coeff_num = zeros_left + total_coeff - 1;
4156     j = scantable[coeff_num];
4157     if(n > 24){
4158         block[j] = level[0];
4159         for(i=1;i<total_coeff;i++) {
4160             if(zeros_left <= 0)
4161                 run_before = 0;
4162             else if(zeros_left < 7){
4163                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4164             }else{
4165                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4166             }
4167             zeros_left -= run_before;
4168             coeff_num -= 1 + run_before;
4169             j= scantable[ coeff_num ];
4170
4171             block[j]= level[i];
4172         }
4173     }else{
4174         block[j] = (level[0] * qmul[j] + 32)>>6;
4175         for(i=1;i<total_coeff;i++) {
4176             if(zeros_left <= 0)
4177                 run_before = 0;
4178             else if(zeros_left < 7){
4179                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4180             }else{
4181                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4182             }
4183             zeros_left -= run_before;
4184             coeff_num -= 1 + run_before;
4185             j= scantable[ coeff_num ];
4186
4187             block[j]= (level[i] * qmul[j] + 32)>>6;
4188         }
4189     }
4190
4191     if(zeros_left<0){
4192         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4193         return -1;
4194     }
4195
4196     return 0;
4197 }
4198
4199 static void predict_field_decoding_flag(H264Context *h){
4200     MpegEncContext * const s = &h->s;
4201     const int mb_xy= h->mb_xy;
4202     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4203                 ? s->current_picture.mb_type[mb_xy-1]
4204                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4205                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4206                 : 0;
4207     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4208 }
4209
4210 /**
4211  * decodes a P_SKIP or B_SKIP macroblock
4212  */
4213 static void decode_mb_skip(H264Context *h){
4214     MpegEncContext * const s = &h->s;
4215     const int mb_xy= h->mb_xy;
4216     int mb_type=0;
4217
4218     memset(h->non_zero_count[mb_xy], 0, 16);
4219     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4220
4221     if(MB_FIELD)
4222         mb_type|= MB_TYPE_INTERLACED;
4223
4224     if( h->slice_type_nos == FF_B_TYPE )
4225     {
4226         // just for fill_caches. pred_direct_motion will set the real mb_type
4227         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4228
4229         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4230         pred_direct_motion(h, &mb_type);
4231         mb_type|= MB_TYPE_SKIP;
4232     }
4233     else
4234     {
4235         int mx, my;
4236         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4237
4238         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4239         pred_pskip_motion(h, &mx, &my);
4240         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4241         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4242     }
4243
4244     write_back_motion(h, mb_type);
4245     s->current_picture.mb_type[mb_xy]= mb_type;
4246     s->current_picture.qscale_table[mb_xy]= s->qscale;
4247     h->slice_table[ mb_xy ]= h->slice_num;
4248     h->prev_mb_skipped= 1;
4249 }
4250
4251 /**
4252  * decodes a macroblock
4253  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4254  */
4255 static int decode_mb_cavlc(H264Context *h){
4256     MpegEncContext * const s = &h->s;
4257     int mb_xy;
4258     int partition_count;
4259     unsigned int mb_type, cbp;
4260     int dct8x8_allowed= h->pps.transform_8x8_mode;
4261
4262     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4263
4264     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4265
4266     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4267     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4268                 down the code */
4269     if(h->slice_type_nos != FF_I_TYPE){
4270         if(s->mb_skip_run==-1)
4271             s->mb_skip_run= get_ue_golomb(&s->gb);
4272
4273         if (s->mb_skip_run--) {
4274             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4275                 if(s->mb_skip_run==0)
4276                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4277                 else
4278                     predict_field_decoding_flag(h);
4279             }
4280             decode_mb_skip(h);
4281             return 0;
4282         }
4283     }
4284     if(FRAME_MBAFF){
4285         if( (s->mb_y&1) == 0 )
4286             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4287     }
4288
4289     h->prev_mb_skipped= 0;
4290
4291     mb_type= get_ue_golomb(&s->gb);
4292     if(h->slice_type_nos == FF_B_TYPE){
4293         if(mb_type < 23){
4294             partition_count= b_mb_type_info[mb_type].partition_count;
4295             mb_type=         b_mb_type_info[mb_type].type;
4296         }else{
4297             mb_type -= 23;
4298             goto decode_intra_mb;
4299         }
4300     }else if(h->slice_type_nos == FF_P_TYPE){
4301         if(mb_type < 5){
4302             partition_count= p_mb_type_info[mb_type].partition_count;
4303             mb_type=         p_mb_type_info[mb_type].type;
4304         }else{
4305             mb_type -= 5;
4306             goto decode_intra_mb;
4307         }
4308     }else{
4309        assert(h->slice_type_nos == FF_I_TYPE);
4310         if(h->slice_type == FF_SI_TYPE && mb_type)
4311             mb_type--;
4312 decode_intra_mb:
4313         if(mb_type > 25){
4314             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4315             return -1;
4316         }
4317         partition_count=0;
4318         cbp= i_mb_type_info[mb_type].cbp;
4319         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4320         mb_type= i_mb_type_info[mb_type].type;
4321     }
4322
4323     if(MB_FIELD)
4324         mb_type |= MB_TYPE_INTERLACED;
4325
4326     h->slice_table[ mb_xy ]= h->slice_num;
4327
4328     if(IS_INTRA_PCM(mb_type)){
4329         unsigned int x;
4330
4331         // We assume these blocks are very rare so we do not optimize it.
4332         align_get_bits(&s->gb);
4333
4334         // The pixels are stored in the same order as levels in h->mb array.
4335         for(x=0; x < (CHROMA ? 384 : 256); x++){
4336             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4337         }
4338
4339         // In deblocking, the quantizer is 0
4340         s->current_picture.qscale_table[mb_xy]= 0;
4341         // All coeffs are present
4342         memset(h->non_zero_count[mb_xy], 16, 16);
4343
4344         s->current_picture.mb_type[mb_xy]= mb_type;
4345         return 0;
4346     }
4347
4348     if(MB_MBAFF){
4349         h->ref_count[0] <<= 1;
4350         h->ref_count[1] <<= 1;
4351     }
4352
4353     fill_caches(h, mb_type, 0);
4354
4355     //mb_pred
4356     if(IS_INTRA(mb_type)){
4357         int pred_mode;
4358 //            init_top_left_availability(h);
4359         if(IS_INTRA4x4(mb_type)){
4360             int i;
4361             int di = 1;
4362             if(dct8x8_allowed && get_bits1(&s->gb)){
4363                 mb_type |= MB_TYPE_8x8DCT;
4364                 di = 4;
4365             }
4366
4367 //                fill_intra4x4_pred_table(h);
4368             for(i=0; i<16; i+=di){
4369                 int mode= pred_intra_mode(h, i);
4370
4371                 if(!get_bits1(&s->gb)){
4372                     const int rem_mode= get_bits(&s->gb, 3);
4373                     mode = rem_mode + (rem_mode >= mode);
4374                 }
4375
4376                 if(di==4)
4377                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4378                 else
4379                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4380             }
4381             write_back_intra_pred_mode(h);
4382             if( check_intra4x4_pred_mode(h) < 0)
4383                 return -1;
4384         }else{
4385             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4386             if(h->intra16x16_pred_mode < 0)
4387                 return -1;
4388         }
4389         if(CHROMA){
4390             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4391             if(pred_mode < 0)
4392                 return -1;
4393             h->chroma_pred_mode= pred_mode;
4394         }
4395     }else if(partition_count==4){
4396         int i, j, sub_partition_count[4], list, ref[2][4];
4397
4398         if(h->slice_type_nos == FF_B_TYPE){
4399             for(i=0; i<4; i++){
4400                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4401                 if(h->sub_mb_type[i] >=13){
4402                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4403                     return -1;
4404                 }
4405                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4406                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4407             }
4408             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4409                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4410                 pred_direct_motion(h, &mb_type);
4411                 h->ref_cache[0][scan8[4]] =
4412                 h->ref_cache[1][scan8[4]] =
4413                 h->ref_cache[0][scan8[12]] =
4414                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4415             }
4416         }else{
4417             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4418             for(i=0; i<4; i++){
4419                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4420                 if(h->sub_mb_type[i] >=4){
4421                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4422                     return -1;
4423                 }
4424                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4425                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4426             }
4427         }
4428
4429         for(list=0; list<h->list_count; list++){
4430             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4431             for(i=0; i<4; i++){
4432                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4433                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4434                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4435                     if(tmp>=ref_count){
4436                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4437                         return -1;
4438                     }
4439                     ref[list][i]= tmp;
4440                 }else{
4441                  //FIXME
4442                     ref[list][i] = -1;
4443                 }
4444             }
4445         }
4446
4447         if(dct8x8_allowed)
4448             dct8x8_allowed = get_dct8x8_allowed(h);
4449
4450         for(list=0; list<h->list_count; list++){
4451             for(i=0; i<4; i++){
4452                 if(IS_DIRECT(h->sub_mb_type[i])) {
4453                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4454                     continue;
4455                 }
4456                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4457                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4458
4459                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4460                     const int sub_mb_type= h->sub_mb_type[i];
4461                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4462                     for(j=0; j<sub_partition_count[i]; j++){
4463                         int mx, my;
4464                         const int index= 4*i + block_width*j;
4465                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4466                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4467                         mx += get_se_golomb(&s->gb);
4468                         my += get_se_golomb(&s->gb);
4469                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4470
4471                         if(IS_SUB_8X8(sub_mb_type)){
4472                             mv_cache[ 1 ][0]=
4473                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4474                             mv_cache[ 1 ][1]=
4475                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4476                         }else if(IS_SUB_8X4(sub_mb_type)){
4477                             mv_cache[ 1 ][0]= mx;
4478                             mv_cache[ 1 ][1]= my;
4479                         }else if(IS_SUB_4X8(sub_mb_type)){
4480                             mv_cache[ 8 ][0]= mx;
4481                             mv_cache[ 8 ][1]= my;
4482                         }
4483                         mv_cache[ 0 ][0]= mx;
4484                         mv_cache[ 0 ][1]= my;
4485                     }
4486                 }else{
4487                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4488                     p[0] = p[1]=
4489                     p[8] = p[9]= 0;
4490                 }
4491             }
4492         }
4493     }else if(IS_DIRECT(mb_type)){
4494         pred_direct_motion(h, &mb_type);
4495         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4496     }else{
4497         int list, mx, my, i;
4498          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4499         if(IS_16X16(mb_type)){
4500             for(list=0; list<h->list_count; list++){
4501                     unsigned int val;
4502                     if(IS_DIR(mb_type, 0, list)){
4503                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4504                         if(val >= h->ref_count[list]){
4505                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4506                             return -1;
4507                         }
4508                     }else
4509                         val= LIST_NOT_USED&0xFF;
4510                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4511             }
4512             for(list=0; list<h->list_count; list++){
4513                 unsigned int val;
4514                 if(IS_DIR(mb_type, 0, list)){
4515                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4516                     mx += get_se_golomb(&s->gb);
4517                     my += get_se_golomb(&s->gb);
4518                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4519
4520                     val= pack16to32(mx,my);
4521                 }else
4522                     val=0;
4523                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4524             }
4525         }
4526         else if(IS_16X8(mb_type)){
4527             for(list=0; list<h->list_count; list++){
4528                     for(i=0; i<2; i++){
4529                         unsigned int val;
4530                         if(IS_DIR(mb_type, i, list)){
4531                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4532                             if(val >= h->ref_count[list]){
4533                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4534                                 return -1;
4535                             }
4536                         }else
4537                             val= LIST_NOT_USED&0xFF;
4538                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4539                     }
4540             }
4541             for(list=0; list<h->list_count; list++){
4542                 for(i=0; i<2; i++){
4543                     unsigned int val;
4544                     if(IS_DIR(mb_type, i, list)){
4545                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4546                         mx += get_se_golomb(&s->gb);
4547                         my += get_se_golomb(&s->gb);
4548                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4549
4550                         val= pack16to32(mx,my);
4551                     }else
4552                         val=0;
4553                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4554                 }
4555             }
4556         }else{
4557             assert(IS_8X16(mb_type));
4558             for(list=0; list<h->list_count; list++){
4559                     for(i=0; i<2; i++){
4560                         unsigned int val;
4561                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4562                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4563                             if(val >= h->ref_count[list]){
4564                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4565                                 return -1;
4566                             }
4567                         }else
4568                             val= LIST_NOT_USED&0xFF;
4569                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4570                     }
4571             }
4572             for(list=0; list<h->list_count; list++){
4573                 for(i=0; i<2; i++){
4574                     unsigned int val;
4575                     if(IS_DIR(mb_type, i, list)){
4576                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4577                         mx += get_se_golomb(&s->gb);
4578                         my += get_se_golomb(&s->gb);
4579                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4580
4581                         val= pack16to32(mx,my);
4582                     }else
4583                         val=0;
4584                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4585                 }
4586             }
4587         }
4588     }
4589
4590     if(IS_INTER(mb_type))
4591         write_back_motion(h, mb_type);
4592
4593     if(!IS_INTRA16x16(mb_type)){
4594         cbp= get_ue_golomb(&s->gb);
4595         if(cbp > 47){
4596             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4597             return -1;
4598         }
4599
4600         if(CHROMA){
4601             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4602             else                     cbp= golomb_to_inter_cbp   [cbp];
4603         }else{
4604             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4605             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4606         }
4607     }
4608     h->cbp = cbp;
4609
4610     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4611         if(get_bits1(&s->gb)){
4612             mb_type |= MB_TYPE_8x8DCT;
4613             h->cbp_table[mb_xy]= cbp;
4614         }
4615     }
4616     s->current_picture.mb_type[mb_xy]= mb_type;
4617
4618     if(cbp || IS_INTRA16x16(mb_type)){
4619         int i8x8, i4x4, chroma_idx;
4620         int dquant;
4621         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4622         const uint8_t *scan, *scan8x8, *dc_scan;
4623
4624 //        fill_non_zero_count_cache(h);
4625
4626         if(IS_INTERLACED(mb_type)){
4627             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4628             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4629             dc_scan= luma_dc_field_scan;
4630         }else{
4631             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4632             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4633             dc_scan= luma_dc_zigzag_scan;
4634         }
4635
4636         dquant= get_se_golomb(&s->gb);
4637
4638         if( dquant > 25 || dquant < -26 ){
4639             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4640             return -1;
4641         }
4642
4643         s->qscale += dquant;
4644         if(((unsigned)s->qscale) > 51){
4645             if(s->qscale<0) s->qscale+= 52;
4646             else            s->qscale-= 52;
4647         }
4648
4649         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4650         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4651         if(IS_INTRA16x16(mb_type)){
4652             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4653                 return -1; //FIXME continue if partitioned and other return -1 too
4654             }
4655
4656             assert((cbp&15) == 0 || (cbp&15) == 15);
4657
4658             if(cbp&15){
4659                 for(i8x8=0; i8x8<4; i8x8++){
4660                     for(i4x4=0; i4x4<4; i4x4++){
4661                         const int index= i4x4 + 4*i8x8;
4662                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4663                             return -1;
4664                         }
4665                     }
4666                 }
4667             }else{
4668                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4669             }
4670         }else{
4671             for(i8x8=0; i8x8<4; i8x8++){
4672                 if(cbp & (1<<i8x8)){
4673                     if(IS_8x8DCT(mb_type)){
4674                         DCTELEM *buf = &h->mb[64*i8x8];
4675                         uint8_t *nnz;
4676                         for(i4x4=0; i4x4<4; i4x4++){
4677                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4678                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4679                                 return -1;
4680                         }
4681                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4682                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4683                     }else{
4684                         for(i4x4=0; i4x4<4; i4x4++){
4685                             const int index= i4x4 + 4*i8x8;
4686
4687                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4688                                 return -1;
4689                             }
4690                         }
4691                     }
4692                 }else{
4693                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4694                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4695                 }
4696             }
4697         }
4698
4699         if(cbp&0x30){
4700             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4701                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4702                     return -1;
4703                 }
4704         }
4705
4706         if(cbp&0x20){
4707             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4708                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4709                 for(i4x4=0; i4x4<4; i4x4++){
4710                     const int index= 16 + 4*chroma_idx + i4x4;
4711                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4712                         return -1;
4713                     }
4714                 }
4715             }
4716         }else{
4717             uint8_t * const nnz= &h->non_zero_count_cache[0];
4718             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4719             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4720         }
4721     }else{
4722         uint8_t * const nnz= &h->non_zero_count_cache[0];
4723         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4724         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4725         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4726     }
4727     s->current_picture.qscale_table[mb_xy]= s->qscale;
4728     write_back_non_zero_count(h);
4729
4730     if(MB_MBAFF){
4731         h->ref_count[0] >>= 1;
4732         h->ref_count[1] >>= 1;
4733     }
4734
4735     return 0;
4736 }
4737
4738 static int decode_cabac_field_decoding_flag(H264Context *h) {
4739     MpegEncContext * const s = &h->s;
4740     const int mb_x = s->mb_x;
4741     const int mb_y = s->mb_y & ~1;
4742     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4743     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4744
4745     unsigned int ctx = 0;
4746
4747     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4748         ctx += 1;
4749     }
4750     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4751         ctx += 1;
4752     }
4753
4754     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4755 }
4756
4757 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4758     uint8_t *state= &h->cabac_state[ctx_base];
4759     int mb_type;
4760
4761     if(intra_slice){
4762         MpegEncContext * const s = &h->s;
4763         const int mba_xy = h->left_mb_xy[0];
4764         const int mbb_xy = h->top_mb_xy;
4765         int ctx=0;
4766         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4767             ctx++;
4768         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4769             ctx++;
4770         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4771             return 0;   /* I4x4 */
4772         state += 2;
4773     }else{
4774         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4775             return 0;   /* I4x4 */
4776     }
4777
4778     if( get_cabac_terminate( &h->cabac ) )
4779         return 25;  /* PCM */
4780
4781     mb_type = 1; /* I16x16 */
4782     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4783     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4784         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4785     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4786     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4787     return mb_type;
4788 }
4789
4790 static int decode_cabac_mb_type( H264Context *h ) {
4791     MpegEncContext * const s = &h->s;
4792
4793     if( h->slice_type_nos == FF_I_TYPE ) {
4794         return decode_cabac_intra_mb_type(h, 3, 1);
4795     } else if( h->slice_type_nos == FF_P_TYPE ) {
4796         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4797             /* P-type */
4798             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4799                 /* P_L0_D16x16, P_8x8 */
4800                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4801             } else {
4802                 /* P_L0_D8x16, P_L0_D16x8 */
4803                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4804             }
4805         } else {
4806             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4807         }
4808     } else {
4809         const int mba_xy = h->left_mb_xy[0];
4810         const int mbb_xy = h->top_mb_xy;
4811         int ctx = 0;
4812         int bits;
4813         assert(h->slice_type_nos == FF_B_TYPE);
4814
4815         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4816             ctx++;
4817         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4818             ctx++;
4819
4820         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4821             return 0; /* B_Direct_16x16 */
4822
4823         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4824             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4825         }
4826
4827         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4828         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4829         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4830         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4831         if( bits < 8 )
4832             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4833         else if( bits == 13 ) {
4834             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4835         } else if( bits == 14 )
4836             return 11; /* B_L1_L0_8x16 */
4837         else if( bits == 15 )
4838             return 22; /* B_8x8 */
4839
4840         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4841         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4842     }
4843 }
4844
4845 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4846     MpegEncContext * const s = &h->s;
4847     int mba_xy, mbb_xy;
4848     int ctx = 0;
4849
4850     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4851         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4852         mba_xy = mb_xy - 1;
4853         if( (mb_y&1)
4854             && h->slice_table[mba_xy] == h->slice_num
4855             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4856             mba_xy += s->mb_stride;
4857         if( MB_FIELD ){
4858             mbb_xy = mb_xy - s->mb_stride;
4859             if( !(mb_y&1)
4860                 && h->slice_table[mbb_xy] == h->slice_num
4861                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4862                 mbb_xy -= s->mb_stride;
4863         }else
4864             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4865     }else{
4866         int mb_xy = h->mb_xy;
4867         mba_xy = mb_xy - 1;
4868         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4869     }
4870
4871     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4872         ctx++;
4873     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4874         ctx++;
4875
4876     if( h->slice_type_nos == FF_B_TYPE )
4877         ctx += 13;
4878     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4879 }
4880
4881 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4882     int mode = 0;
4883
4884     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4885         return pred_mode;
4886
4887     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4888     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4889     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4890
4891     if( mode >= pred_mode )
4892         return mode + 1;
4893     else
4894         return mode;
4895 }
4896
4897 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4898     const int mba_xy = h->left_mb_xy[0];
4899     const int mbb_xy = h->top_mb_xy;
4900
4901     int ctx = 0;
4902
4903     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4904     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4905         ctx++;
4906
4907     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4908         ctx++;
4909
4910     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4911         return 0;
4912
4913     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4914         return 1;
4915     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4916         return 2;
4917     else
4918         return 3;
4919 }
4920
4921 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4922     int cbp_b, cbp_a, ctx, cbp = 0;
4923
4924     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4925     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4926
4927     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4928     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4929     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4930     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4931     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4933     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4935     return cbp;
4936 }
4937 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4938     int ctx;
4939     int cbp_a, cbp_b;
4940
4941     cbp_a = (h->left_cbp>>4)&0x03;
4942     cbp_b = (h-> top_cbp>>4)&0x03;
4943
4944     ctx = 0;
4945     if( cbp_a > 0 ) ctx++;
4946     if( cbp_b > 0 ) ctx += 2;
4947     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4948         return 0;
4949
4950     ctx = 4;
4951     if( cbp_a == 2 ) ctx++;
4952     if( cbp_b == 2 ) ctx += 2;
4953     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4954 }
4955 static int decode_cabac_mb_dqp( H264Context *h) {
4956     int   ctx = 0;
4957     int   val = 0;
4958
4959     if( h->last_qscale_diff != 0 )
4960         ctx++;
4961
4962     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4963         if( ctx < 2 )
4964             ctx = 2;
4965         else
4966             ctx = 3;
4967         val++;
4968         if(val > 102) //prevent infinite loop
4969             return INT_MIN;
4970     }
4971
4972     if( val&0x01 )
4973         return (val + 1)/2;
4974     else
4975         return -(val + 1)/2;
4976 }
4977 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4978     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4979         return 0;   /* 8x8 */
4980     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4981         return 1;   /* 8x4 */
4982     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4983         return 2;   /* 4x8 */
4984     return 3;       /* 4x4 */
4985 }
4986 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4987     int type;
4988     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4989         return 0;   /* B_Direct_8x8 */
4990     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4991         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4992     type = 3;
4993     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4994         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4995             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4996         type += 4;
4997     }
4998     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4999     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5000     return type;
5001 }
5002
5003 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5004     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5005 }
5006
5007 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5008     int refa = h->ref_cache[list][scan8[n] - 1];
5009     int refb = h->ref_cache[list][scan8[n] - 8];
5010     int ref  = 0;
5011     int ctx  = 0;
5012
5013     if( h->slice_type_nos == FF_B_TYPE) {
5014         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5015             ctx++;
5016         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5017             ctx += 2;
5018     } else {
5019         if( refa > 0 )
5020             ctx++;
5021         if( refb > 0 )
5022             ctx += 2;
5023     }
5024
5025     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5026         ref++;
5027         if( ctx < 4 )
5028             ctx = 4;
5029         else
5030             ctx = 5;
5031         if(ref >= 32 /*h->ref_list[list]*/){
5032             return -1;
5033         }
5034     }
5035     return ref;
5036 }
5037
5038 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5039     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5040                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5041     int ctxbase = (l == 0) ? 40 : 47;
5042     int mvd;
5043     int ctx = (amvd>2) + (amvd>32);
5044
5045     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5046         return 0;
5047
5048     mvd= 1;
5049     ctx= 3;
5050     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5051         mvd++;
5052         if( ctx < 6 )
5053             ctx++;
5054     }
5055
5056     if( mvd >= 9 ) {
5057         int k = 3;
5058         while( get_cabac_bypass( &h->cabac ) ) {
5059             mvd += 1 << k;
5060             k++;
5061             if(k>24){
5062                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5063                 return INT_MIN;
5064             }
5065         }
5066         while( k-- ) {
5067             if( get_cabac_bypass( &h->cabac ) )
5068                 mvd += 1 << k;
5069         }
5070     }
5071     return get_cabac_bypass_sign( &h->cabac, -mvd );
5072 }
5073
5074 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5075     int nza, nzb;
5076     int ctx = 0;
5077
5078     if( is_dc ) {
5079         if( cat == 0 ) {
5080             nza = h->left_cbp&0x100;
5081             nzb = h-> top_cbp&0x100;
5082         } else {
5083             nza = (h->left_cbp>>(6+idx))&0x01;
5084             nzb = (h-> top_cbp>>(6+idx))&0x01;
5085         }
5086     } else {
5087         assert(cat == 1 || cat == 2 || cat == 4);
5088         nza = h->non_zero_count_cache[scan8[idx] - 1];
5089         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5090     }
5091
5092     if( nza > 0 )
5093         ctx++;
5094
5095     if( nzb > 0 )
5096         ctx += 2;
5097
5098     return ctx + 4 * cat;
5099 }
5100
/**
 * Context offset of the "last significant coefficient" flag for 8x8 blocks,
 * indexed by the coefficient position 0..62. decode_cabac_residual_internal()
 * adds the entry to its last-coefficient context base.
 * NOTE(review): declared through DECLARE_ASM_CONST, presumably so that the
 * x86 assembly significance decoder can reference it as well -- confirm.
 */
DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
5107
5108 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5109     static const int significant_coeff_flag_offset[2][6] = {
5110       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5111       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5112     };
5113     static const int last_coeff_flag_offset[2][6] = {
5114       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5115       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5116     };
5117     static const int coeff_abs_level_m1_offset[6] = {
5118         227+0, 227+10, 227+20, 227+30, 227+39, 426
5119     };
5120     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5121       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5122         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5123         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5124        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5125       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5126         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5127         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5128         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5129     };
5130     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5131      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5132      * map node ctx => cabac ctx for level=1 */
5133     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5134     /* map node ctx => cabac ctx for level>1 */
5135     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5136     static const uint8_t coeff_abs_level_transition[2][8] = {
5137     /* update node ctx after decoding a level=1 */
5138         { 1, 2, 3, 3, 4, 5, 6, 7 },
5139     /* update node ctx after decoding a level>1 */
5140         { 4, 4, 4, 4, 5, 6, 7, 7 }
5141     };
5142
5143     int index[64];
5144
5145     int av_unused last;
5146     int coeff_count = 0;
5147     int node_ctx = 0;
5148
5149     uint8_t *significant_coeff_ctx_base;
5150     uint8_t *last_coeff_ctx_base;
5151     uint8_t *abs_level_m1_ctx_base;
5152
5153 #ifndef ARCH_X86
5154 #define CABAC_ON_STACK
5155 #endif
5156 #ifdef CABAC_ON_STACK
5157 #define CC &cc
5158     CABACContext cc;
5159     cc.range     = h->cabac.range;
5160     cc.low       = h->cabac.low;
5161     cc.bytestream= h->cabac.bytestream;
5162 #else
5163 #define CC &h->cabac
5164 #endif
5165
5166
5167     /* cat: 0-> DC 16x16  n = 0
5168      *      1-> AC 16x16  n = luma4x4idx
5169      *      2-> Luma4x4   n = luma4x4idx
5170      *      3-> DC Chroma n = iCbCr
5171      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5172      *      5-> Luma8x8   n = 4 * luma8x8idx
5173      */
5174
5175     /* read coded block flag */
5176     if( is_dc || cat != 5 ) {
5177         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5178             if( !is_dc )
5179                 h->non_zero_count_cache[scan8[n]] = 0;
5180
5181 #ifdef CABAC_ON_STACK
5182             h->cabac.range     = cc.range     ;
5183             h->cabac.low       = cc.low       ;
5184             h->cabac.bytestream= cc.bytestream;
5185 #endif
5186             return;
5187         }
5188     }
5189
5190     significant_coeff_ctx_base = h->cabac_state
5191         + significant_coeff_flag_offset[MB_FIELD][cat];
5192     last_coeff_ctx_base = h->cabac_state
5193         + last_coeff_flag_offset[MB_FIELD][cat];
5194     abs_level_m1_ctx_base = h->cabac_state
5195         + coeff_abs_level_m1_offset[cat];
5196
5197     if( !is_dc && cat == 5 ) {
5198 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5199         for(last= 0; last < coefs; last++) { \
5200             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5201             if( get_cabac( CC, sig_ctx )) { \
5202                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5203                 index[coeff_count++] = last; \
5204                 if( get_cabac( CC, last_ctx ) ) { \
5205                     last= max_coeff; \
5206                     break; \
5207                 } \
5208             } \
5209         }\
5210         if( last == max_coeff -1 ) {\
5211             index[coeff_count++] = last;\
5212         }
5213         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5214 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5215         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5216     } else {
5217         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5218 #else
5219         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5220     } else {
5221         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5222 #endif
5223     }
5224     assert(coeff_count > 0);
5225
5226     if( is_dc ) {
5227         if( cat == 0 )
5228             h->cbp_table[h->mb_xy] |= 0x100;
5229         else
5230             h->cbp_table[h->mb_xy] |= 0x40 << n;
5231     } else {
5232         if( cat == 5 )
5233             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5234         else {
5235             assert( cat == 1 || cat == 2 || cat == 4 );
5236             h->non_zero_count_cache[scan8[n]] = coeff_count;
5237         }
5238     }
5239
5240     do {
5241         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5242
5243         int j= scantable[index[--coeff_count]];
5244
5245         if( get_cabac( CC, ctx ) == 0 ) {
5246             node_ctx = coeff_abs_level_transition[0][node_ctx];
5247             if( is_dc ) {
5248                 block[j] = get_cabac_bypass_sign( CC, -1);
5249             }else{
5250                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5251             }
5252         } else {
5253             int coeff_abs = 2;
5254             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5255             node_ctx = coeff_abs_level_transition[1][node_ctx];
5256
5257             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5258                 coeff_abs++;
5259             }
5260
5261             if( coeff_abs >= 15 ) {
5262                 int j = 0;
5263                 while( get_cabac_bypass( CC ) ) {
5264                     j++;
5265                 }
5266
5267                 coeff_abs=1;
5268                 while( j-- ) {
5269                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5270                 }
5271                 coeff_abs+= 14;
5272             }
5273
5274             if( is_dc ) {
5275                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5276             }else{
5277                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5278             }
5279         }
5280     } while( coeff_count );
5281 #ifdef CABAC_ON_STACK
5282             h->cabac.range     = cc.range     ;
5283             h->cabac.low       = cc.low       ;
5284             h->cabac.bytestream= cc.bytestream;
5285 #endif
5286
5287 }
5288
5289 #ifndef CONFIG_SMALL
5290 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5291     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5292 }
5293
5294 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5295     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5296 }
5297 #endif
5298
5299 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5300 #ifdef CONFIG_SMALL
5301     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5302 #else
5303     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5304     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5305 #endif
5306 }
5307
5308 static inline void compute_mb_neighbors(H264Context *h)
5309 {
5310     MpegEncContext * const s = &h->s;
5311     const int mb_xy  = h->mb_xy;
5312     h->top_mb_xy     = mb_xy - s->mb_stride;
5313     h->left_mb_xy[0] = mb_xy - 1;
5314     if(FRAME_MBAFF){
5315         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5316         const int top_pair_xy      = pair_xy     - s->mb_stride;
5317         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5318         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5319         const int curr_mb_frame_flag = !MB_FIELD;
5320         const int bottom = (s->mb_y & 1);
5321         if (bottom
5322                 ? !curr_mb_frame_flag // bottom macroblock
5323                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5324                 ) {
5325             h->top_mb_xy -= s->mb_stride;
5326         }
5327         if (left_mb_frame_flag != curr_mb_frame_flag) {
5328             h->left_mb_xy[0] = pair_xy - 1;
5329         }
5330     } else if (FIELD_PICTURE) {
5331         h->top_mb_xy -= s->mb_stride;
5332     }
5333     return;
5334 }
5335
5336 /**
5337  * decodes a macroblock
5338  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5339  */
5340 static int decode_mb_cabac(H264Context *h) {
5341     MpegEncContext * const s = &h->s;
5342     int mb_xy;
5343     int mb_type, partition_count, cbp = 0;
5344     int dct8x8_allowed= h->pps.transform_8x8_mode;
5345
5346     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5347
5348     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5349
5350     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5351     if( h->slice_type_nos != FF_I_TYPE ) {
5352         int skip;
5353         /* a skipped mb needs the aff flag from the following mb */
5354         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5355             predict_field_decoding_flag(h);
5356         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5357             skip = h->next_mb_skipped;
5358         else
5359             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5360         /* read skip flags */
5361         if( skip ) {
5362             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5363                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5364                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5365                 if(h->next_mb_skipped)
5366                     predict_field_decoding_flag(h);
5367                 else
5368                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5369             }
5370
5371             decode_mb_skip(h);
5372
5373             h->cbp_table[mb_xy] = 0;
5374             h->chroma_pred_mode_table[mb_xy] = 0;
5375             h->last_qscale_diff = 0;
5376
5377             return 0;
5378
5379         }
5380     }
5381     if(FRAME_MBAFF){
5382         if( (s->mb_y&1) == 0 )
5383             h->mb_mbaff =
5384             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5385     }
5386
5387     h->prev_mb_skipped = 0;
5388
5389     compute_mb_neighbors(h);
5390     mb_type = decode_cabac_mb_type( h );
5391     assert(mb_type >= 0);
5392
5393     if( h->slice_type_nos == FF_B_TYPE ) {
5394         if( mb_type < 23 ){
5395             partition_count= b_mb_type_info[mb_type].partition_count;
5396             mb_type=         b_mb_type_info[mb_type].type;
5397         }else{
5398             mb_type -= 23;
5399             goto decode_intra_mb;
5400         }
5401     } else if( h->slice_type_nos == FF_P_TYPE ) {
5402         if( mb_type < 5) {
5403             partition_count= p_mb_type_info[mb_type].partition_count;
5404             mb_type=         p_mb_type_info[mb_type].type;
5405         } else {
5406             mb_type -= 5;
5407             goto decode_intra_mb;
5408         }
5409     } else {
5410         if(h->slice_type == FF_SI_TYPE && mb_type)
5411             mb_type--;
5412         assert(h->slice_type_nos == FF_I_TYPE);
5413 decode_intra_mb:
5414         partition_count = 0;
5415         cbp= i_mb_type_info[mb_type].cbp;
5416         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5417         mb_type= i_mb_type_info[mb_type].type;
5418     }
5419     if(MB_FIELD)
5420         mb_type |= MB_TYPE_INTERLACED;
5421
5422     h->slice_table[ mb_xy ]= h->slice_num;
5423
5424     if(IS_INTRA_PCM(mb_type)) {
5425         const uint8_t *ptr;
5426
5427         // We assume these blocks are very rare so we do not optimize it.
5428         // FIXME The two following lines get the bitstream position in the cabac
5429         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5430         ptr= h->cabac.bytestream;
5431         if(h->cabac.low&0x1) ptr--;
5432         if(CABAC_BITS==16){
5433             if(h->cabac.low&0x1FF) ptr--;
5434         }
5435
5436         // The pixels are stored in the same order as levels in h->mb array.
5437         memcpy(h->mb, ptr, 256); ptr+=256;
5438         if(CHROMA){
5439             memcpy(h->mb+128, ptr, 128); ptr+=128;
5440         }
5441
5442         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5443
5444         // All blocks are present
5445         h->cbp_table[mb_xy] = 0x1ef;
5446         h->chroma_pred_mode_table[mb_xy] = 0;
5447         // In deblocking, the quantizer is 0
5448         s->current_picture.qscale_table[mb_xy]= 0;
5449         // All coeffs are present
5450         memset(h->non_zero_count[mb_xy], 16, 16);
5451         s->current_picture.mb_type[mb_xy]= mb_type;
5452         h->last_qscale_diff = 0;
5453         return 0;
5454     }
5455
5456     if(MB_MBAFF){
5457         h->ref_count[0] <<= 1;
5458         h->ref_count[1] <<= 1;
5459     }
5460
5461     fill_caches(h, mb_type, 0);
5462
5463     if( IS_INTRA( mb_type ) ) {
5464         int i, pred_mode;
5465         if( IS_INTRA4x4( mb_type ) ) {
5466             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5467                 mb_type |= MB_TYPE_8x8DCT;
5468                 for( i = 0; i < 16; i+=4 ) {
5469                     int pred = pred_intra_mode( h, i );
5470                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5471                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5472                 }
5473             } else {
5474                 for( i = 0; i < 16; i++ ) {
5475                     int pred = pred_intra_mode( h, i );
5476                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5477
5478                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5479                 }
5480             }
5481             write_back_intra_pred_mode(h);
5482             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5483         } else {
5484             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5485             if( h->intra16x16_pred_mode < 0 ) return -1;
5486         }
5487         if(CHROMA){
5488             h->chroma_pred_mode_table[mb_xy] =
5489             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5490
5491             pred_mode= check_intra_pred_mode( h, pred_mode );
5492             if( pred_mode < 0 ) return -1;
5493             h->chroma_pred_mode= pred_mode;
5494         }
5495     } else if( partition_count == 4 ) {
5496         int i, j, sub_partition_count[4], list, ref[2][4];
5497
5498         if( h->slice_type_nos == FF_B_TYPE ) {
5499             for( i = 0; i < 4; i++ ) {
5500                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5501                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5502                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5503             }
5504             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5505                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5506                 pred_direct_motion(h, &mb_type);
5507                 h->ref_cache[0][scan8[4]] =
5508                 h->ref_cache[1][scan8[4]] =
5509                 h->ref_cache[0][scan8[12]] =
5510                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5511                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5512                     for( i = 0; i < 4; i++ )
5513                         if( IS_DIRECT(h->sub_mb_type[i]) )
5514                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5515                 }
5516             }
5517         } else {
5518             for( i = 0; i < 4; i++ ) {
5519                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5520                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5521                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5522             }
5523         }
5524
5525         for( list = 0; list < h->list_count; list++ ) {
5526                 for( i = 0; i < 4; i++ ) {
5527                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5528                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5529                         if( h->ref_count[list] > 1 ){
5530                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5531                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5532                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5533                                 return -1;
5534                             }
5535                         }else
5536                             ref[list][i] = 0;
5537                     } else {
5538                         ref[list][i] = -1;
5539                     }
5540                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5541                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5542                 }
5543         }
5544
5545         if(dct8x8_allowed)
5546             dct8x8_allowed = get_dct8x8_allowed(h);
5547
5548         for(list=0; list<h->list_count; list++){
5549             for(i=0; i<4; i++){
5550                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5551                 if(IS_DIRECT(h->sub_mb_type[i])){
5552                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5553                     continue;
5554                 }
5555
5556                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5557                     const int sub_mb_type= h->sub_mb_type[i];
5558                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5559                     for(j=0; j<sub_partition_count[i]; j++){
5560                         int mpx, mpy;
5561                         int mx, my;
5562                         const int index= 4*i + block_width*j;
5563                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5564                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5565                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5566
5567                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5568                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5569                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5570
5571                         if(IS_SUB_8X8(sub_mb_type)){
5572                             mv_cache[ 1 ][0]=
5573                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5574                             mv_cache[ 1 ][1]=
5575                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5576
5577                             mvd_cache[ 1 ][0]=
5578                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5579                             mvd_cache[ 1 ][1]=
5580                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5581                         }else if(IS_SUB_8X4(sub_mb_type)){
5582                             mv_cache[ 1 ][0]= mx;
5583                             mv_cache[ 1 ][1]= my;
5584
5585                             mvd_cache[ 1 ][0]= mx - mpx;
5586                             mvd_cache[ 1 ][1]= my - mpy;
5587                         }else if(IS_SUB_4X8(sub_mb_type)){
5588                             mv_cache[ 8 ][0]= mx;
5589                             mv_cache[ 8 ][1]= my;
5590
5591                             mvd_cache[ 8 ][0]= mx - mpx;
5592                             mvd_cache[ 8 ][1]= my - mpy;
5593                         }
5594                         mv_cache[ 0 ][0]= mx;
5595                         mv_cache[ 0 ][1]= my;
5596
5597                         mvd_cache[ 0 ][0]= mx - mpx;
5598                         mvd_cache[ 0 ][1]= my - mpy;
5599                     }
5600                 }else{
5601                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5602                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5603                     p[0] = p[1] = p[8] = p[9] = 0;
5604                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5605                 }
5606             }
5607         }
5608     } else if( IS_DIRECT(mb_type) ) {
5609         pred_direct_motion(h, &mb_type);
5610         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5611         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5612         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5613     } else {
5614         int list, mx, my, i, mpx, mpy;
5615         if(IS_16X16(mb_type)){
5616             for(list=0; list<h->list_count; list++){
5617                 if(IS_DIR(mb_type, 0, list)){
5618                     int ref;
5619                     if(h->ref_count[list] > 1){
5620                         ref= decode_cabac_mb_ref(h, list, 0);
5621                         if(ref >= (unsigned)h->ref_count[list]){
5622                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5623                             return -1;
5624                         }
5625                     }else
5626                         ref=0;
5627                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5628                 }else
5629                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5630             }
5631             for(list=0; list<h->list_count; list++){
5632                 if(IS_DIR(mb_type, 0, list)){
5633                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5634
5635                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5636                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5637                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5638
5639                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5640                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5641                 }else
5642                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5643             }
5644         }
5645         else if(IS_16X8(mb_type)){
5646             for(list=0; list<h->list_count; list++){
5647                     for(i=0; i<2; i++){
5648                         if(IS_DIR(mb_type, i, list)){
5649                             int ref;
5650                             if(h->ref_count[list] > 1){
5651                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5652                                 if(ref >= (unsigned)h->ref_count[list]){
5653                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5654                                     return -1;
5655                                 }
5656                             }else
5657                                 ref=0;
5658                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5659                         }else
5660                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5661                     }
5662             }
5663             for(list=0; list<h->list_count; list++){
5664                 for(i=0; i<2; i++){
5665                     if(IS_DIR(mb_type, i, list)){
5666                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5667                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5668                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5669                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5670
5671                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5672                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5673                     }else{
5674                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5675                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5676                     }
5677                 }
5678             }
5679         }else{
5680             assert(IS_8X16(mb_type));
5681             for(list=0; list<h->list_count; list++){
5682                     for(i=0; i<2; i++){
5683                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5684                             int ref;
5685                             if(h->ref_count[list] > 1){
5686                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5687                                 if(ref >= (unsigned)h->ref_count[list]){
5688                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5689                                     return -1;
5690                                 }
5691                             }else
5692                                 ref=0;
5693                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5694                         }else
5695                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5696                     }
5697             }
5698             for(list=0; list<h->list_count; list++){
5699                 for(i=0; i<2; i++){
5700                     if(IS_DIR(mb_type, i, list)){
5701                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5702                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5703                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5704
5705                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5706                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5707                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5708                     }else{
5709                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5710                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5711                     }
5712                 }
5713             }
5714         }
5715     }
5716
5717    if( IS_INTER( mb_type ) ) {
5718         h->chroma_pred_mode_table[mb_xy] = 0;
5719         write_back_motion( h, mb_type );
5720    }
5721
5722     if( !IS_INTRA16x16( mb_type ) ) {
5723         cbp  = decode_cabac_mb_cbp_luma( h );
5724         if(CHROMA)
5725             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5726     }
5727
5728     h->cbp_table[mb_xy] = h->cbp = cbp;
5729
5730     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5731         if( decode_cabac_mb_transform_size( h ) )
5732             mb_type |= MB_TYPE_8x8DCT;
5733     }
5734     s->current_picture.mb_type[mb_xy]= mb_type;
5735
5736     if( cbp || IS_INTRA16x16( mb_type ) ) {
5737         const uint8_t *scan, *scan8x8, *dc_scan;
5738         const uint32_t *qmul;
5739         int dqp;
5740
5741         if(IS_INTERLACED(mb_type)){
5742             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5743             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5744             dc_scan= luma_dc_field_scan;
5745         }else{
5746             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5747             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5748             dc_scan= luma_dc_zigzag_scan;
5749         }
5750
5751         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5752         if( dqp == INT_MIN ){
5753             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5754             return -1;
5755         }
5756         s->qscale += dqp;
5757         if(((unsigned)s->qscale) > 51){
5758             if(s->qscale<0) s->qscale+= 52;
5759             else            s->qscale-= 52;
5760         }
5761         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5762         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5763
5764         if( IS_INTRA16x16( mb_type ) ) {
5765             int i;
5766             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5767             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5768
5769             if( cbp&15 ) {
5770                 qmul = h->dequant4_coeff[0][s->qscale];
5771                 for( i = 0; i < 16; i++ ) {
5772                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5773                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5774                 }
5775             } else {
5776                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5777             }
5778         } else {
5779             int i8x8, i4x4;
5780             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5781                 if( cbp & (1<<i8x8) ) {
5782                     if( IS_8x8DCT(mb_type) ) {
5783                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5784                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5785                     } else {
5786                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5787                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5788                             const int index = 4*i8x8 + i4x4;
5789                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5790 //START_TIMER
5791                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5792 //STOP_TIMER("decode_residual")
5793                         }
5794                     }
5795                 } else {
5796                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5797                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5798                 }
5799             }
5800         }
5801
5802         if( cbp&0x30 ){
5803             int c;
5804             for( c = 0; c < 2; c++ ) {
5805                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5806                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5807             }
5808         }
5809
5810         if( cbp&0x20 ) {
5811             int c, i;
5812             for( c = 0; c < 2; c++ ) {
5813                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5814                 for( i = 0; i < 4; i++ ) {
5815                     const int index = 16 + 4 * c + i;
5816                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5817                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5818                 }
5819             }
5820         } else {
5821             uint8_t * const nnz= &h->non_zero_count_cache[0];
5822             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5823             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5824         }
5825     } else {
5826         uint8_t * const nnz= &h->non_zero_count_cache[0];
5827         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5828         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5829         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5830         h->last_qscale_diff = 0;
5831     }
5832
5833     s->current_picture.qscale_table[mb_xy]= s->qscale;
5834     write_back_non_zero_count(h);
5835
5836     if(MB_MBAFF){
5837         h->ref_count[0] >>= 1;
5838         h->ref_count[1] >>= 1;
5839     }
5840
5841     return 0;
5842 }
5843
5844
5845 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5846     int i, d;
5847     const int index_a = qp + h->slice_alpha_c0_offset;
5848     const int alpha = (alpha_table+52)[index_a];
5849     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5850
5851     if( bS[0] < 4 ) {
5852         int8_t tc[4];
5853         for(i=0; i<4; i++)
5854             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5855         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5856     } else {
5857         /* 16px edge length, because bS=4 is triggered by being at
5858          * the edge of an intra MB, so all 4 bS are the same */
5859             for( d = 0; d < 16; d++ ) {
5860                 const int p0 = pix[-1];
5861                 const int p1 = pix[-2];
5862                 const int p2 = pix[-3];
5863
5864                 const int q0 = pix[0];
5865                 const int q1 = pix[1];
5866                 const int q2 = pix[2];
5867
5868                 if( FFABS( p0 - q0 ) < alpha &&
5869                     FFABS( p1 - p0 ) < beta &&
5870                     FFABS( q1 - q0 ) < beta ) {
5871
5872                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5873                         if( FFABS( p2 - p0 ) < beta)
5874                         {
5875                             const int p3 = pix[-4];
5876                             /* p0', p1', p2' */
5877                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5878                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5879                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5880                         } else {
5881                             /* p0' */
5882                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5883                         }
5884                         if( FFABS( q2 - q0 ) < beta)
5885                         {
5886                             const int q3 = pix[3];
5887                             /* q0', q1', q2' */
5888                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5889                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5890                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5891                         } else {
5892                             /* q0' */
5893                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5894                         }
5895                     }else{
5896                         /* p0', q0' */
5897                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5898                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5899                     }
5900                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5901                 }
5902                 pix += stride;
5903             }
5904     }
5905 }
5906 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5907     int i;
5908     const int index_a = qp + h->slice_alpha_c0_offset;
5909     const int alpha = (alpha_table+52)[index_a];
5910     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5911
5912     if( bS[0] < 4 ) {
5913         int8_t tc[4];
5914         for(i=0; i<4; i++)
5915             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5916         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5917     } else {
5918         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5919     }
5920 }
5921
5922 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5923     int i;
5924     for( i = 0; i < 16; i++, pix += stride) {
5925         int index_a;
5926         int alpha;
5927         int beta;
5928
5929         int qp_index;
5930         int bS_index = (i >> 1);
5931         if (!MB_FIELD) {
5932             bS_index &= ~1;
5933             bS_index |= (i & 1);
5934         }
5935
5936         if( bS[bS_index] == 0 ) {
5937             continue;
5938         }
5939
5940         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5941         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5942         alpha = (alpha_table+52)[index_a];
5943         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5944
5945         if( bS[bS_index] < 4 ) {
5946             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5947             const int p0 = pix[-1];
5948             const int p1 = pix[-2];
5949             const int p2 = pix[-3];
5950             const int q0 = pix[0];
5951             const int q1 = pix[1];
5952             const int q2 = pix[2];
5953
5954             if( FFABS( p0 - q0 ) < alpha &&
5955                 FFABS( p1 - p0 ) < beta &&
5956                 FFABS( q1 - q0 ) < beta ) {
5957                 int tc = tc0;
5958                 int i_delta;
5959
5960                 if( FFABS( p2 - p0 ) < beta ) {
5961                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5962                     tc++;
5963                 }
5964                 if( FFABS( q2 - q0 ) < beta ) {
5965                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5966                     tc++;
5967                 }
5968
5969                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5970                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5971                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5972                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5973             }
5974         }else{
5975             const int p0 = pix[-1];
5976             const int p1 = pix[-2];
5977             const int p2 = pix[-3];
5978
5979             const int q0 = pix[0];
5980             const int q1 = pix[1];
5981             const int q2 = pix[2];
5982
5983             if( FFABS( p0 - q0 ) < alpha &&
5984                 FFABS( p1 - p0 ) < beta &&
5985                 FFABS( q1 - q0 ) < beta ) {
5986
5987                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5988                     if( FFABS( p2 - p0 ) < beta)
5989                     {
5990                         const int p3 = pix[-4];
5991                         /* p0', p1', p2' */
5992                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5993                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5994                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5995                     } else {
5996                         /* p0' */
5997                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5998                     }
5999                     if( FFABS( q2 - q0 ) < beta)
6000                     {
6001                         const int q3 = pix[3];
6002                         /* q0', q1', q2' */
6003                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6004                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6005                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6006                     } else {
6007                         /* q0' */
6008                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6009                     }
6010                 }else{
6011                     /* p0', q0' */
6012                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6013                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6014                 }
6015                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6016             }
6017         }
6018     }
6019 }
6020 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6021     int i;
6022     for( i = 0; i < 8; i++, pix += stride) {
6023         int index_a;
6024         int alpha;
6025         int beta;
6026
6027         int qp_index;
6028         int bS_index = i;
6029
6030         if( bS[bS_index] == 0 ) {
6031             continue;
6032         }
6033
6034         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6035         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6036         alpha = (alpha_table+52)[index_a];
6037         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6038
6039         if( bS[bS_index] < 4 ) {
6040             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6041             const int p0 = pix[-1];
6042             const int p1 = pix[-2];
6043             const int q0 = pix[0];
6044             const int q1 = pix[1];
6045
6046             if( FFABS( p0 - q0 ) < alpha &&
6047                 FFABS( p1 - p0 ) < beta &&
6048                 FFABS( q1 - q0 ) < beta ) {
6049                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6050
6051                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6052                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6053                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6054             }
6055         }else{
6056             const int p0 = pix[-1];
6057             const int p1 = pix[-2];
6058             const int q0 = pix[0];
6059             const int q1 = pix[1];
6060
6061             if( FFABS( p0 - q0 ) < alpha &&
6062                 FFABS( p1 - p0 ) < beta &&
6063                 FFABS( q1 - q0 ) < beta ) {
6064
6065                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6066                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6067                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6068             }
6069         }
6070     }
6071 }
6072
6073 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6074     int i, d;
6075     const int index_a = qp + h->slice_alpha_c0_offset;
6076     const int alpha = (alpha_table+52)[index_a];
6077     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6078     const int pix_next  = stride;
6079
6080     if( bS[0] < 4 ) {
6081         int8_t tc[4];
6082         for(i=0; i<4; i++)
6083             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6084         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6085     } else {
6086         /* 16px edge length, see filter_mb_edgev */
6087             for( d = 0; d < 16; d++ ) {
6088                 const int p0 = pix[-1*pix_next];
6089                 const int p1 = pix[-2*pix_next];
6090                 const int p2 = pix[-3*pix_next];
6091                 const int q0 = pix[0];
6092                 const int q1 = pix[1*pix_next];
6093                 const int q2 = pix[2*pix_next];
6094
6095                 if( FFABS( p0 - q0 ) < alpha &&
6096                     FFABS( p1 - p0 ) < beta &&
6097                     FFABS( q1 - q0 ) < beta ) {
6098
6099                     const int p3 = pix[-4*pix_next];
6100                     const int q3 = pix[ 3*pix_next];
6101
6102                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6103                         if( FFABS( p2 - p0 ) < beta) {
6104                             /* p0', p1', p2' */
6105                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6106                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6107                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6108                         } else {
6109                             /* p0' */
6110                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6111                         }
6112                         if( FFABS( q2 - q0 ) < beta) {
6113                             /* q0', q1', q2' */
6114                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6115                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6116                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6117                         } else {
6118                             /* q0' */
6119                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6120                         }
6121                     }else{
6122                         /* p0', q0' */
6123                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6124                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6125                     }
6126                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6127                 }
6128                 pix++;
6129             }
6130     }
6131 }
6132
6133 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6134     int i;
6135     const int index_a = qp + h->slice_alpha_c0_offset;
6136     const int alpha = (alpha_table+52)[index_a];
6137     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6138
6139     if( bS[0] < 4 ) {
6140         int8_t tc[4];
6141         for(i=0; i<4; i++)
6142             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6143         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6144     } else {
6145         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6146     }
6147 }
6148
6149 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6150     MpegEncContext * const s = &h->s;
6151     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6152     int mb_xy, mb_type;
6153     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6154
6155     mb_xy = h->mb_xy;
6156
6157     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6158         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6159        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6160                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6161         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6162         return;
6163     }
6164     assert(!FRAME_MBAFF);
6165
6166     mb_type = s->current_picture.mb_type[mb_xy];
6167     qp = s->current_picture.qscale_table[mb_xy];
6168     qp0 = s->current_picture.qscale_table[mb_xy-1];
6169     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6170     qpc = get_chroma_qp( h, 0, qp );
6171     qpc0 = get_chroma_qp( h, 0, qp0 );
6172     qpc1 = get_chroma_qp( h, 0, qp1 );
6173     qp0 = (qp + qp0 + 1) >> 1;
6174     qp1 = (qp + qp1 + 1) >> 1;
6175     qpc0 = (qpc + qpc0 + 1) >> 1;
6176     qpc1 = (qpc + qpc1 + 1) >> 1;
6177     qp_thresh = 15 - h->slice_alpha_c0_offset;
6178     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6179        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6180         return;
6181
6182     if( IS_INTRA(mb_type) ) {
6183         int16_t bS4[4] = {4,4,4,4};
6184         int16_t bS3[4] = {3,3,3,3};
6185         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6186         if( IS_8x8DCT(mb_type) ) {
6187             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6188             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6189             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6190             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6191         } else {
6192             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6193             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6194             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6195             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6196             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6197             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6198             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6199             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6200         }
6201         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6202         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6203         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6204         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6205         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6206         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6207         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6208         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6209         return;
6210     } else {
6211         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6212         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6213         int edges;
6214         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6215             edges = 4;
6216             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6217         } else {
6218             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6219                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6220             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6221                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6222                              ? 3 : 0;
6223             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6224             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6225             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6226                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6227         }
6228         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6229             bSv[0][0] = 0x0004000400040004ULL;
6230         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6231             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6232
6233 #define FILTER(hv,dir,edge)\
6234         if(bSv[dir][edge]) {\
6235             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6236             if(!(edge&1)) {\
6237                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6238                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6239             }\
6240         }
6241         if( edges == 1 ) {
6242             FILTER(v,0,0);
6243             FILTER(h,1,0);
6244         } else if( IS_8x8DCT(mb_type) ) {
6245             FILTER(v,0,0);
6246             FILTER(v,0,2);
6247             FILTER(h,1,0);
6248             FILTER(h,1,2);
6249         } else {
6250             FILTER(v,0,0);
6251             FILTER(v,0,1);
6252             FILTER(v,0,2);
6253             FILTER(v,0,3);
6254             FILTER(h,1,0);
6255             FILTER(h,1,1);
6256             FILTER(h,1,2);
6257             FILTER(h,1,3);
6258         }
6259 #undef FILTER
6260     }
6261 }
6262
6263
6264 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6265     MpegEncContext * const s = &h->s;
6266     int edge;
6267     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6268     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6269     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6270     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6271     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6272
6273     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6274                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6275     // how often to recheck mv-based bS when iterating between edges
6276     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6277                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6278     // how often to recheck mv-based bS when iterating along each edge
6279     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6280
6281     if (first_vertical_edge_done) {
6282         start = 1;
6283     }
6284
6285     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6286         start = 1;
6287
6288     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6289         && !IS_INTERLACED(mb_type)
6290         && IS_INTERLACED(mbm_type)
6291         ) {
6292         // This is a special case in the norm where the filtering must
6293         // be done twice (one each of the field) even if we are in a
6294         // frame macroblock.
6295         //
6296         static const int nnz_idx[4] = {4,5,6,3};
6297         unsigned int tmp_linesize   = 2 *   linesize;
6298         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6299         int mbn_xy = mb_xy - 2 * s->mb_stride;
6300         int qp;
6301         int i, j;
6302         int16_t bS[4];
6303
6304         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6305             if( IS_INTRA(mb_type) ||
6306                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6307                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6308             } else {
6309                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6310                 for( i = 0; i < 4; i++ ) {
6311                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6312                         mbn_nnz[nnz_idx[i]] != 0 )
6313                         bS[i] = 2;
6314                     else
6315                         bS[i] = 1;
6316                 }
6317             }
6318             // Do not use s->qscale as luma quantizer because it has not the same
6319             // value in IPCM macroblocks.
6320             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6321             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6322             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6323             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6324             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6325                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6326             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6327                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6328         }
6329
6330         start = 1;
6331     }
6332
6333     /* Calculate bS */
6334     for( edge = start; edge < edges; edge++ ) {
6335         /* mbn_xy: neighbor macroblock */
6336         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6337         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6338         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6339         int16_t bS[4];
6340         int qp;
6341
6342         if( (edge&1) && IS_8x8DCT(mb_type) )
6343             continue;
6344
6345         if( IS_INTRA(mb_type) ||
6346             IS_INTRA(mbn_type) ) {
6347             int value;
6348             if (edge == 0) {
6349                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6350                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6351                 ) {
6352                     value = 4;
6353                 } else {
6354                     value = 3;
6355                 }
6356             } else {
6357                 value = 3;
6358             }
6359             bS[0] = bS[1] = bS[2] = bS[3] = value;
6360         } else {
6361             int i, l;
6362             int mv_done;
6363
6364             if( edge & mask_edge ) {
6365                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6366                 mv_done = 1;
6367             }
6368             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6369                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6370                 mv_done = 1;
6371             }
6372             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6373                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6374                 int bn_idx= b_idx - (dir ? 8:1);
6375                 int v = 0;
6376
6377                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6378                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6379                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6380                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6381                 }
6382
6383                 if(h->slice_type_nos == FF_B_TYPE && v){
6384                     v=0;
6385                     for( l = 0; !v && l < 2; l++ ) {
6386                         int ln= 1-l;
6387                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6388                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6389                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6390                     }
6391                 }
6392
6393                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6394                 mv_done = 1;
6395             }
6396             else
6397                 mv_done = 0;
6398
6399             for( i = 0; i < 4; i++ ) {
6400                 int x = dir == 0 ? edge : i;
6401                 int y = dir == 0 ? i    : edge;
6402                 int b_idx= 8 + 4 + x + 8*y;
6403                 int bn_idx= b_idx - (dir ? 8:1);
6404
6405                 if( h->non_zero_count_cache[b_idx] |
6406                     h->non_zero_count_cache[bn_idx] ) {
6407                     bS[i] = 2;
6408                 }
6409                 else if(!mv_done)
6410                 {
6411                     bS[i] = 0;
6412                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6413                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6414                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6415                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6416                             bS[i] = 1;
6417                             break;
6418                         }
6419                     }
6420
6421                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6422                         bS[i] = 0;
6423                         for( l = 0; l < 2; l++ ) {
6424                             int ln= 1-l;
6425                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6426                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6427                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6428                                 bS[i] = 1;
6429                                 break;
6430                             }
6431                         }
6432                     }
6433                 }
6434             }
6435
6436             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6437                 continue;
6438         }
6439
6440         /* Filter edge */
6441         // Do not use s->qscale as luma quantizer because it has not the same
6442         // value in IPCM macroblocks.
6443         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6444         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6445         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6446         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6447         if( dir == 0 ) {
6448             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6449             if( (edge&1) == 0 ) {
6450                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6451                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6452                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6453                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6454             }
6455         } else {
6456             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6457             if( (edge&1) == 0 ) {
6458                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6459                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6460                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6461                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6462             }
6463         }
6464     }
6465 }
6466
6467 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6468     MpegEncContext * const s = &h->s;
6469     const int mb_xy= mb_x + mb_y*s->mb_stride;
6470     const int mb_type = s->current_picture.mb_type[mb_xy];
6471     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6472     int first_vertical_edge_done = 0;
6473     int dir;
6474
6475     //for sufficiently low qp, filtering wouldn't do anything
6476     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6477     if(!FRAME_MBAFF){
6478         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6479         int qp = s->current_picture.qscale_table[mb_xy];
6480         if(qp <= qp_thresh
6481            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6482            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6483             return;
6484         }
6485     }
6486
6487     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6488     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6489         int top_type, left_type[2];
6490         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6491         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6492         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6493
6494         if(IS_8x8DCT(top_type)){
6495             h->non_zero_count_cache[4+8*0]=
6496             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6497             h->non_zero_count_cache[6+8*0]=
6498             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6499         }
6500         if(IS_8x8DCT(left_type[0])){
6501             h->non_zero_count_cache[3+8*1]=
6502             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6503         }
6504         if(IS_8x8DCT(left_type[1])){
6505             h->non_zero_count_cache[3+8*3]=
6506             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6507         }
6508
6509         if(IS_8x8DCT(mb_type)){
6510             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6511             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6512
6513             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6514             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6515
6516             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6517             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6518
6519             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6520             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6521         }
6522     }
6523
6524     if (FRAME_MBAFF
6525             // left mb is in picture
6526             && h->slice_table[mb_xy-1] != 0xFFFF
6527             // and current and left pair do not have the same interlaced type
6528             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6529             // and left mb is in the same slice if deblocking_filter == 2
6530             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6531         /* First vertical edge is different in MBAFF frames
6532          * There are 8 different bS to compute and 2 different Qp
6533          */
6534         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6535         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6536         int16_t bS[8];
6537         int qp[2];
6538         int bqp[2];
6539         int rqp[2];
6540         int mb_qp, mbn0_qp, mbn1_qp;
6541         int i;
6542         first_vertical_edge_done = 1;
6543
6544         if( IS_INTRA(mb_type) )
6545             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6546         else {
6547             for( i = 0; i < 8; i++ ) {
6548                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6549
6550                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6551                     bS[i] = 4;
6552                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6553                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6554                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6555                                                                        :
6556                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6557                     bS[i] = 2;
6558                 else
6559                     bS[i] = 1;
6560             }
6561         }
6562
6563         mb_qp = s->current_picture.qscale_table[mb_xy];
6564         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6565         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6566         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6567         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6568                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6569         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6570                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6571         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6572         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6573                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6574         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6575                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6576
6577         /* Filter edge */
6578         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6579         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6580         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6581         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6582         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6583     }
6584
6585 #ifdef CONFIG_SMALL
6586     for( dir = 0; dir < 2; dir++ )
6587         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6588 #else
6589     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6590     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6591 #endif
6592 }
6593
6594 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6595     H264Context *h = *(void**)arg;
6596     MpegEncContext * const s = &h->s;
6597     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6598
6599     s->mb_skip_run= -1;
6600
6601     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6602                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6603
6604     if( h->pps.cabac ) {
6605         int i;
6606
6607         /* realign */
6608         align_get_bits( &s->gb );
6609
6610         /* init cabac */
6611         ff_init_cabac_states( &h->cabac);
6612         ff_init_cabac_decoder( &h->cabac,
6613                                s->gb.buffer + get_bits_count(&s->gb)/8,
6614                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6615         /* calculate pre-state */
6616         for( i= 0; i < 460; i++ ) {
6617             int pre;
6618             if( h->slice_type_nos == FF_I_TYPE )
6619                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6620             else
6621                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6622
6623             if( pre <= 63 )
6624                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6625             else
6626                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6627         }
6628
6629         for(;;){
6630 //START_TIMER
6631             int ret = decode_mb_cabac(h);
6632             int eos;
6633 //STOP_TIMER("decode_mb_cabac")
6634
6635             if(ret>=0) hl_decode_mb(h);
6636
6637             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6638                 s->mb_y++;
6639
6640                 if(ret>=0) ret = decode_mb_cabac(h);
6641
6642                 if(ret>=0) hl_decode_mb(h);
6643                 s->mb_y--;
6644             }
6645             eos = get_cabac_terminate( &h->cabac );
6646
6647             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6648                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6649                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6650                 return -1;
6651             }
6652
6653             if( ++s->mb_x >= s->mb_width ) {
6654                 s->mb_x = 0;
6655                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6656                 ++s->mb_y;
6657                 if(FIELD_OR_MBAFF_PICTURE) {
6658                     ++s->mb_y;
6659                 }
6660             }
6661
6662             if( eos || s->mb_y >= s->mb_height ) {
6663                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6664                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6665                 return 0;
6666             }
6667         }
6668
6669     } else {
6670         for(;;){
6671             int ret = decode_mb_cavlc(h);
6672
6673             if(ret>=0) hl_decode_mb(h);
6674
6675             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6676                 s->mb_y++;
6677                 ret = decode_mb_cavlc(h);
6678
6679                 if(ret>=0) hl_decode_mb(h);
6680                 s->mb_y--;
6681             }
6682
6683             if(ret<0){
6684                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6685                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6686
6687                 return -1;
6688             }
6689
6690             if(++s->mb_x >= s->mb_width){
6691                 s->mb_x=0;
6692                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6693                 ++s->mb_y;
6694                 if(FIELD_OR_MBAFF_PICTURE) {
6695                     ++s->mb_y;
6696                 }
6697                 if(s->mb_y >= s->mb_height){
6698                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6699
6700                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6701                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6702
6703                         return 0;
6704                     }else{
6705                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6706
6707                         return -1;
6708                     }
6709                 }
6710             }
6711
6712             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6713                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6714                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6715                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6716
6717                     return 0;
6718                 }else{
6719                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6720
6721                     return -1;
6722                 }
6723             }
6724         }
6725     }
6726
6727 #if 0
6728     for(;s->mb_y < s->mb_height; s->mb_y++){
6729         for(;s->mb_x < s->mb_width; s->mb_x++){
6730             int ret= decode_mb(h);
6731
6732             hl_decode_mb(h);
6733
6734             if(ret<0){
6735                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6736                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6737
6738                 return -1;
6739             }
6740
6741             if(++s->mb_x >= s->mb_width){
6742                 s->mb_x=0;
6743                 if(++s->mb_y >= s->mb_height){
6744                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6745                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6746
6747                         return 0;
6748                     }else{
6749                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6750
6751                         return -1;
6752                     }
6753                 }
6754             }
6755
6756             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6757                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6758                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6759
6760                     return 0;
6761                 }else{
6762                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6763
6764                     return -1;
6765                 }
6766             }
6767         }
6768         s->mb_x=0;
6769         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6770     }
6771 #endif
6772     return -1; //not reached
6773 }
6774
6775 static int decode_picture_timing(H264Context *h){
6776     MpegEncContext * const s = &h->s;
6777     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6778         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6779         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6780     }
6781     if(h->sps.pic_struct_present_flag){
6782         unsigned int i, num_clock_ts;
6783         h->sei_pic_struct = get_bits(&s->gb, 4);
6784
6785         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6786             return -1;
6787
6788         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6789
6790         for (i = 0 ; i < num_clock_ts ; i++){
6791             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6792                 unsigned int full_timestamp_flag;
6793                 skip_bits(&s->gb, 2);                 /* ct_type */
6794                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6795                 skip_bits(&s->gb, 5);                 /* counting_type */
6796                 full_timestamp_flag = get_bits(&s->gb, 1);
6797                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6798                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6799                 skip_bits(&s->gb, 8);                 /* n_frames */
6800                 if(full_timestamp_flag){
6801                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6802                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6803                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6804                 }else{
6805                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6806                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6807                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6808                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6809                             if(get_bits(&s->gb, 1))   /* hours_flag */
6810                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6811                         }
6812                     }
6813                 }
6814                 if(h->sps.time_offset_length > 0)
6815                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6816             }
6817         }
6818     }
6819     return 0;
6820 }
6821
6822 static int decode_unregistered_user_data(H264Context *h, int size){
6823     MpegEncContext * const s = &h->s;
6824     uint8_t user_data[16+256];
6825     int e, build, i;
6826
6827     if(size<16)
6828         return -1;
6829
6830     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6831         user_data[i]= get_bits(&s->gb, 8);
6832     }
6833
6834     user_data[i]= 0;
6835     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6836     if(e==1 && build>=0)
6837         h->x264_build= build;
6838
6839     if(s->avctx->debug & FF_DEBUG_BUGS)
6840         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6841
6842     for(; i<size; i++)
6843         skip_bits(&s->gb, 8);
6844
6845     return 0;
6846 }
6847
6848 static int decode_sei(H264Context *h){
6849     MpegEncContext * const s = &h->s;
6850
6851     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6852         int size, type;
6853
6854         type=0;
6855         do{
6856             type+= show_bits(&s->gb, 8);
6857         }while(get_bits(&s->gb, 8) == 255);
6858
6859         size=0;
6860         do{
6861             size+= show_bits(&s->gb, 8);
6862         }while(get_bits(&s->gb, 8) == 255);
6863
6864         switch(type){
6865         case 1: // Picture timing SEI
6866             if(decode_picture_timing(h) < 0)
6867                 return -1;
6868             break;
6869         case 5:
6870             if(decode_unregistered_user_data(h, size) < 0)
6871                 return -1;
6872             break;
6873         default:
6874             skip_bits(&s->gb, 8*size);
6875         }
6876
6877         //FIXME check bits here
6878         align_get_bits(&s->gb);
6879     }
6880
6881     return 0;
6882 }
6883
6884 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6885     MpegEncContext * const s = &h->s;
6886     int cpb_count, i;
6887     cpb_count = get_ue_golomb(&s->gb) + 1;
6888
6889     if(cpb_count > 32U){
6890         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6891         return -1;
6892     }
6893
6894     get_bits(&s->gb, 4); /* bit_rate_scale */
6895     get_bits(&s->gb, 4); /* cpb_size_scale */
6896     for(i=0; i<cpb_count; i++){
6897         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6898         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6899         get_bits1(&s->gb);     /* cbr_flag */
6900     }
6901     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6902     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6903     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6904     sps->time_offset_length = get_bits(&s->gb, 5);
6905     return 0;
6906 }
6907
6908 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6909     MpegEncContext * const s = &h->s;
6910     int aspect_ratio_info_present_flag;
6911     unsigned int aspect_ratio_idc;
6912
6913     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6914
6915     if( aspect_ratio_info_present_flag ) {
6916         aspect_ratio_idc= get_bits(&s->gb, 8);
6917         if( aspect_ratio_idc == EXTENDED_SAR ) {
6918             sps->sar.num= get_bits(&s->gb, 16);
6919             sps->sar.den= get_bits(&s->gb, 16);
6920         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6921             sps->sar=  pixel_aspect[aspect_ratio_idc];
6922         }else{
6923             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6924             return -1;
6925         }
6926     }else{
6927         sps->sar.num=
6928         sps->sar.den= 0;
6929     }
6930 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6931
6932     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6933         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6934     }
6935
6936     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6937         get_bits(&s->gb, 3);    /* video_format */
6938         get_bits1(&s->gb);      /* video_full_range_flag */
6939         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6940             get_bits(&s->gb, 8); /* colour_primaries */
6941             get_bits(&s->gb, 8); /* transfer_characteristics */
6942             get_bits(&s->gb, 8); /* matrix_coefficients */
6943         }
6944     }
6945
6946     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6947         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6948         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6949     }
6950
6951     sps->timing_info_present_flag = get_bits1(&s->gb);
6952     if(sps->timing_info_present_flag){
6953         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6954         sps->time_scale = get_bits_long(&s->gb, 32);
6955         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6956     }
6957
6958     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6959     if(sps->nal_hrd_parameters_present_flag)
6960         if(decode_hrd_parameters(h, sps) < 0)
6961             return -1;
6962     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6963     if(sps->vcl_hrd_parameters_present_flag)
6964         if(decode_hrd_parameters(h, sps) < 0)
6965             return -1;
6966     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6967         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6968     sps->pic_struct_present_flag = get_bits1(&s->gb);
6969
6970     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6971     if(sps->bitstream_restriction_flag){
6972         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6973         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6974         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6975         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6976         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6977         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6978         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6979
6980         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6981             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6982             return -1;
6983         }
6984     }
6985
6986     return 0;
6987 }
6988
6989 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6990                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6991     MpegEncContext * const s = &h->s;
6992     int i, last = 8, next = 8;
6993     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6994     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6995         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6996     else
6997     for(i=0;i<size;i++){
6998         if(next)
6999             next = (last + get_se_golomb(&s->gb)) & 0xff;
7000         if(!i && !next){ /* matrix not written, we use the preset one */
7001             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7002             break;
7003         }
7004         last = factors[scan[i]] = next ? next : last;
7005     }
7006 }
7007
7008 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7009                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7010     MpegEncContext * const s = &h->s;
7011     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7012     const uint8_t *fallback[4] = {
7013         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7014         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7015         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7016         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7017     };
7018     if(get_bits1(&s->gb)){
7019         sps->scaling_matrix_present |= is_sps;
7020         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7021         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7022         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7023         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7024         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7025         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7026         if(is_sps || pps->transform_8x8_mode){
7027             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7028             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7029         }
7030     }
7031 }
7032
7033 static inline int decode_seq_parameter_set(H264Context *h){
7034     MpegEncContext * const s = &h->s;
7035     int profile_idc, level_idc;
7036     unsigned int sps_id;
7037     int i;
7038     SPS *sps;
7039
7040     profile_idc= get_bits(&s->gb, 8);
7041     get_bits1(&s->gb);   //constraint_set0_flag
7042     get_bits1(&s->gb);   //constraint_set1_flag
7043     get_bits1(&s->gb);   //constraint_set2_flag
7044     get_bits1(&s->gb);   //constraint_set3_flag
7045     get_bits(&s->gb, 4); // reserved
7046     level_idc= get_bits(&s->gb, 8);
7047     sps_id= get_ue_golomb(&s->gb);
7048
7049     if(sps_id >= MAX_SPS_COUNT) {
7050         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7051         return -1;
7052     }
7053     sps= av_mallocz(sizeof(SPS));
7054     if(sps == NULL)
7055         return -1;
7056
7057     sps->profile_idc= profile_idc;
7058     sps->level_idc= level_idc;
7059
7060     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7061     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7062     sps->scaling_matrix_present = 0;
7063
7064     if(sps->profile_idc >= 100){ //high profile
7065         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7066         if(sps->chroma_format_idc == 3)
7067             get_bits1(&s->gb);  //residual_color_transform_flag
7068         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7069         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7070         sps->transform_bypass = get_bits1(&s->gb);
7071         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7072     }else{
7073         sps->chroma_format_idc= 1;
7074     }
7075
7076     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7077     sps->poc_type= get_ue_golomb(&s->gb);
7078
7079     if(sps->poc_type == 0){ //FIXME #define
7080         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7081     } else if(sps->poc_type == 1){//FIXME #define
7082         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7083         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7084         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7085         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7086
7087         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7088             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7089             goto fail;
7090         }
7091
7092         for(i=0; i<sps->poc_cycle_length; i++)
7093             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7094     }else if(sps->poc_type != 2){
7095         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7096         goto fail;
7097     }
7098
7099     sps->ref_frame_count= get_ue_golomb(&s->gb);
7100     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7101         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7102         goto fail;
7103     }
7104     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7105     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7106     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7107     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7108        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7109         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7110         goto fail;
7111     }
7112
7113     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7114     if(!sps->frame_mbs_only_flag)
7115         sps->mb_aff= get_bits1(&s->gb);
7116     else
7117         sps->mb_aff= 0;
7118
7119     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7120
7121 #ifndef ALLOW_INTERLACE
7122     if(sps->mb_aff)
7123         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7124 #endif
7125     sps->crop= get_bits1(&s->gb);
7126     if(sps->crop){
7127         sps->crop_left  = get_ue_golomb(&s->gb);
7128         sps->crop_right = get_ue_golomb(&s->gb);
7129         sps->crop_top   = get_ue_golomb(&s->gb);
7130         sps->crop_bottom= get_ue_golomb(&s->gb);
7131         if(sps->crop_left || sps->crop_top){
7132             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7133         }
7134         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7135             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7136         }
7137     }else{
7138         sps->crop_left  =
7139         sps->crop_right =
7140         sps->crop_top   =
7141         sps->crop_bottom= 0;
7142     }
7143
7144     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7145     if( sps->vui_parameters_present_flag )
7146         decode_vui_parameters(h, sps);
7147
7148     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7149         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7150                sps_id, sps->profile_idc, sps->level_idc,
7151                sps->poc_type,
7152                sps->ref_frame_count,
7153                sps->mb_width, sps->mb_height,
7154                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7155                sps->direct_8x8_inference_flag ? "8B8" : "",
7156                sps->crop_left, sps->crop_right,
7157                sps->crop_top, sps->crop_bottom,
7158                sps->vui_parameters_present_flag ? "VUI" : "",
7159                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7160                );
7161     }
7162     av_free(h->sps_buffers[sps_id]);
7163     h->sps_buffers[sps_id]= sps;
7164     return 0;
7165 fail:
7166     av_free(sps);
7167     return -1;
7168 }
7169
7170 static void
7171 build_qp_table(PPS *pps, int t, int index)
7172 {
7173     int i;
7174     for(i = 0; i < 52; i++)
7175         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7176 }
7177
7178 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7179     MpegEncContext * const s = &h->s;
7180     unsigned int pps_id= get_ue_golomb(&s->gb);
7181     PPS *pps;
7182
7183     if(pps_id >= MAX_PPS_COUNT) {
7184         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7185         return -1;
7186     }
7187
7188     pps= av_mallocz(sizeof(PPS));
7189     if(pps == NULL)
7190         return -1;
7191     pps->sps_id= get_ue_golomb(&s->gb);
7192     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7193         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7194         goto fail;
7195     }
7196
7197     pps->cabac= get_bits1(&s->gb);
7198     pps->pic_order_present= get_bits1(&s->gb);
7199     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7200     if(pps->slice_group_count > 1 ){
7201         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7202         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7203         switch(pps->mb_slice_group_map_type){
7204         case 0:
7205 #if 0
7206 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7207 |    run_length[ i ]                                |1  |ue(v)   |
7208 #endif
7209             break;
7210         case 2:
7211 #if 0
7212 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7213 |{                                                  |   |        |
7214 |    top_left_mb[ i ]                               |1  |ue(v)   |
7215 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7216 |   }                                               |   |        |
7217 #endif
7218             break;
7219         case 3:
7220         case 4:
7221         case 5:
7222 #if 0
7223 |   slice_group_change_direction_flag               |1  |u(1)    |
7224 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7225 #endif
7226             break;
7227         case 6:
7228 #if 0
7229 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7230 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7231 |)                                                  |   |        |
7232 |    slice_group_id[ i ]                            |1  |u(v)    |
7233 #endif
7234             break;
7235         }
7236     }
7237     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7238     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7239     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7240         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7241         goto fail;
7242     }
7243
7244     pps->weighted_pred= get_bits1(&s->gb);
7245     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7246     pps->init_qp= get_se_golomb(&s->gb) + 26;
7247     pps->init_qs= get_se_golomb(&s->gb) + 26;
7248     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7249     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7250     pps->constrained_intra_pred= get_bits1(&s->gb);
7251     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7252
7253     pps->transform_8x8_mode= 0;
7254     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7255     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7256     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7257
7258     if(get_bits_count(&s->gb) < bit_length){
7259         pps->transform_8x8_mode= get_bits1(&s->gb);
7260         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7261         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7262     } else {
7263         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7264     }
7265
7266     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7267     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7268     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7269         h->pps.chroma_qp_diff= 1;
7270
7271     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7272         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7273                pps_id, pps->sps_id,
7274                pps->cabac ? "CABAC" : "CAVLC",
7275                pps->slice_group_count,
7276                pps->ref_count[0], pps->ref_count[1],
7277                pps->weighted_pred ? "weighted" : "",
7278                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7279                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7280                pps->constrained_intra_pred ? "CONSTR" : "",
7281                pps->redundant_pic_cnt_present ? "REDU" : "",
7282                pps->transform_8x8_mode ? "8x8DCT" : ""
7283                );
7284     }
7285
7286     av_free(h->pps_buffers[pps_id]);
7287     h->pps_buffers[pps_id]= pps;
7288     return 0;
7289 fail:
7290     av_free(pps);
7291     return -1;
7292 }
7293
7294 /**
7295  * Call decode_slice() for each context.
7296  *
7297  * @param h h264 master context
7298  * @param context_count number of contexts to execute
7299  */
7300 static void execute_decode_slices(H264Context *h, int context_count){
7301     MpegEncContext * const s = &h->s;
7302     AVCodecContext * const avctx= s->avctx;
7303     H264Context *hx;
7304     int i;
7305
7306     if(context_count == 1) {
7307         decode_slice(avctx, &h);
7308     } else {
7309         for(i = 1; i < context_count; i++) {
7310             hx = h->thread_context[i];
7311             hx->s.error_recognition = avctx->error_recognition;
7312             hx->s.error_count = 0;
7313         }
7314
7315         avctx->execute(avctx, (void *)decode_slice,
7316                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7317
7318         /* pull back stuff from slices to master context */
7319         hx = h->thread_context[context_count - 1];
7320         s->mb_x = hx->s.mb_x;
7321         s->mb_y = hx->s.mb_y;
7322         s->dropable = hx->s.dropable;
7323         s->picture_structure = hx->s.picture_structure;
7324         for(i = 1; i < context_count; i++)
7325             h->s.error_count += h->thread_context[i]->s.error_count;
7326     }
7327 }
7328
7329
7330 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7331     MpegEncContext * const s = &h->s;
7332     AVCodecContext * const avctx= s->avctx;
7333     int buf_index=0;
7334     H264Context *hx; ///< thread context
7335     int context_count = 0;
7336
7337     h->max_contexts = avctx->thread_count;
7338 #if 0
7339     int i;
7340     for(i=0; i<50; i++){
7341         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7342     }
7343 #endif
7344     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7345         h->current_slice = 0;
7346         if (!s->first_field)
7347             s->current_picture_ptr= NULL;
7348     }
7349
7350     for(;;){
7351         int consumed;
7352         int dst_length;
7353         int bit_length;
7354         const uint8_t *ptr;
7355         int i, nalsize = 0;
7356         int err;
7357
7358         if(h->is_avc) {
7359             if(buf_index >= buf_size) break;
7360             nalsize = 0;
7361             for(i = 0; i < h->nal_length_size; i++)
7362                 nalsize = (nalsize << 8) | buf[buf_index++];
7363             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7364                 if(nalsize == 1){
7365                     buf_index++;
7366                     continue;
7367                 }else{
7368                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7369                     break;
7370                 }
7371             }
7372         } else {
7373             // start code prefix search
7374             for(; buf_index + 3 < buf_size; buf_index++){
7375                 // This should always succeed in the first iteration.
7376                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7377                     break;
7378             }
7379
7380             if(buf_index+3 >= buf_size) break;
7381
7382             buf_index+=3;
7383         }
7384
7385         hx = h->thread_context[context_count];
7386
7387         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7388         if (ptr==NULL || dst_length < 0){
7389             return -1;
7390         }
7391         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7392             dst_length--;
7393         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7394
7395         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7396             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7397         }
7398
7399         if (h->is_avc && (nalsize != consumed)){
7400             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7401             consumed= nalsize;
7402         }
7403
7404         buf_index += consumed;
7405
7406         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7407            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7408             continue;
7409
7410       again:
7411         err = 0;
7412         switch(hx->nal_unit_type){
7413         case NAL_IDR_SLICE:
7414             if (h->nal_unit_type != NAL_IDR_SLICE) {
7415                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7416                 return -1;
7417             }
7418             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7419         case NAL_SLICE:
7420             init_get_bits(&hx->s.gb, ptr, bit_length);
7421             hx->intra_gb_ptr=
7422             hx->inter_gb_ptr= &hx->s.gb;
7423             hx->s.data_partitioning = 0;
7424
7425             if((err = decode_slice_header(hx, h)))
7426                break;
7427
7428             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7429             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7430                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7431                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7432                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7433                && avctx->skip_frame < AVDISCARD_ALL)
7434                 context_count++;
7435             break;
7436         case NAL_DPA:
7437             init_get_bits(&hx->s.gb, ptr, bit_length);
7438             hx->intra_gb_ptr=
7439             hx->inter_gb_ptr= NULL;
7440             hx->s.data_partitioning = 1;
7441
7442             err = decode_slice_header(hx, h);
7443             break;
7444         case NAL_DPB:
7445             init_get_bits(&hx->intra_gb, ptr, bit_length);
7446             hx->intra_gb_ptr= &hx->intra_gb;
7447             break;
7448         case NAL_DPC:
7449             init_get_bits(&hx->inter_gb, ptr, bit_length);
7450             hx->inter_gb_ptr= &hx->inter_gb;
7451
7452             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7453                && s->context_initialized
7454                && s->hurry_up < 5
7455                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7456                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7457                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7458                && avctx->skip_frame < AVDISCARD_ALL)
7459                 context_count++;
7460             break;
7461         case NAL_SEI:
7462             init_get_bits(&s->gb, ptr, bit_length);
7463             decode_sei(h);
7464             break;
7465         case NAL_SPS:
7466             init_get_bits(&s->gb, ptr, bit_length);
7467             decode_seq_parameter_set(h);
7468
7469             if(s->flags& CODEC_FLAG_LOW_DELAY)
7470                 s->low_delay=1;
7471
7472             if(avctx->has_b_frames < 2)
7473                 avctx->has_b_frames= !s->low_delay;
7474             break;
7475         case NAL_PPS:
7476             init_get_bits(&s->gb, ptr, bit_length);
7477
7478             decode_picture_parameter_set(h, bit_length);
7479
7480             break;
7481         case NAL_AUD:
7482         case NAL_END_SEQUENCE:
7483         case NAL_END_STREAM:
7484         case NAL_FILLER_DATA:
7485         case NAL_SPS_EXT:
7486         case NAL_AUXILIARY_SLICE:
7487             break;
7488         default:
7489             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7490         }
7491
7492         if(context_count == h->max_contexts) {
7493             execute_decode_slices(h, context_count);
7494             context_count = 0;
7495         }
7496
7497         if (err < 0)
7498             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7499         else if(err == 1) {
7500             /* Slice could not be decoded in parallel mode, copy down
7501              * NAL unit stuff to context 0 and restart. Note that
7502              * rbsp_buffer is not transferred, but since we no longer
7503              * run in parallel mode this should not be an issue. */
7504             h->nal_unit_type = hx->nal_unit_type;
7505             h->nal_ref_idc   = hx->nal_ref_idc;
7506             hx = h;
7507             goto again;
7508         }
7509     }
7510     if(context_count)
7511         execute_decode_slices(h, context_count);
7512     return buf_index;
7513 }
7514
7515 /**
7516  * returns the number of bytes consumed for building the current frame
7517  */
7518 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7519         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7520         if(pos+10>buf_size) pos=buf_size; // oops ;)
7521
7522         return pos;
7523 }
7524
7525 static int decode_frame(AVCodecContext *avctx,
7526                              void *data, int *data_size,
7527                              const uint8_t *buf, int buf_size)
7528 {
7529     H264Context *h = avctx->priv_data;
7530     MpegEncContext *s = &h->s;
7531     AVFrame *pict = data;
7532     int buf_index;
7533
7534     s->flags= avctx->flags;
7535     s->flags2= avctx->flags2;
7536
7537    /* end of stream, output what is still in the buffers */
7538     if (buf_size == 0) {
7539         Picture *out;
7540         int i, out_idx;
7541
7542 //FIXME factorize this with the output code below
7543         out = h->delayed_pic[0];
7544         out_idx = 0;
7545         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7546             if(h->delayed_pic[i]->poc < out->poc){
7547                 out = h->delayed_pic[i];
7548                 out_idx = i;
7549             }
7550
7551         for(i=out_idx; h->delayed_pic[i]; i++)
7552             h->delayed_pic[i] = h->delayed_pic[i+1];
7553
7554         if(out){
7555             *data_size = sizeof(AVFrame);
7556             *pict= *(AVFrame*)out;
7557         }
7558
7559         return 0;
7560     }
7561
7562     if(h->is_avc && !h->got_avcC) {
7563         int i, cnt, nalsize;
7564         unsigned char *p = avctx->extradata;
7565         if(avctx->extradata_size < 7) {
7566             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7567             return -1;
7568         }
7569         if(*p != 1) {
7570             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7571             return -1;
7572         }
7573         /* sps and pps in the avcC always have length coded with 2 bytes,
7574            so put a fake nal_length_size = 2 while parsing them */
7575         h->nal_length_size = 2;
7576         // Decode sps from avcC
7577         cnt = *(p+5) & 0x1f; // Number of sps
7578         p += 6;
7579         for (i = 0; i < cnt; i++) {
7580             nalsize = AV_RB16(p) + 2;
7581             if(decode_nal_units(h, p, nalsize) < 0) {
7582                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7583                 return -1;
7584             }
7585             p += nalsize;
7586         }
7587         // Decode pps from avcC
7588         cnt = *(p++); // Number of pps
7589         for (i = 0; i < cnt; i++) {
7590             nalsize = AV_RB16(p) + 2;
7591             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7592                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7593                 return -1;
7594             }
7595             p += nalsize;
7596         }
7597         // Now store right nal length size, that will be use to parse all other nals
7598         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7599         // Do not reparse avcC
7600         h->got_avcC = 1;
7601     }
7602
7603     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7604         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7605             return -1;
7606         h->got_avcC = 1;
7607     }
7608
7609     buf_index=decode_nal_units(h, buf, buf_size);
7610     if(buf_index < 0)
7611         return -1;
7612
7613     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7614         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7615         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7616         return -1;
7617     }
7618
7619     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7620         Picture *out = s->current_picture_ptr;
7621         Picture *cur = s->current_picture_ptr;
7622         int i, pics, cross_idr, out_of_order, out_idx;
7623
7624         s->mb_y= 0;
7625
7626         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7627         s->current_picture_ptr->pict_type= s->pict_type;
7628
7629         if(!s->dropable) {
7630             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7631             h->prev_poc_msb= h->poc_msb;
7632             h->prev_poc_lsb= h->poc_lsb;
7633         }
7634         h->prev_frame_num_offset= h->frame_num_offset;
7635         h->prev_frame_num= h->frame_num;
7636
7637         /*
7638          * FIXME: Error handling code does not seem to support interlaced
7639          * when slices span multiple rows
7640          * The ff_er_add_slice calls don't work right for bottom
7641          * fields; they cause massive erroneous error concealing
7642          * Error marking covers both fields (top and bottom).
7643          * This causes a mismatched s->error_count
7644          * and a bad error table. Further, the error count goes to
7645          * INT_MAX when called for bottom field, because mb_y is
7646          * past end by one (callers fault) and resync_mb_y != 0
7647          * causes problems for the first MB line, too.
7648          */
7649         if (!FIELD_PICTURE)
7650             ff_er_frame_end(s);
7651
7652         MPV_frame_end(s);
7653
7654         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7655             /* Wait for second field. */
7656             *data_size = 0;
7657
7658         } else {
7659             cur->repeat_pict = 0;
7660
7661             /* Signal interlacing information externally. */
7662             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7663             if(h->sps.pic_struct_present_flag){
7664                 switch (h->sei_pic_struct)
7665                 {
7666                 case SEI_PIC_STRUCT_FRAME:
7667                     cur->interlaced_frame = 0;
7668                     break;
7669                 case SEI_PIC_STRUCT_TOP_FIELD:
7670                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7671                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7672                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7673                     cur->interlaced_frame = 1;
7674                     break;
7675                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7676                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7677                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7678                     // From these hints, let the applications decide if they apply deinterlacing.
7679                     cur->repeat_pict = 1;
7680                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7681                     break;
7682                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7683                     // Force progressive here, as doubling interlaced frame is a bad idea.
7684                     cur->interlaced_frame = 0;
7685                     cur->repeat_pict = 2;
7686                     break;
7687                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7688                     cur->interlaced_frame = 0;
7689                     cur->repeat_pict = 4;
7690                     break;
7691                 }
7692             }else{
7693                 /* Derive interlacing flag from used decoding process. */
7694                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7695             }
7696
7697             if (cur->field_poc[0] != cur->field_poc[1]){
7698                 /* Derive top_field_first from field pocs. */
7699                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7700             }else{
7701                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7702                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7703                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7704                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7705                         cur->top_field_first = 1;
7706                     else
7707                         cur->top_field_first = 0;
7708                 }else{
7709                     /* Most likely progressive */
7710                     cur->top_field_first = 0;
7711                 }
7712             }
7713
7714         //FIXME do something with unavailable reference frames
7715
7716             /* Sort B-frames into display order */
7717
7718             if(h->sps.bitstream_restriction_flag
7719                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7720                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7721                 s->low_delay = 0;
7722             }
7723
7724             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7725                && !h->sps.bitstream_restriction_flag){
7726                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7727                 s->low_delay= 0;
7728             }
7729
7730             pics = 0;
7731             while(h->delayed_pic[pics]) pics++;
7732
7733             assert(pics <= MAX_DELAYED_PIC_COUNT);
7734
7735             h->delayed_pic[pics++] = cur;
7736             if(cur->reference == 0)
7737                 cur->reference = DELAYED_PIC_REF;
7738
7739             out = h->delayed_pic[0];
7740             out_idx = 0;
7741             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7742                 if(h->delayed_pic[i]->poc < out->poc){
7743                     out = h->delayed_pic[i];
7744                     out_idx = i;
7745                 }
7746             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7747
7748             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7749
7750             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7751                 { }
7752             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7753                || (s->low_delay &&
7754                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7755                  || cur->pict_type == FF_B_TYPE)))
7756             {
7757                 s->low_delay = 0;
7758                 s->avctx->has_b_frames++;
7759             }
7760
7761             if(out_of_order || pics > s->avctx->has_b_frames){
7762                 out->reference &= ~DELAYED_PIC_REF;
7763                 for(i=out_idx; h->delayed_pic[i]; i++)
7764                     h->delayed_pic[i] = h->delayed_pic[i+1];
7765             }
7766             if(!out_of_order && pics > s->avctx->has_b_frames){
7767                 *data_size = sizeof(AVFrame);
7768
7769                 h->outputed_poc = out->poc;
7770                 *pict= *(AVFrame*)out;
7771             }else{
7772                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7773             }
7774         }
7775     }
7776
7777     assert(pict->data[0] || !*data_size);
7778     ff_print_debug_info(s, pict);
7779 //printf("out %d\n", (int)pict->data[0]);
7780 #if 0 //?
7781
7782     /* Return the Picture timestamp as the frame number */
7783     /* we subtract 1 because it is added on utils.c     */
7784     avctx->frame_number = s->picture_number - 1;
7785 #endif
7786     return get_consumed_bytes(s, buf_index, buf_size);
7787 }
7788 #if 0
7789 static inline void fill_mb_avail(H264Context *h){
7790     MpegEncContext * const s = &h->s;
7791     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7792
7793     if(s->mb_y){
7794         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7795         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7796         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7797     }else{
7798         h->mb_avail[0]=
7799         h->mb_avail[1]=
7800         h->mb_avail[2]= 0;
7801     }
7802     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7803     h->mb_avail[4]= 1; //FIXME move out
7804     h->mb_avail[5]= 0; //FIXME move out
7805 }
7806 #endif
7807
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self-test entry point, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * buffer, reads them back and reports every value that does not survive the
 * round trip. The 4x4 (I)DCT, quantizer and NAL checks further down are
 * disabled with "#if 0".
 *
 * @return 0 if every value round-tripped, -1 if any mismatch was reported
 */
int main(void){
    int i;
    int mismatches = 0;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    /* dsputil_init() is handed a pointer to this context, so give it defined
     * (all-zero) contents instead of leaving the automatic struct
     * indeterminate. */
    AVCodecContext avctx = {0};

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, (unsigned)s);
            mismatches++; /* keep going so every mismatch gets reported */
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;
        /* the value written above for this index */
        const int expected = i - COUNT/2;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != expected){
            /* report the value that was actually encoded, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, expected, (unsigned)s);
            mismatches++;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* Disabled tests. As written this section uses names that this function
     * does not declare (s->dsp, qp, src1_block, src2_block), so it needs
     * repair before it can be re-enabled. */
#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    /* same failure code the disabled NAL section uses */
    return mismatches ? -1 : 0;
}
#endif /* TEST */
7982
7983
7984 static av_cold int decode_end(AVCodecContext *avctx)
7985 {
7986     H264Context *h = avctx->priv_data;
7987     MpegEncContext *s = &h->s;
7988     int i;
7989
7990     av_freep(&h->rbsp_buffer[0]);
7991     av_freep(&h->rbsp_buffer[1]);
7992     free_tables(h); //FIXME cleanup init stuff perhaps
7993
7994     for(i = 0; i < MAX_SPS_COUNT; i++)
7995         av_freep(h->sps_buffers + i);
7996
7997     for(i = 0; i < MAX_PPS_COUNT; i++)
7998         av_freep(h->pps_buffers + i);
7999
8000     MPV_common_end(s);
8001
8002 //    memset(h, 0, sizeof(H264Context));
8003
8004     return 0;
8005 }
8006
8007
8008 AVCodec h264_decoder = {
8009     "h264",
8010     CODEC_TYPE_VIDEO,
8011     CODEC_ID_H264,
8012     sizeof(H264Context),
8013     decode_init,
8014     NULL,
8015     decode_end,
8016     decode_frame,
8017     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8018     .flush= flush_dpb,
8019     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8020 };
8021
8022 #include "svq3.c"