git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
   45 /**
   46  * Value of Picture.reference when Picture is not a reference picture, but
   47  * is held for delayed output.
   48  */
   49 #define DELAYED_PIC_REF 4
   50
     /*
      * Storage for the VLC readers used by this file. Each VLC object is paired
      * with a statically sized table array and a constant recording that size.
      * NOTE(review): the code that initialises these tables is not visible in
      * this part of the file.
      */

     /* coeff_token: four VLCs; the table array is dimensioned as the sum of the
      * four per-VLC sizes listed in coeff_token_vlc_tables_size[]. */
   51 static VLC coeff_token_vlc[4];
   52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
   53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
   54
     /* chroma DC coeff_token: one VLC with a 256-entry table. */
   55 static VLC chroma_dc_coeff_token_vlc;
   56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
   57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
   58
     /* total_zeros: 15 VLCs, 512 table entries each. */
   59 static VLC total_zeros_vlc[15];
   60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
   61 static const int total_zeros_vlc_tables_size = 512;
   62
     /* chroma DC total_zeros: 3 VLCs, 8 table entries each. */
   63 static VLC chroma_dc_total_zeros_vlc[3];
   64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
   65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
   66
     /* run: 6 VLCs, 8 table entries each. */
   67 static VLC run_vlc[6];
   68 static VLC_TYPE run_vlc_tables[6][8][2];
   69 static const int run_vlc_tables_size = 8;
   70
     /* run7: one VLC with a 96-entry table. */
   71 static VLC run7_vlc;
   72 static VLC_TYPE run7_vlc_table[96][2];
   73 static const int run7_vlc_table_size = 96;
   74
     /* Prototypes for static functions that are referenced before the point
      * where they are defined. */
   75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
   76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
   77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
  /**
   * Loads neighbour information for the current macroblock (h->mb_xy) into the
   * per-macroblock caches of h.
   *
   * The function locates the top, left, top-left and top-right neighbours
   * (with extra adjustments in MBAFF frames), stores the resulting positions
   * in h->top_mb_xy / h->left_mb_xy[], and then fills:
   *  - the intra sample availability masks and intra4x4_pred_mode_cache
   *    (only when not called for deblocking),
   *  - non_zero_count_cache,
   *  - top_cbp / left_cbp (only when h->pps.cabac is set),
   *  - mv_cache / ref_cache, and with CABAC also mvd_cache and direct_cache,
   *  - neighbor_transform_size.
   *
   * @param mb_type     type flags of the current macroblock
   * @param for_deblock non-zero when called for deblocking. In that mode the
   *                    function returns immediately if the frame is not MBAFF
   *                    and either h->slice_num == 1 or the top neighbour is in
   *                    the same slice; otherwise neighbours are considered
   *                    present based on slice_table < 0xFFFF instead of slice
   *                    membership, and the diagonal neighbours are ignored.
   */
  104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  105     MpegEncContext * const s = &h->s;
  106     const int mb_xy= h->mb_xy;
  107     int topleft_xy, top_xy, topright_xy, left_xy[2];
  108     int topleft_type, top_type, topright_type, left_type[2];
  109     const int * left_block;
  110     int topleft_partition= -1;
  111     int i;
  112
  113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  114
  115     //FIXME deblocking could skip the intra and nnz parts.
  116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  117         return;
  118
  119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  120      * stuff, I can't imagine that these complex rules are worth it. */
  121
         // default neighbour positions; the MBAFF block below may adjust them
  122     topleft_xy = top_xy - 1;
  123     topright_xy= top_xy + 1;
  124     left_xy[1] = left_xy[0] = mb_xy-1;
  125     left_block = left_block_options[0];
  126     if(FRAME_MBAFF){
  127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
  128         const int top_pair_xy      = pair_xy     - s->mb_stride;
  129         const int topleft_pair_xy  = top_pair_xy - 1;
  130         const int topright_pair_xy = top_pair_xy + 1;
  131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
  132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
  133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
  134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
  135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
  136         const int bottom = (s->mb_y & 1);
  137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
  138         if (bottom
  139                 ? !curr_mb_frame_flag // bottom macroblock
  140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
  141                 ) {
  142             top_xy -= s->mb_stride;
  143         }
  144         if (bottom
  145                 ? !curr_mb_frame_flag // bottom macroblock
  146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
  147                 ) {
  148             topleft_xy -= s->mb_stride;
  149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
  150             topleft_xy += s->mb_stride;
  151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
  152             topleft_partition = 0;
  153         }
  154         if (bottom
  155                 ? !curr_mb_frame_flag // bottom macroblock
  156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
  157                 ) {
  158             topright_xy -= s->mb_stride;
  159         }
             // left pair coded differently (frame vs. field) from the current MB:
             // read from the left pair and pick the matching index remapping row
  160         if (left_mb_frame_flag != curr_mb_frame_flag) {
  161             left_xy[1] = left_xy[0] = pair_xy - 1;
  162             if (curr_mb_frame_flag) {
  163                 if (bottom) {
  164                     left_block = left_block_options[1];
  165                 } else {
  166                     left_block= left_block_options[2];
  167                 }
  168             } else {
  169                 left_xy[1] += s->mb_stride;
  170                 left_block = left_block_options[3];
  171             }
  172         }
  173     }
  174
  175     h->top_mb_xy = top_xy;
  176     h->left_mb_xy[0] = left_xy[0];
  177     h->left_mb_xy[1] = left_xy[1];
         // deblocking: diagonal neighbours are not used; top/left count as present
         // whenever their slice_table entry is below 0xFFFF
  178     if(for_deblock){
  179         topleft_type = 0;
  180         topright_type = 0;
  181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
  182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
  183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
  184
  185         if(MB_MBAFF && !IS_INTRA(mb_type)){
  186             int list;
  187             for(list=0; list<h->list_count; list++){
  188                 //These values were changed for ease of performing MC, we need to change them back
  189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
  190                 //the MC code from changing ref_cache and rather use a temporary array.
  191                 if(USES_LIST(mb_type,list)){
  192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
  193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
  194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  195                     ref += h->b8_stride;
  196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
  197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  198                 }
  199             }
  200         }
  201     }else{
             // decoding: a neighbour is only used when it belongs to the current slice
  202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
  203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
  204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
  205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
  206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
  207
         // NOTE: despite its indentation, the intra block below is still inside the
         // !for_deblock branch; it is closed by the lone brace after it.
         // With constrained_intra_pred only intra neighbours pass type_mask.
  208     if(IS_INTRA(mb_type)){
  209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
  210         h->topleft_samples_available=
  211         h->top_samples_available=
  212         h->left_samples_available= 0xFFFF;
  213         h->topright_samples_available= 0xEEEA;
  214
  215         if(!(top_type & type_mask)){
  216             h->topleft_samples_available= 0xB3FF;
  217             h->top_samples_available= 0x33FF;
  218             h->topright_samples_available= 0x26EA;
  219         }
  220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
  221             if(IS_INTERLACED(mb_type)){
  222                 if(!(left_type[0] & type_mask)){
  223                     h->topleft_samples_available&= 0xDFFF;
  224                     h->left_samples_available&= 0x5FFF;
  225                 }
  226                 if(!(left_type[1] & type_mask)){
  227                     h->topleft_samples_available&= 0xFF5F;
  228                     h->left_samples_available&= 0xFF5F;
  229                 }
  230             }else{
  231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
  232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
  233                 assert(left_xy[0] == left_xy[1]);
  234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
  235                     h->topleft_samples_available&= 0xDF5F;
  236                     h->left_samples_available&= 0x5F5F;
  237                 }
  238             }
  239         }else{
  240             if(!(left_type[0] & type_mask)){
  241                 h->topleft_samples_available&= 0xDF5F;
  242                 h->left_samples_available&= 0x5F5F;
  243             }
  244         }
  245
  246         if(!(topleft_type & type_mask))
  247             h->topleft_samples_available&= 0x7FFF;
  248
  249         if(!(topright_type & type_mask))
  250             h->topright_samples_available&= 0xFBFF;
  251
             // intra4x4: load the neighbours' prediction modes; -1 marks an unusable
             // neighbour, 2 is used for a usable neighbour that is not intra4x4
  252         if(IS_INTRA4x4(mb_type)){
  253             if(IS_INTRA4x4(top_type)){
  254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
  255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
  256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
  257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
  258             }else{
  259                 int pred;
  260                 if(!(top_type & type_mask))
  261                     pred= -1;
  262                 else{
  263                     pred= 2;
  264                 }
  265                 h->intra4x4_pred_mode_cache[4+8*0]=
  266                 h->intra4x4_pred_mode_cache[5+8*0]=
  267                 h->intra4x4_pred_mode_cache[6+8*0]=
  268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
  269             }
  270             for(i=0; i<2; i++){
  271                 if(IS_INTRA4x4(left_type[i])){
  272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
  273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
  274                 }else{
  275                     int pred;
  276                     if(!(left_type[i] & type_mask))
  277                         pred= -1;
  278                     else{
  279                         pred= 2;
  280                     }
  281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
  282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
  283                 }
  284             }
  285         }
  286     }
  287     }
  288
  289
  290 /*
  291 0 . T T. T T T T
  292 1 L . .L . . . .
  293 2 L . .L . . . .
  294 3 . T TL . . . .
  295 4 L . .L . . . .
  296 5 L . .. . . . .
  297 */
  298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // non-zero coefficient counts of the top neighbour; when it is absent the
         // entries are set to 0 (CABAC and current MB not intra) or 64
  299     if(top_type){
  300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
  301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
  302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
  303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
  304
  305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
  306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
  307
  308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
  309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
  310
  311     }else{
  312         h->non_zero_count_cache[4+8*0]=
  313         h->non_zero_count_cache[5+8*0]=
  314         h->non_zero_count_cache[6+8*0]=
  315         h->non_zero_count_cache[7+8*0]=
  316
  317         h->non_zero_count_cache[1+8*0]=
  318         h->non_zero_count_cache[2+8*0]=
  319
  320         h->non_zero_count_cache[1+8*3]=
  321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  322
  323     }
  324
         // same for the two left neighbours, with indices remapped through left_block
  325     for (i=0; i<2; i++) {
  326         if(left_type[i]){
  327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
  328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
  329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
  330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
  331         }else{
  332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
  333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
  334             h->non_zero_count_cache[0+8*1 +   8*i]=
  335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  336         }
  337     }
  338
  339     if( h->pps.cabac ) {
  340         // top_cbp
  341         if(top_type) {
  342             h->top_cbp = h->cbp_table[top_xy];
  343         } else if(IS_INTRA(mb_type)) {
  344             h->top_cbp = 0x1C0;
  345         } else {
  346             h->top_cbp = 0;
  347         }
  348         // left_cbp
  349         if (left_type[0]) {
  350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
  351         } else if(IS_INTRA(mb_type)) {
  352             h->left_cbp = 0x1C0;
  353         } else {
  354             h->left_cbp = 0;
  355         }
  356         if (left_type[0]) {
  357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
  358         }
  359         if (left_type[1]) {
  360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
  361         }
  362     }
  363
  364 #if 1
         // motion vectors and reference indices of the neighbours, per reference list
  365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
  366         int list;
  367         for(list=0; list<h->list_count; list++){
  368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
  369                 /*if(!h->mv_cache_clean[list]){
  370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
  371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
  372                     h->mv_cache_clean[list]= 1;
  373                 }*/
  374                 continue;
  375             }
  376             h->mv_cache_clean[list]= 0;
  377
  378             if(USES_LIST(top_type, list)){
  379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
  381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
  382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
  383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
  384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
  385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
  386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
  387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
  388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
  389             }else{
  390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
  391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
  392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
  393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
  394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
  395             }
  396
  397             for(i=0; i<2; i++){
  398                 int cache_idx = scan8[0] - 1 + i*2*8;
  399                 if(USES_LIST(left_type[i], list)){
  400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
  401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
  402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
  403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
  404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
  405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
  406                 }else{
  407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
  408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
  409                     h->ref_cache[list][cache_idx  ]=
  410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  411                 }
  412             }
  413
                 // everything below (diagonal neighbours, mvd, direct flags, MBAFF
                 // rescaling) is skipped for deblocking, and outside MBAFF frames also
                 // for direct MBs without spatial MV prediction
  414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
  415                 continue;
  416
  417             if(USES_LIST(topleft_type, list)){
  418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
  419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
  420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  422             }else{
  423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
  424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  425             }
  426
  427             if(USES_LIST(topright_type, list)){
  428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
  429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
  430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  432             }else{
  433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
  434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  435             }
  436
  437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
  438                 continue;
  439
  440             h->ref_cache[list][scan8[5 ]+1] =
  441             h->ref_cache[list][scan8[7 ]+1] =
  442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
  443             h->ref_cache[list][scan8[4 ]] =
  444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
  445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
  446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
  447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
  449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
  450
  451             if( h->pps.cabac ) {
  452                 /* XXX beurk, Load mvd */
  453                 if(USES_LIST(top_type, list)){
  454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
  456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
  457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
  458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
  459                 }else{
  460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
  461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
  462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
  463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
  464                 }
  465                 if(USES_LIST(left_type[0], list)){
  466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
  467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
  468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
  469                 }else{
  470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
  471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
  472                 }
  473                 if(USES_LIST(left_type[1], list)){
  474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
  475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
  476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
  477                 }else{
  478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
  479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
  480                 }
  481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
  482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
  483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
  485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
  486
                     // B slices: record for the top/left neighbours whether they are direct
  487                 if(h->slice_type_nos == FF_B_TYPE){
  488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
  489
  490                     if(IS_DIRECT(top_type)){
  491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
  492                     }else if(IS_8X8(top_type)){
  493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
  494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
  495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
  496                     }else{
  497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
  498                     }
  499
  500                     if(IS_DIRECT(left_type[0]))
  501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
  502                     else if(IS_8X8(left_type[0]))
  503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
  504                     else
  505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
  506
  507                     if(IS_DIRECT(left_type[1]))
  508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
  509                     else if(IS_8X8(left_type[1]))
  510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
  511                     else
  512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
  513                 }
  514             }
  515
                 // MBAFF: for each cached neighbour whose frame/field coding differs from
                 // the current MB, rescale its reference index and the vertical mv/mvd
                 // component (field MB: ref*2, y/2; frame MB: ref/2, y*2)
  516             if(FRAME_MBAFF){
  517 #define MAP_MVS\
  518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
  519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
  520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
  521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
  522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
  523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
  524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
  525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
  526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
  527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
  528                 if(MB_FIELD){
  529 #define MAP_F2F(idx, mb_type)\
  530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  531                         h->ref_cache[list][idx] <<= 1;\
  532                         h->mv_cache[list][idx][1] /= 2;\
  533                         h->mvd_cache[list][idx][1] /= 2;\
  534                     }
  535                     MAP_MVS
  536 #undef MAP_F2F
  537                 }else{
  538 #define MAP_F2F(idx, mb_type)\
  539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  540                         h->ref_cache[list][idx] >>= 1;\
  541                         h->mv_cache[list][idx][1] <<= 1;\
  542                         h->mvd_cache[list][idx][1] <<= 1;\
  543                     }
  544                     MAP_MVS
  545 #undef MAP_F2F
  546                 }
  547             }
  548         }
  549     }
  550 #endif
  551
         // number (0..2) of the top / first left neighbours that use the 8x8 transform
  552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
  553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
     /**
      * Fetches the diagonal neighbour used for motion vector prediction:
      * the entry at i - 8 + part_width of the caches if it is available,
      * otherwise the entry at i - 8 - 1.
      * In MBAFF frames a few field/frame mismatch cases are handled by reading
      * the neighbour directly from the current picture (see SET_DIAG_MV).
      * @param C receives a pointer to the motion vector of the chosen neighbour
      * @param i position of the block in the cache arrays (callers in this file
      *          pass scan8[n])
      * @param list reference list index
      * @param part_width width of the partition in units of 4 pixels
      * @return reference index of the chosen neighbour; in the SET_DIAG_MV
      *         cases LIST_NOT_USED or the picture's ref_index shifted by one bit
      */
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
             /* mv_cache[list][scan8[0]-2] is used as a scratch slot: it is cleared
              * here and *C points at it; SET_DIAG_MV stores the adjusted vector
              * there before returning */
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
                     /* SET_DIAG_MV always returns from the function: MV_OP is applied
                      * to the vertical mv component, REF_OP to the reference index */
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
         /* regular case: take the neighbour at i - 8 + part_width when it is
          * available, otherwise fall back to the one at i - 8 - 1 */
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
1496 #if 0
1497 /**
1498  * DCT transforms the 16 dc values.
 * NOTE: this function is compiled out by the enclosing "#if 0".
 * It uses the same two butterfly passes as h264_luma_dc_dequant_idct_c(),
 * but halves the results (>>1) instead of scaling them by qmul, and it
 * relies on the "stride" macro that is defined inside that function.
1499  * @param qp quantization parameter ??? FIXME
 *           (currently commented out of the signature)
1500  */
1501 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1502 //    const int qmul= dequant_coeff[qp][0];
1503     int i;
1504     int temp[16]; //FIXME check if this is a good idea
1505     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1506     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1507
    /* first pass: butterflies over the four groups selected by y_offset */
1508     for(i=0; i<4; i++){
1509         const int offset= y_offset[i];
1510         const int z0= block[offset+stride*0] + block[offset+stride*4];
1511         const int z1= block[offset+stride*0] - block[offset+stride*4];
1512         const int z2= block[offset+stride*1] - block[offset+stride*5];
1513         const int z3= block[offset+stride*1] + block[offset+stride*5];
1514
1515         temp[4*i+0]= z0+z3;
1516         temp[4*i+1]= z1+z2;
1517         temp[4*i+2]= z1-z2;
1518         temp[4*i+3]= z0-z3;
1519     }
1520
    /* second pass: butterflies across the groups, results halved */
1521     for(i=0; i<4; i++){
1522         const int offset= x_offset[i];
1523         const int z0= temp[4*0+i] + temp[4*2+i];
1524         const int z1= temp[4*0+i] - temp[4*2+i];
1525         const int z2= temp[4*1+i] - temp[4*3+i];
1526         const int z3= temp[4*1+i] + temp[4*3+i];
1527
1528         block[stride*0 +offset]= (z0 + z3)>>1;
1529         block[stride*2 +offset]= (z1 + z2)>>1;
1530         block[stride*8 +offset]= (z1 - z2)>>1;
1531         block[stride*10+offset]= (z0 - z3)>>1;
1532     }
1533 }
1534 #endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
1560 #if 0
/*
 * NOTE: this function is compiled out by the enclosing "#if 0".
 * It applies the same 2x2 butterfly as chroma_dc_dequant_idct_c() to the
 * values at block[0], block[16], block[32] and block[48], but stores the
 * results without any scaling or shift.
 */
1561 static void chroma_dc_dct_c(DCTELEM *block){
1562     const int stride= 16*2;
1563     const int xStride= 16;
1564     int a,b,c,d,e;
1565
1566     a= block[stride*0 + xStride*0];
1567     b= block[stride*0 + xStride*1];
1568     c= block[stride*1 + xStride*0];
1569     d= block[stride*1 + xStride*1];
1570
    /* e keeps the difference of the first pair because a is overwritten next */
1571     e= a-b;
1572     a= a+b;
1573     b= c-d;
1574     c= c+d;
1575
1576     block[stride*0 + xStride*0]= (a+c);
1577     block[stride*0 + xStride*1]= (e+b);
1578     block[stride*1 + xStride*0]= (a-c);
1579     block[stride*1 + xStride*1]= (e-b);
1580 }
1581 #endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1591                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1592                            int src_x_offset, int src_y_offset,
1593                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1594     MpegEncContext * const s = &h->s;
1595     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1596     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1597     const int luma_xy= (mx&3) + ((my&3)<<2);
1598     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1599     uint8_t * src_cb, * src_cr;
1600     int extra_width= h->emu_edge_width;
1601     int extra_height= h->emu_edge_height;
1602     int emu=0;
1603     const int full_mx= mx>>2;
1604     const int full_my= my>>2;
1605     const int pic_width  = 16*s->mb_width;
1606     const int pic_height = 16*s->mb_height >> MB_FIELD;
1607
1608     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1609         return;
1610
1611     if(mx&7) extra_width -= 3;
1612     if(my&7) extra_height -= 3;
1613
1614     if(   full_mx < 0-extra_width
1615        || full_my < 0-extra_height
1616        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1617        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1618         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1619             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1620         emu=1;
1621     }
1622
1623     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1624     if(!square){
1625         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1626     }
1627
1628     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1629
1630     if(MB_FIELD){
1631         // chroma offset when predicting from a field of opposite parity
1632         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1633         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1634     }
1635     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1637
1638     if(emu){
1639         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1640             src_cb= s->edge_emu_buffer;
1641     }
1642     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1643
1644     if(emu){
1645         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1646             src_cr= s->edge_emu_buffer;
1647     }
1648     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1649 }
1650
1651 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1652                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1653                            int x_offset, int y_offset,
1654                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1655                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1656                            int list0, int list1){
1657     MpegEncContext * const s = &h->s;
1658     qpel_mc_func *qpix_op=  qpix_put;
1659     h264_chroma_mc_func chroma_op= chroma_put;
1660
1661     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1662     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1664     x_offset += 8*s->mb_x;
1665     y_offset += 8*(s->mb_y >> MB_FIELD);
1666
1667     if(list0){
1668         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1669         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1670                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1671                            qpix_op, chroma_op);
1672
1673         qpix_op=  qpix_avg;
1674         chroma_op= chroma_avg;
1675     }
1676
1677     if(list1){
1678         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1679         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1680                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1681                            qpix_op, chroma_op);
1682     }
1683 }
1684
1685 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1686                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1687                            int x_offset, int y_offset,
1688                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1689                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1690                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1691                            int list0, int list1){
1692     MpegEncContext * const s = &h->s;
1693
1694     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1695     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1697     x_offset += 8*s->mb_x;
1698     y_offset += 8*(s->mb_y >> MB_FIELD);
1699
1700     if(list0 && list1){
1701         /* don't optimize for luma-only case, since B-frames usually
1702          * use implicit weights => chroma too. */
1703         uint8_t *tmp_cb = s->obmc_scratchpad;
1704         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1705         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1706         int refn0 = h->ref_cache[0][ scan8[n] ];
1707         int refn1 = h->ref_cache[1][ scan8[n] ];
1708
1709         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1710                     dest_y, dest_cb, dest_cr,
1711                     x_offset, y_offset, qpix_put, chroma_put);
1712         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1713                     tmp_y, tmp_cb, tmp_cr,
1714                     x_offset, y_offset, qpix_put, chroma_put);
1715
1716         if(h->use_weight == 2){
1717             int weight0 = h->implicit_weight[refn0][refn1];
1718             int weight1 = 64 - weight0;
1719             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1722         }else{
1723             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1724                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1725                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1726             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1727                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1728                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1729             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1730                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1731                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1732         }
1733     }else{
1734         int list = list1 ? 1 : 0;
1735         int refn = h->ref_cache[list][ scan8[n] ];
1736         Picture *ref= &h->ref_list[list][refn];
1737         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1738                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1739                     qpix_put, chroma_put);
1740
1741         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1742                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1743         if(h->use_weight_chroma){
1744             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1745                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1746             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1748         }
1749     }
1750 }
1751
1752 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1753                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1754                            int x_offset, int y_offset,
1755                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1756                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1757                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1758                            int list0, int list1){
1759     if((h->use_weight==2 && list0 && list1
1760         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1761        || h->use_weight==1)
1762         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1763                          x_offset, y_offset, qpix_put, chroma_put,
1764                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1765     else
1766         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1767                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1768 }
1769
1770 static inline void prefetch_motion(H264Context *h, int list){
1771     /* fetch pixels for estimated mv 4 macroblocks ahead
1772      * optimized for 64byte cache lines */
1773     MpegEncContext * const s = &h->s;
1774     const int refn = h->ref_cache[list][scan8[0]];
1775     if(refn >= 0){
1776         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1777         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1778         uint8_t **src= h->ref_list[list][refn].data;
1779         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1780         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1781         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1782         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1783     }
1784 }
1785
1786 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1788                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1789                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1790     MpegEncContext * const s = &h->s;
1791     const int mb_xy= h->mb_xy;
1792     const int mb_type= s->current_picture.mb_type[mb_xy];
1793
1794     assert(IS_INTER(mb_type));
1795
1796     prefetch_motion(h, 0);
1797
1798     if(IS_16X16(mb_type)){
1799         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1800                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1801                 &weight_op[0], &weight_avg[0],
1802                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1803     }else if(IS_16X8(mb_type)){
1804         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1805                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1806                 &weight_op[1], &weight_avg[1],
1807                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1808         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1809                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1810                 &weight_op[1], &weight_avg[1],
1811                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1812     }else if(IS_8X16(mb_type)){
1813         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1815                 &weight_op[2], &weight_avg[2],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1818                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1819                 &weight_op[2], &weight_avg[2],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else{
1822         int i;
1823
1824         assert(IS_8X8(mb_type));
1825
1826         for(i=0; i<4; i++){
1827             const int sub_mb_type= h->sub_mb_type[i];
1828             const int n= 4*i;
1829             int x_offset= (i&1)<<2;
1830             int y_offset= (i&2)<<1;
1831
1832             if(IS_SUB_8X8(sub_mb_type)){
1833                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1834                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1835                     &weight_op[3], &weight_avg[3],
1836                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1837             }else if(IS_SUB_8X4(sub_mb_type)){
1838                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1840                     &weight_op[4], &weight_avg[4],
1841                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1842                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1843                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1844                     &weight_op[4], &weight_avg[4],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_4X8(sub_mb_type)){
1847                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1849                     &weight_op[5], &weight_avg[5],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1852                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1853                     &weight_op[5], &weight_avg[5],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else{
1856                 int j;
1857                 assert(IS_SUB_4X4(sub_mb_type));
1858                 for(j=0; j<4; j++){
1859                     int sub_x_offset= x_offset + 2*(j&1);
1860                     int sub_y_offset= y_offset +   (j&2);
1861                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1862                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1863                         &weight_op[6], &weight_avg[6],
1864                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1865                 }
1866             }
1867         }
1868     }
1869
1870     prefetch_motion(h, 1);
1871 }
1872
1873 static av_cold void decode_init_vlc(void){
1874     static int done = 0;
1875
1876     if (!done) {
1877         int i;
1878         int offset;
1879         done = 1;
1880
1881         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1882         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1883         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1884                  &chroma_dc_coeff_token_len [0], 1, 1,
1885                  &chroma_dc_coeff_token_bits[0], 1, 1,
1886                  INIT_VLC_USE_NEW_STATIC);
1887
1888         offset = 0;
1889         for(i=0; i<4; i++){
1890             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1891             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1892             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1893                      &coeff_token_len [i][0], 1, 1,
1894                      &coeff_token_bits[i][0], 1, 1,
1895                      INIT_VLC_USE_NEW_STATIC);
1896             offset += coeff_token_vlc_tables_size[i];
1897         }
1898         /*
1899          * This is a one time safety check to make sure that
1900          * the packed static coeff_token_vlc table sizes
1901          * were initialized correctly.
1902          */
1903         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1904
1905         for(i=0; i<3; i++){
1906             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1907             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1908             init_vlc(&chroma_dc_total_zeros_vlc[i],
1909                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1910                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1911                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1912                      INIT_VLC_USE_NEW_STATIC);
1913         }
1914         for(i=0; i<15; i++){
1915             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1916             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1917             init_vlc(&total_zeros_vlc[i],
1918                      TOTAL_ZEROS_VLC_BITS, 16,
1919                      &total_zeros_len [i][0], 1, 1,
1920                      &total_zeros_bits[i][0], 1, 1,
1921                      INIT_VLC_USE_NEW_STATIC);
1922         }
1923
1924         for(i=0; i<6; i++){
1925             run_vlc[i].table = run_vlc_tables[i];
1926             run_vlc[i].table_allocated = run_vlc_tables_size;
1927             init_vlc(&run_vlc[i],
1928                      RUN_VLC_BITS, 7,
1929                      &run_len [i][0], 1, 1,
1930                      &run_bits[i][0], 1, 1,
1931                      INIT_VLC_USE_NEW_STATIC);
1932         }
1933         run7_vlc.table = run7_vlc_table,
1934         run7_vlc.table_allocated = run7_vlc_table_size;
1935         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1936                  &run_len [6][0], 1, 1,
1937                  &run_bits[6][0], 1, 1,
1938                  INIT_VLC_USE_NEW_STATIC);
1939     }
1940 }
1941
1942 static void free_tables(H264Context *h){
1943     int i;
1944     H264Context *hx;
1945     av_freep(&h->intra4x4_pred_mode);
1946     av_freep(&h->chroma_pred_mode_table);
1947     av_freep(&h->cbp_table);
1948     av_freep(&h->mvd_table[0]);
1949     av_freep(&h->mvd_table[1]);
1950     av_freep(&h->direct_table);
1951     av_freep(&h->non_zero_count);
1952     av_freep(&h->slice_table_base);
1953     h->slice_table= NULL;
1954
1955     av_freep(&h->mb2b_xy);
1956     av_freep(&h->mb2b8_xy);
1957
1958     for(i = 0; i < h->s.avctx->thread_count; i++) {
1959         hx = h->thread_context[i];
1960         if(!hx) continue;
1961         av_freep(&hx->top_borders[1]);
1962         av_freep(&hx->top_borders[0]);
1963         av_freep(&hx->s.obmc_scratchpad);
1964     }
1965 }
1966
1967 static void init_dequant8_coeff_table(H264Context *h){
1968     int i,q,x;
1969     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1970     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1971     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1972
1973     for(i=0; i<2; i++ ){
1974         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1975             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1976             break;
1977         }
1978
1979         for(q=0; q<52; q++){
1980             int shift = div6[q];
1981             int idx = rem6[q];
1982             for(x=0; x<64; x++)
1983                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1984                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1985                     h->pps.scaling_matrix8[i][x]) << shift;
1986         }
1987     }
1988 }
1989
1990 static void init_dequant4_coeff_table(H264Context *h){
1991     int i,j,q,x;
1992     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1993     for(i=0; i<6; i++ ){
1994         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1995         for(j=0; j<i; j++){
1996             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1997                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1998                 break;
1999             }
2000         }
2001         if(j<i)
2002             continue;
2003
2004         for(q=0; q<52; q++){
2005             int shift = div6[q] + 2;
2006             int idx = rem6[q];
2007             for(x=0; x<16; x++)
2008                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2009                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2010                     h->pps.scaling_matrix4[i][x]) << shift;
2011         }
2012     }
2013 }
2014
2015 static void init_dequant_tables(H264Context *h){
2016     int i,x;
2017     init_dequant4_coeff_table(h);
2018     if(h->pps.transform_8x8_mode)
2019         init_dequant8_coeff_table(h);
2020     if(h->sps.transform_bypass){
2021         for(i=0; i<6; i++)
2022             for(x=0; x<16; x++)
2023                 h->dequant4_coeff[i][0][x] = 1<<6;
2024         if(h->pps.transform_8x8_mode)
2025             for(i=0; i<2; i++)
2026                 for(x=0; x<64; x++)
2027                     h->dequant8_coeff[i][0][x] = 1<<6;
2028     }
2029 }
2030
2031
2032 /**
2033  * allocates tables.
2034  * needs width/height
2035  */
2036 static int alloc_tables(H264Context *h){
2037     MpegEncContext * const s = &h->s;
2038     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2039     int x,y;
2040
2041     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2042
2043     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2044     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2045     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2046
2047     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2048     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2050     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2051
2052     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2053     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2054
2055     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2056     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2057     for(y=0; y<s->mb_height; y++){
2058         for(x=0; x<s->mb_width; x++){
2059             const int mb_xy= x + y*s->mb_stride;
2060             const int b_xy = 4*x + 4*y*h->b_stride;
2061             const int b8_xy= 2*x + 2*y*h->b8_stride;
2062
2063             h->mb2b_xy [mb_xy]= b_xy;
2064             h->mb2b8_xy[mb_xy]= b8_xy;
2065         }
2066     }
2067
2068     s->obmc_scratchpad = NULL;
2069
2070     if(!h->dequant4_coeff[0])
2071         init_dequant_tables(h);
2072
2073     return 0;
2074 fail:
2075     free_tables(h);
2076     return -1;
2077 }
2078
2079 /**
2080  * Mimic alloc_tables(), but for every context thread.
2081  */
2082 static void clone_tables(H264Context *dst, H264Context *src){
2083     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2084     dst->non_zero_count           = src->non_zero_count;
2085     dst->slice_table              = src->slice_table;
2086     dst->cbp_table                = src->cbp_table;
2087     dst->mb2b_xy                  = src->mb2b_xy;
2088     dst->mb2b8_xy                 = src->mb2b8_xy;
2089     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2090     dst->mvd_table[0]             = src->mvd_table[0];
2091     dst->mvd_table[1]             = src->mvd_table[1];
2092     dst->direct_table             = src->direct_table;
2093
2094     dst->s.obmc_scratchpad = NULL;
2095     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2096 }
2097
2098 /**
2099  * Init context
2100  * Allocate buffers which are not shared amongst multiple threads.
2101  */
2102 static int context_init(H264Context *h){
2103     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2105
2106     return 0;
2107 fail:
2108     return -1; // free_tables will clean up for us
2109 }
2110
2111 static av_cold void common_init(H264Context *h){
2112     MpegEncContext * const s = &h->s;
2113
2114     s->width = s->avctx->width;
2115     s->height = s->avctx->height;
2116     s->codec_id= s->avctx->codec->id;
2117
2118     ff_h264_pred_init(&h->hpc, s->codec_id);
2119
2120     h->dequant_coeff_pps= -1;
2121     s->unrestricted_mv=1;
2122     s->decode=1; //FIXME
2123
2124     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2125     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2126 }
2127
2128 static av_cold int decode_init(AVCodecContext *avctx){
2129     H264Context *h= avctx->priv_data;
2130     MpegEncContext * const s = &h->s;
2131
2132     MPV_decode_defaults(s);
2133
2134     s->avctx = avctx;
2135     common_init(h);
2136
2137     s->out_format = FMT_H264;
2138     s->workaround_bugs= avctx->workaround_bugs;
2139
2140     // set defaults
2141 //    s->decode_mb= ff_h263_decode_mb;
2142     s->quarter_sample = 1;
2143     s->low_delay= 1;
2144
2145     if(avctx->codec_id == CODEC_ID_SVQ3)
2146         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2147     else
2148         avctx->pix_fmt= PIX_FMT_YUV420P;
2149
2150     decode_init_vlc();
2151
2152     if(avctx->extradata_size > 0 && avctx->extradata &&
2153        *(char *)avctx->extradata == 1){
2154         h->is_avc = 1;
2155         h->got_avcC = 0;
2156     } else {
2157         h->is_avc = 0;
2158     }
2159
2160     h->thread_context[0] = h;
2161     h->outputed_poc = INT_MIN;
2162     h->prev_poc_msb= 1<<16;
2163     return 0;
2164 }
2165
2166 static int frame_start(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168     int i;
2169
2170     if(MPV_frame_start(s, s->avctx) < 0)
2171         return -1;
2172     ff_er_frame_start(s);
2173     /*
2174      * MPV_frame_start uses pict_type to derive key_frame.
2175      * This is incorrect for H.264; IDR markings must be used.
2176      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2177      * See decode_nal_units().
2178      */
2179     s->current_picture_ptr->key_frame= 0;
2180
2181     assert(s->linesize && s->uvlinesize);
2182
2183     for(i=0; i<16; i++){
2184         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2185         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2186     }
2187     for(i=0; i<4; i++){
2188         h->block_offset[16+i]=
2189         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2190         h->block_offset[24+16+i]=
2191         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2192     }
2193
2194     /* can't be in alloc_tables because linesize isn't known there.
2195      * FIXME: redo bipred weight to not require extra buffer? */
2196     for(i = 0; i < s->avctx->thread_count; i++)
2197         if(!h->thread_context[i]->s.obmc_scratchpad)
2198             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2199
2200     /* some macroblocks will be accessed before they're available */
2201     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2202         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2203
2204 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2205
2206     // We mark the current picture as non-reference after allocating it, so
2207     // that if we break out due to an error it can be released automatically
2208     // in the next MPV_frame_start().
2209     // SVQ3 as well as most other codecs have only last/next/current and thus
2210     // get released even with set reference, besides SVQ3 and others do not
2211     // mark frames as reference later "naturally".
2212     if(s->codec_id != CODEC_ID_SVQ3)
2213         s->current_picture_ptr->reference= 0;
2214
2215     s->current_picture_ptr->field_poc[0]=
2216     s->current_picture_ptr->field_poc[1]= INT_MAX;
2217     assert(s->current_picture_ptr->long_ref==0);
2218
2219     return 0;
2220 }
2221
/**
 * Saves border pixels of the current macroblock into h->left_border and
 * h->top_borders[][s->mb_x].
 *
 * Layout as written by this function:
 *  - h->top_borders[idx][mb_x]: bytes 0..15 one luma row, bytes 16..23 one
 *    cb row, bytes 24..31 one cr row.
 *  - h->left_border: luma column starting at "offset", cb column starting
 *    at "uvoffset+34", cr column starting at "uvoffset+34+18"; consecutive
 *    rows are "step" entries apart.
 *
 * @param src_y,src_cb,src_cr pointers to the macroblock in the three planes
 *        (hl_decode_mb_internal() passes its dest_y/dest_cb/dest_cr)
 * @param linesize,uvlinesize row strides used to step through the planes
 * @param simple if non-zero, the MBAFF handling is skipped and chroma is
 *        always saved
 */
2222 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2223     MpegEncContext * const s = &h->s;
2224     int i;
2225     int step    = 1;
2226     int offset  = 1;
2227     int uvoffset= 1;
2228     int top_idx = 1;
2229     int skiplast= 0;
2230
     /* Move up one row: from here on row index i addresses the row i-1 of
      * the macroblock, so index 16 (luma) / 8 (chroma) is its last row. */
2231     src_y  -=   linesize;
2232     src_cb -= uvlinesize;
2233     src_cr -= uvlinesize;
2234
     /* In MBAFF frames the start offsets, the row step and the top border
      * slot depend on the parity of mb_y and on MB_MBAFF. */
2235     if(!simple && FRAME_MBAFF){
2236         if(s->mb_y&1){
2237             offset  = MB_MBAFF ? 1 : 17;
2238             uvoffset= MB_MBAFF ? 1 : 9;
2239             if(!MB_MBAFF){
                 /* additionally store row index 15 (luma) / 7 (chroma) in slot 0 */
2240                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2242                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2243                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2245                 }
2246             }
2247         }else{
2248             if(!MB_MBAFF){
                 /* entry 0 of each left border column comes from slot 0; the
                  * last row is left out of the loops below (skiplast) */
2249                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2250                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2251                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2252                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2253                 }
2254                 skiplast= 1;
2255             }
2256             offset  =
2257             uvoffset=
2258             top_idx = MB_MBAFF ? 0 : 1;
2259         }
2260         step= MB_MBAFF ? 2 : 1;
2261     }
2262
2263     // There are two lines saved, the line above the top macroblock of a pair,
2264     // and the line above the bottom macroblock
     /* The first left border entry is the last luma byte of the top border
      * row saved earlier; it has to be read before that row is overwritten
      * below. The remaining entries are column 15 of the macroblock rows. */
2265     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2266     for(i=1; i<17 - skiplast; i++){
2267         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2268     }
2269
     /* last luma row of the macroblock, copied as two 64-bit words
      * NOTE(review): presumably both sides are suitably aligned for the
      * uint64_t accesses -- confirm */
2270     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2272
     /* same scheme for both chroma planes, using column 7 and row index 8 */
2273     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2274         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2275         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2276         for(i=1; i<9 - skiplast; i++){
2277             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2278             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2279         }
2280         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2282     }
2283 }
2284
/**
 * Exchanges the pixels to the left of and above the current macroblock in
 * the picture with the values stored in h->left_border and
 * h->top_borders[][] (the buffers written by backup_mb_border()).
 *
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction
 * and with xchg=0 afterwards, both only when h->deblocking_filter is set.
 *
 * @param src_y,src_cb,src_cr pointers to the macroblock in the three planes
 * @param linesize,uvlinesize row strides used to step through the planes
 * @param xchg if non-zero, values are swapped in both directions; if zero,
 *        the entries handled with this flag are only copied from the
 *        border buffers into the picture (see XCHG below). Some entries
 *        are always swapped regardless of this flag.
 * @param simple if non-zero, the MBAFF handling is skipped and chroma is
 *        always processed
 */
2285 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2286     MpegEncContext * const s = &h->s;
2287     int temp8, i;
2288     uint64_t temp64;
2289     int deblock_left;
2290     int deblock_top;
2291     int mb_xy;
2292     int step    = 1;
2293     int offset  = 1;
2294     int uvoffset= 1;
2295     int top_idx = 1;
2296
     /* same offset/step/slot selection as in backup_mb_border() */
2297     if(!simple && FRAME_MBAFF){
2298         if(s->mb_y&1){
2299             offset  = MB_MBAFF ? 1 : 17;
2300             uvoffset= MB_MBAFF ? 1 : 9;
2301         }else{
2302             offset  =
2303             uvoffset=
2304             top_idx = MB_MBAFF ? 0 : 1;
2305         }
2306         step= MB_MBAFF ? 2 : 1;
2307     }
2308
     /* With deblocking_filter == 2 a border is only handled if the
      * neighbouring macroblock has the same slice_table entry; otherwise
      * only the position inside the picture decides. */
2309     if(h->deblocking_filter == 2) {
2310         mb_xy = h->mb_xy;
2311         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2312         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2313     } else {
2314         deblock_left = (s->mb_x > 0);
2315         deblock_top =  (s->mb_y > !!MB_FIELD);
2316     }
2317
     /* move to the pixel one row above and one column left of the macroblock */
2318     src_y  -=   linesize + 1;
2319     src_cb -= uvlinesize + 1;
2320     src_cr -= uvlinesize + 1;
2321
     /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives
      * the old value of b only if xchg is non-zero. t is a temporary.
      * Note that the macro is not #undef'd at the end of this function. */
2322 #define XCHG(a,b,t,xchg)\
2323 t= a;\
2324 if(xchg)\
2325     a= b;\
2326 b= t;
2327
     /* luma left column; row 0 (the corner) is skipped unless the top border
      * is handled too, and the entry after the loop (i == 16) is always
      * swapped */
2328     if(deblock_left){
2329         for(i = !deblock_top; i<16; i++){
2330             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2331         }
2332         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2333     }
2334
     /* luma top row: the first 8 bytes honour xchg, the next 8 bytes and
      * the first 8 bytes of the next macroblock's top border (if there is
      * a next macroblock in this row) are always swapped */
2335     if(deblock_top){
2336         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2338         if(s->mb_x+1 < s->mb_width){
2339             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2340         }
2341     }
2342
     /* chroma: left columns like luma (8 rows, entry i == 8 always swapped),
      * top rows always swapped */
2343     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2344         if(deblock_left){
2345             for(i = !deblock_top; i<8; i++){
2346                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2347                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2348             }
2349             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2350             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2351         }
2352         if(deblock_top){
2353             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2355         }
2356     }
2357 }
2358
/**
 * Reconstructs the current macroblock (h->mb_xy) into s->current_picture:
 * intra prediction or motion compensation, addition of the residual held
 * in h->mb and, if h->deblocking_filter is set, border backup and loop
 * filtering.
 *
 * @param simple if non-zero, the branches guarded by !simple are skipped
 *        (MB_FIELD/MBAFF handling, intra PCM, transform bypass), chroma is
 *        always processed and the codec is treated as H.264. The two
 *        callers hl_decode_mb_simple() and hl_decode_mb_complex() pass the
 *        constants 1 and 0.
 */
2359 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2360     MpegEncContext * const s = &h->s;
2361     const int mb_x= s->mb_x;
2362     const int mb_y= s->mb_y;
2363     const int mb_xy= h->mb_xy;
2364     const int mb_type= s->current_picture.mb_type[mb_xy];
2365     uint8_t  *dest_y, *dest_cb, *dest_cr;
2366     int linesize, uvlinesize /*dct_offset*/;
2367     int i;
2368     int *block_offset = &h->block_offset[0];
2369     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2370     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2371     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2372     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2373
2374     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2375     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2376     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2377
2378     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2379     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2380
     /* Field macroblocks use twice the row stride and the second half of
      * the block_offset table (the entries from index 24 on); for odd
      * mb_y the destination pointers are moved up by 15 luma / 7 chroma
      * rows. */
2381     if (!simple && MB_FIELD) {
2382         linesize   = h->mb_linesize   = s->linesize * 2;
2383         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2384         block_offset = &h->block_offset[24];
2385         if(mb_y&1){ //FIXME move out of this function?
2386             dest_y -= s->linesize*15;
2387             dest_cb-= s->uvlinesize*7;
2388             dest_cr-= s->uvlinesize*7;
2389         }
2390         if(FRAME_MBAFF) {
2391             int list;
             /* rewrite the cached reference indices of the used lists to
              * (16+ref)^(mb_y&1) */
2392             for(list=0; list<h->list_count; list++){
2393                 if(!USES_LIST(mb_type, list))
2394                     continue;
2395                 if(IS_16X16(mb_type)){
2396                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2397                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2398                 }else{
2399                     for(i=0; i<16; i+=4){
2400                         int ref = h->ref_cache[list][scan8[i]];
2401                         if(ref >= 0)
2402                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2403                     }
2404                 }
2405             }
2406         }
2407     } else {
2408         linesize   = h->mb_linesize   = s->linesize;
2409         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2410 //        dct_offset = s->linesize * 16;
2411     }
2412
     /* Intra PCM: the samples are copied from h->mb without prediction
      * or transform (16 bytes per luma row, 8 bytes per chroma row). */
2413     if (!simple && IS_INTRA_PCM(mb_type)) {
2414         for (i=0; i<16; i++) {
2415             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2416         }
2417         for (i=0; i<8; i++) {
2418             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2419             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2420         }
2421     } else {
2422         if(IS_INTRA(mb_type)){
             /* bring the saved border pixels into the picture for the
              * duration of the intra prediction; undone by the second
              * xchg_mb_border() call below */
2423             if(h->deblocking_filter)
2424                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2425
2426             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2427                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2428                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2429             }
2430
             /* intra 4x4 / 8x8: predict each block, then add its residual */
2431             if(IS_INTRA4x4(mb_type)){
2432                 if(simple || !s->encoding){
2433                     if(IS_8x8DCT(mb_type)){
2434                         if(transform_bypass){
2435                             idct_dc_add =
2436                             idct_add    = s->dsp.add_pixels8;
2437                         }else{
2438                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2439                             idct_add    = s->dsp.h264_idct8_add;
2440                         }
2441                         for(i=0; i<16; i+=4){
2442                             uint8_t * const ptr= dest_y + block_offset[i];
2443                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2444                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2445                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2446                             }else{
2447                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2448                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2449                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2450                                 if(nnz){
2451                                     if(nnz == 1 && h->mb[i*16])
2452                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2453                                     else
2454                                         idct_add   (ptr, h->mb + i*16, linesize);
2455                                 }
2456                             }
2457                         }
2458                     }else{
2459                         if(transform_bypass){
2460                             idct_dc_add =
2461                             idct_add    = s->dsp.add_pixels4;
2462                         }else{
2463                             idct_dc_add = s->dsp.h264_idct_dc_add;
2464                             idct_add    = s->dsp.h264_idct_add;
2465                         }
2466                         for(i=0; i<16; i++){
2467                             uint8_t * const ptr= dest_y + block_offset[i];
2468                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2469
2470                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2471                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2472                             }else{
2473                                 uint8_t *topright;
2474                                 int nnz, tr;
                                 /* these two modes read the top-right samples;
                                  * if they are unavailable, the last sample of
                                  * the row above is replicated four times */
2475                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2476                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2477                                     assert(mb_y || linesize <= block_offset[i]);
2478                                     if(!topright_avail){
2479                                         tr= ptr[3 - linesize]*0x01010101;
2480                                         topright= (uint8_t*) &tr;
2481                                     }else
2482                                         topright= ptr + 4 - linesize;
2483                                 }else
2484                                     topright= NULL;
2485
2486                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2487                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2488                                 if(nnz){
2489                                     if(is_h264){
2490                                         if(nnz == 1 && h->mb[i*16])
2491                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2492                                         else
2493                                             idct_add   (ptr, h->mb + i*16, linesize);
2494                                     }else
2495                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2496                                 }
2497                             }
2498                         }
2499                     }
2500                 }
2501             }else{
                 /* intra 16x16: predict the whole luma block and process the
                  * luma DC coefficients; the residual is added further below */
2502                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2503                 if(is_h264){
2504                     if(!transform_bypass)
2505                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2506                 }else
2507                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2508             }
2509             if(h->deblocking_filter)
2510                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2511         }else if(is_h264){
2512             hl_motion(h, dest_y, dest_cb, dest_cr,
2513                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2514                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2515                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2516         }
2517
2518
         /* luma residual for everything except intra 4x4 (handled above) */
2519         if(!IS_INTRA4x4(mb_type)){
2520             if(is_h264){
2521                 if(IS_INTRA16x16(mb_type)){
2522                     if(transform_bypass){
2523                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2524                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2525                         }else{
2526                             for(i=0; i<16; i++){
2527                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2528                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2529                             }
2530                         }
2531                     }else{
2532                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2533                     }
2534                 }else if(h->cbp&15){
2535                     if(transform_bypass){
2536                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2537                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2538                         for(i=0; i<16; i+=di){
2539                             if(h->non_zero_count_cache[ scan8[i] ]){
2540                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2541                             }
2542                         }
2543                     }else{
2544                         if(IS_8x8DCT(mb_type)){
2545                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2546                         }else{
2547                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2548                         }
2549                     }
2550                 }
2551             }else{
2552                 for(i=0; i<16; i++){
2553                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2554                         uint8_t * const ptr= dest_y + block_offset[i];
2555                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2556                     }
2557                 }
2558             }
2559         }
2560
         /* chroma residual, only if the cbp has one of the bits 0x30 set;
          * blocks 16..19 go to dest_cb, blocks 20..23 to dest_cr */
2561         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2562             uint8_t *dest[2] = {dest_cb, dest_cr};
2563             if(transform_bypass){
2564                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2565                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2566                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2567                 }else{
2568                     idct_add = s->dsp.add_pixels4;
2569                     for(i=16; i<16+8; i++){
2570                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2571                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2572                     }
2573                 }
2574             }else{
2575                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2576                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2577                 if(is_h264){
2578                     idct_add = s->dsp.h264_idct_add;
2579                     idct_dc_add = s->dsp.h264_idct_dc_add;
2580                     for(i=16; i<16+8; i++){
2581                         if(h->non_zero_count_cache[ scan8[i] ])
2582                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2583                         else if(h->mb[i*16])
2584                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2585                     }
2586                 }else{
2587                     for(i=16; i<16+8; i++){
2588                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2589                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2590                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2591                         }
2592                     }
2593                 }
2594             }
2595         }
2596     }
     /* Loop filter: save the unfiltered borders first, refill the caches,
      * recompute the chroma qp from the stored qscale, then filter
      * (filter_mb() for MBAFF frames in the non-simple case,
      * filter_mb_fast() otherwise). */
2597     if(h->deblocking_filter) {
2598         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2599         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2600         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2601         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2602         if (!simple && FRAME_MBAFF) {
2603             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2604         } else {
2605             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2606         }
2607     }
2608 }
2609
2610 /**
2611  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2612  */
2613 static void hl_decode_mb_simple(H264Context *h){
2614     hl_decode_mb_internal(h, 1);
2615 }
2616
2617 /**
2618  * Process a macroblock; this handles edge cases, such as interlacing.
2619  */
2620 static void av_noinline hl_decode_mb_complex(H264Context *h){
2621     hl_decode_mb_internal(h, 0);
2622 }
2623
2624 static void hl_decode_mb(H264Context *h){
2625     MpegEncContext * const s = &h->s;
2626     const int mb_xy= h->mb_xy;
2627     const int mb_type= s->current_picture.mb_type[mb_xy];
2628     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2629
2630     if(ENABLE_H264_ENCODER && !s->decode)
2631         return;
2632
2633     if (is_complex)
2634         hl_decode_mb_complex(h);
2635     else hl_decode_mb_simple(h);
2636 }
2637
2638 static void pic_as_field(Picture *pic, const int parity){
2639     int i;
2640     for (i = 0; i < 4; ++i) {
2641         if (parity == PICT_BOTTOM_FIELD)
2642             pic->data[i] += pic->linesize[i];
2643         pic->reference = parity;
2644         pic->linesize[i] *= 2;
2645     }
2646     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2647 }
2648
2649 static int split_field_copy(Picture *dest, Picture *src,
2650                             int parity, int id_add){
2651     int match = !!(src->reference & parity);
2652
2653     if (match) {
2654         *dest = *src;
2655         if(parity != PICT_FRAME){
2656             pic_as_field(dest, parity);
2657             dest->pic_id *= 2;
2658             dest->pic_id += id_add;
2659         }
2660     }
2661
2662     return match;
2663 }
2664
2665 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2666     int i[2]={0};
2667     int index=0;
2668
2669     while(i[0]<len || i[1]<len){
2670         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2671             i[0]++;
2672         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2673             i[1]++;
2674         if(i[0] < len){
2675             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2676             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2677         }
2678         if(i[1] < len){
2679             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2680             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2681         }
2682     }
2683
2684     return index;
2685 }
2686
2687 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2688     int i, best_poc;
2689     int out_i= 0;
2690
2691     for(;;){
2692         best_poc= dir ? INT_MIN : INT_MAX;
2693
2694         for(i=0; i<len; i++){
2695             const int poc= src[i]->poc;
2696             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2697                 best_poc= poc;
2698                 sorted[out_i]= src[i];
2699             }
2700         }
2701         if(best_poc == (dir ? INT_MIN : INT_MAX))
2702             break;
2703         limit= sorted[out_i++]->poc - dir;
2704     }
2705     return out_i;
2706 }
2707
2708 /**
2709  * fills the default_ref_list.
2710  */
2711 static int fill_default_ref_list(H264Context *h){
2712     MpegEncContext * const s = &h->s;
2713     int i, len;
2714
2715     if(h->slice_type_nos==FF_B_TYPE){
2716         Picture *sorted[32];
2717         int cur_poc, list;
2718         int lens[2];
2719
2720         if(FIELD_PICTURE)
2721             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2722         else
2723             cur_poc= s->current_picture_ptr->poc;
2724
2725         for(list= 0; list<2; list++){
2726             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2727             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2728             assert(len<=32);
2729             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2730             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2731             assert(len<=32);
2732
2733             if(len < h->ref_count[list])
2734                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2735             lens[list]= len;
2736         }
2737
2738         if(lens[0] == lens[1] && lens[1] > 1){
2739             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2740             if(i == lens[0])
2741                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2742         }
2743     }else{
2744         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2745         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2746         assert(len <= 32);
2747         if(len < h->ref_count[0])
2748             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2749     }
2750 #ifdef TRACE
2751     for (i=0; i<h->ref_count[0]; i++) {
2752         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2753     }
2754     if(h->slice_type_nos==FF_B_TYPE){
2755         for (i=0; i<h->ref_count[1]; i++) {
2756             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2757         }
2758     }
2759 #endif
2760     return 0;
2761 }
2762
/* Forward declarations: these debug helpers are defined further down in this
 * file but are already called by decode_ref_pic_list_reordering(). */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
2765
2766 /**
2767  * Extract structure information about the picture described by pic_num in
2768  * the current decoding context (frame or field). Note that pic_num is
2769  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2770  * @param pic_num picture number for which to extract structure information
2771  * @param structure one of PICT_XXX describing structure of picture
2772  *                      with pic_num
2773  * @return frame number (short term) or long term index of picture
2774  *         described by pic_num
2775  */
2776 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2777     MpegEncContext * const s = &h->s;
2778
2779     *structure = s->picture_structure;
2780     if(FIELD_PICTURE){
2781         if (!(pic_num & 1))
2782             /* opposite field */
2783             *structure ^= PICT_FRAME;
2784         pic_num >>= 1;
2785     }
2786
2787     return pic_num;
2788 }
2789
2790 static int decode_ref_pic_list_reordering(H264Context *h){
2791     MpegEncContext * const s = &h->s;
2792     int list, index, pic_structure;
2793
2794     print_short_term(h);
2795     print_long_term(h);
2796
2797     for(list=0; list<h->list_count; list++){
2798         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2799
2800         if(get_bits1(&s->gb)){
2801             int pred= h->curr_pic_num;
2802
2803             for(index=0; ; index++){
2804                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2805                 unsigned int pic_id;
2806                 int i;
2807                 Picture *ref = NULL;
2808
2809                 if(reordering_of_pic_nums_idc==3)
2810                     break;
2811
2812                 if(index >= h->ref_count[list]){
2813                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2814                     return -1;
2815                 }
2816
2817                 if(reordering_of_pic_nums_idc<3){
2818                     if(reordering_of_pic_nums_idc<2){
2819                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2820                         int frame_num;
2821
2822                         if(abs_diff_pic_num > h->max_pic_num){
2823                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2824                             return -1;
2825                         }
2826
2827                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2828                         else                                pred+= abs_diff_pic_num;
2829                         pred &= h->max_pic_num - 1;
2830
2831                         frame_num = pic_num_extract(h, pred, &pic_structure);
2832
2833                         for(i= h->short_ref_count-1; i>=0; i--){
2834                             ref = h->short_ref[i];
2835                             assert(ref->reference);
2836                             assert(!ref->long_ref);
2837                             if(
2838                                    ref->frame_num == frame_num &&
2839                                    (ref->reference & pic_structure)
2840                               )
2841                                 break;
2842                         }
2843                         if(i>=0)
2844                             ref->pic_id= pred;
2845                     }else{
2846                         int long_idx;
2847                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2848
2849                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2850
2851                         if(long_idx>31){
2852                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2853                             return -1;
2854                         }
2855                         ref = h->long_ref[long_idx];
2856                         assert(!(ref && !ref->reference));
2857                         if(ref && (ref->reference & pic_structure)){
2858                             ref->pic_id= pic_id;
2859                             assert(ref->long_ref);
2860                             i=0;
2861                         }else{
2862                             i=-1;
2863                         }
2864                     }
2865
2866                     if (i < 0) {
2867                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2868                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2869                     } else {
2870                         for(i=index; i+1<h->ref_count[list]; i++){
2871                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2872                                 break;
2873                         }
2874                         for(; i > index; i--){
2875                             h->ref_list[list][i]= h->ref_list[list][i-1];
2876                         }
2877                         h->ref_list[list][index]= *ref;
2878                         if (FIELD_PICTURE){
2879                             pic_as_field(&h->ref_list[list][index], pic_structure);
2880                         }
2881                     }
2882                 }else{
2883                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2884                     return -1;
2885                 }
2886             }
2887         }
2888     }
2889     for(list=0; list<h->list_count; list++){
2890         for(index= 0; index < h->ref_count[list]; index++){
2891             if(!h->ref_list[list][index].data[0]){
2892                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2893                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2894             }
2895         }
2896     }
2897
2898     return 0;
2899 }
2900
2901 static void fill_mbaff_ref_list(H264Context *h){
2902     int list, i, j;
2903     for(list=0; list<2; list++){ //FIXME try list_count
2904         for(i=0; i<h->ref_count[list]; i++){
2905             Picture *frame = &h->ref_list[list][i];
2906             Picture *field = &h->ref_list[list][16+2*i];
2907             field[0] = *frame;
2908             for(j=0; j<3; j++)
2909                 field[0].linesize[j] <<= 1;
2910             field[0].reference = PICT_TOP_FIELD;
2911             field[0].poc= field[0].field_poc[0];
2912             field[1] = field[0];
2913             for(j=0; j<3; j++)
2914                 field[1].data[j] += frame->linesize[j];
2915             field[1].reference = PICT_BOTTOM_FIELD;
2916             field[1].poc= field[1].field_poc[1];
2917
2918             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2919             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2920             for(j=0; j<2; j++){
2921                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2922                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2923             }
2924         }
2925     }
2926     for(j=0; j<h->ref_count[1]; j++){
2927         for(i=0; i<h->ref_count[0]; i++)
2928             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2929         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2930         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2931     }
2932 }
2933
2934 static int pred_weight_table(H264Context *h){
2935     MpegEncContext * const s = &h->s;
2936     int list, i;
2937     int luma_def, chroma_def;
2938
2939     h->use_weight= 0;
2940     h->use_weight_chroma= 0;
2941     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2942     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2943     luma_def = 1<<h->luma_log2_weight_denom;
2944     chroma_def = 1<<h->chroma_log2_weight_denom;
2945
2946     for(list=0; list<2; list++){
2947         for(i=0; i<h->ref_count[list]; i++){
2948             int luma_weight_flag, chroma_weight_flag;
2949
2950             luma_weight_flag= get_bits1(&s->gb);
2951             if(luma_weight_flag){
2952                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2953                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2954                 if(   h->luma_weight[list][i] != luma_def
2955                    || h->luma_offset[list][i] != 0)
2956                     h->use_weight= 1;
2957             }else{
2958                 h->luma_weight[list][i]= luma_def;
2959                 h->luma_offset[list][i]= 0;
2960             }
2961
2962             if(CHROMA){
2963                 chroma_weight_flag= get_bits1(&s->gb);
2964                 if(chroma_weight_flag){
2965                     int j;
2966                     for(j=0; j<2; j++){
2967                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2968                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2969                         if(   h->chroma_weight[list][i][j] != chroma_def
2970                         || h->chroma_offset[list][i][j] != 0)
2971                             h->use_weight_chroma= 1;
2972                     }
2973                 }else{
2974                     int j;
2975                     for(j=0; j<2; j++){
2976                         h->chroma_weight[list][i][j]= chroma_def;
2977                         h->chroma_offset[list][i][j]= 0;
2978                     }
2979                 }
2980             }
2981         }
2982         if(h->slice_type_nos != FF_B_TYPE) break;
2983     }
2984     h->use_weight= h->use_weight || h->use_weight_chroma;
2985     return 0;
2986 }
2987
2988 static void implicit_weight_table(H264Context *h){
2989     MpegEncContext * const s = &h->s;
2990     int ref0, ref1;
2991     int cur_poc = s->current_picture_ptr->poc;
2992
2993     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2994        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2995         h->use_weight= 0;
2996         h->use_weight_chroma= 0;
2997         return;
2998     }
2999
3000     h->use_weight= 2;
3001     h->use_weight_chroma= 2;
3002     h->luma_log2_weight_denom= 5;
3003     h->chroma_log2_weight_denom= 5;
3004
3005     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3006         int poc0 = h->ref_list[0][ref0].poc;
3007         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3008             int poc1 = h->ref_list[1][ref1].poc;
3009             int td = av_clip(poc1 - poc0, -128, 127);
3010             if(td){
3011                 int tb = av_clip(cur_poc - poc0, -128, 127);
3012                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3013                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3014                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3015                     h->implicit_weight[ref0][ref1] = 32;
3016                 else
3017                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3018             }else
3019                 h->implicit_weight[ref0][ref1] = 32;
3020         }
3021     }
3022 }
3023
3024 /**
3025  * Mark a picture as no longer needed for reference. The refmask
3026  * argument allows unreferencing of individual fields or the whole frame.
3027  * If the picture becomes entirely unreferenced, but is being held for
3028  * display purposes, it is marked as such.
3029  * @param refmask mask of fields to unreference; the mask is bitwise
3030  *                anded with the reference marking of pic
3031  * @return non-zero if pic becomes entirely unreferenced (except possibly
3032  *         for display purposes) zero if one of the fields remains in
3033  *         reference
3034  */
3035 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3036     int i;
3037     if (pic->reference &= refmask) {
3038         return 0;
3039     } else {
3040         for(i = 0; h->delayed_pic[i]; i++)
3041             if(pic == h->delayed_pic[i]){
3042                 pic->reference=DELAYED_PIC_REF;
3043                 break;
3044             }
3045         return 1;
3046     }
3047 }
3048
3049 /**
3050  * instantaneous decoder refresh.
3051  */
3052 static void idr(H264Context *h){
3053     int i;
3054
3055     for(i=0; i<16; i++){
3056         remove_long(h, i, 0);
3057     }
3058     assert(h->long_ref_count==0);
3059
3060     for(i=0; i<h->short_ref_count; i++){
3061         unreference_pic(h, h->short_ref[i], 0);
3062         h->short_ref[i]= NULL;
3063     }
3064     h->short_ref_count=0;
3065     h->prev_frame_num= 0;
3066     h->prev_frame_num_offset= 0;
3067     h->prev_poc_msb=
3068     h->prev_poc_lsb= 0;
3069 }
3070
3071 /* forget old pics after a seek */
3072 static void flush_dpb(AVCodecContext *avctx){
3073     H264Context *h= avctx->priv_data;
3074     int i;
3075     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3076         if(h->delayed_pic[i])
3077             h->delayed_pic[i]->reference= 0;
3078         h->delayed_pic[i]= NULL;
3079     }
3080     h->outputed_poc= INT_MIN;
3081     idr(h);
3082     if(h->s.current_picture_ptr)
3083         h->s.current_picture_ptr->reference= 0;
3084     h->s.first_field= 0;
3085     ff_mpeg_flush(avctx);
3086 }
3087
3088 /**
3089  * Find a Picture in the short term reference list by frame number.
3090  * @param frame_num frame number to search for
3091  * @param idx the index into h->short_ref where returned picture is found
3092  *            undefined if no picture found.
3093  * @return pointer to the found picture, or NULL if no pic with the provided
3094  *                 frame number is found
3095  */
3096 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3097     MpegEncContext * const s = &h->s;
3098     int i;
3099
3100     for(i=0; i<h->short_ref_count; i++){
3101         Picture *pic= h->short_ref[i];
3102         if(s->avctx->debug&FF_DEBUG_MMCO)
3103             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3104         if(pic->frame_num == frame_num) {
3105             *idx = i;
3106             return pic;
3107         }
3108     }
3109     return NULL;
3110 }
3111
3112 /**
3113  * Remove a picture from the short term reference list by its index in
3114  * that list.  This does no checking on the provided index; it is assumed
3115  * to be valid. Other list entries are shifted down.
3116  * @param i index into h->short_ref of picture to remove.
3117  */
3118 static void remove_short_at_index(H264Context *h, int i){
3119     assert(i >= 0 && i < h->short_ref_count);
3120     h->short_ref[i]= NULL;
3121     if (--h->short_ref_count)
3122         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3123 }
3124
3125 /**
3126  *
3127  * @return the removed picture or NULL if an error occurs
3128  */
3129 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3130     MpegEncContext * const s = &h->s;
3131     Picture *pic;
3132     int i;
3133
3134     if(s->avctx->debug&FF_DEBUG_MMCO)
3135         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3136
3137     pic = find_short(h, frame_num, &i);
3138     if (pic){
3139         if(unreference_pic(h, pic, ref_mask))
3140         remove_short_at_index(h, i);
3141     }
3142
3143     return pic;
3144 }
3145
3146 /**
3147  * Remove a picture from the long term reference list by its index in
3148  * that list.
3149  * @return the removed picture or NULL if an error occurs
3150  */
3151 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3152     Picture *pic;
3153
3154     pic= h->long_ref[i];
3155     if (pic){
3156         if(unreference_pic(h, pic, ref_mask)){
3157             assert(h->long_ref[i]->long_ref == 1);
3158             h->long_ref[i]->long_ref= 0;
3159             h->long_ref[i]= NULL;
3160             h->long_ref_count--;
3161         }
3162     }
3163
3164     return pic;
3165 }
3166
3167 /**
3168  * print short term list
3169  */
3170 static void print_short_term(H264Context *h) {
3171     uint32_t i;
3172     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3173         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3174         for(i=0; i<h->short_ref_count; i++){
3175             Picture *pic= h->short_ref[i];
3176             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3177         }
3178     }
3179 }
3180
3181 /**
3182  * print long term list
3183  */
3184 static void print_long_term(H264Context *h) {
3185     uint32_t i;
3186     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3187         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3188         for(i = 0; i < 16; i++){
3189             Picture *pic= h->long_ref[i];
3190             if (pic) {
3191                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3192             }
3193         }
3194     }
3195 }
3196
3197 /**
3198  * Executes the reference picture marking (memory management control operations).
3199  */
3200 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3201     MpegEncContext * const s = &h->s;
3202     int i, j;
3203     int current_ref_assigned=0;
3204     Picture *pic;
3205
3206     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3207         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3208
3209     for(i=0; i<mmco_count; i++){
3210         int structure, frame_num;
3211         if(s->avctx->debug&FF_DEBUG_MMCO)
3212             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3213
3214         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3215            || mmco[i].opcode == MMCO_SHORT2LONG){
3216             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3217             pic = find_short(h, frame_num, &j);
3218             if(!pic){
3219                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3220                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3221                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3222                 continue;
3223             }
3224         }
3225
3226         switch(mmco[i].opcode){
3227         case MMCO_SHORT2UNUSED:
3228             if(s->avctx->debug&FF_DEBUG_MMCO)
3229                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3230             remove_short(h, frame_num, structure ^ PICT_FRAME);
3231             break;
3232         case MMCO_SHORT2LONG:
3233                 if (h->long_ref[mmco[i].long_arg] != pic)
3234                     remove_long(h, mmco[i].long_arg, 0);
3235
3236                 remove_short_at_index(h, j);
3237                 h->long_ref[ mmco[i].long_arg ]= pic;
3238                 if (h->long_ref[ mmco[i].long_arg ]){
3239                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3240                     h->long_ref_count++;
3241                 }
3242             break;
3243         case MMCO_LONG2UNUSED:
3244             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3245             pic = h->long_ref[j];
3246             if (pic) {
3247                 remove_long(h, j, structure ^ PICT_FRAME);
3248             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3249                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3250             break;
3251         case MMCO_LONG:
3252                     // Comment below left from previous code as it is an interresting note.
3253                     /* First field in pair is in short term list or
3254                      * at a different long term index.
3255                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3256                      * Report the problem and keep the pair where it is,
3257                      * and mark this field valid.
3258                      */
3259
3260             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3261                 remove_long(h, mmco[i].long_arg, 0);
3262
3263                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3264                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3265                 h->long_ref_count++;
3266             }
3267
3268             s->current_picture_ptr->reference |= s->picture_structure;
3269             current_ref_assigned=1;
3270             break;
3271         case MMCO_SET_MAX_LONG:
3272             assert(mmco[i].long_arg <= 16);
3273             // just remove the long term which index is greater than new max
3274             for(j = mmco[i].long_arg; j<16; j++){
3275                 remove_long(h, j, 0);
3276             }
3277             break;
3278         case MMCO_RESET:
3279             while(h->short_ref_count){
3280                 remove_short(h, h->short_ref[0]->frame_num, 0);
3281             }
3282             for(j = 0; j < 16; j++) {
3283                 remove_long(h, j, 0);
3284             }
3285             s->current_picture_ptr->poc=
3286             s->current_picture_ptr->field_poc[0]=
3287             s->current_picture_ptr->field_poc[1]=
3288             h->poc_lsb=
3289             h->poc_msb=
3290             h->frame_num=
3291             s->current_picture_ptr->frame_num= 0;
3292             break;
3293         default: assert(0);
3294         }
3295     }
3296
3297     if (!current_ref_assigned) {
3298         /* Second field of complementary field pair; the first field of
3299          * which is already referenced. If short referenced, it
3300          * should be first entry in short_ref. If not, it must exist
3301          * in long_ref; trying to put it on the short list here is an
3302          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3303          */
3304         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3305             /* Just mark the second field valid */
3306             s->current_picture_ptr->reference = PICT_FRAME;
3307         } else if (s->current_picture_ptr->long_ref) {
3308             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3309                                              "assignment for second field "
3310                                              "in complementary field pair "
3311                                              "(first field is long term)\n");
3312         } else {
3313             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3314             if(pic){
3315                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3316             }
3317
3318             if(h->short_ref_count)
3319                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3320
3321             h->short_ref[0]= s->current_picture_ptr;
3322             h->short_ref_count++;
3323             s->current_picture_ptr->reference |= s->picture_structure;
3324         }
3325     }
3326
3327     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3328
3329         /* We have too many reference frames, probably due to corrupted
3330          * stream. Need to discard one frame. Prevents overrun of the
3331          * short_ref and long_ref buffers.
3332          */
3333         av_log(h->s.avctx, AV_LOG_ERROR,
3334                "number of reference frames exceeds max (probably "
3335                "corrupt input), discarding one\n");
3336
3337         if (h->long_ref_count && !h->short_ref_count) {
3338             for (i = 0; i < 16; ++i)
3339                 if (h->long_ref[i])
3340                     break;
3341
3342             assert(i < 16);
3343             remove_long(h, i, 0);
3344         } else {
3345             pic = h->short_ref[h->short_ref_count - 1];
3346             remove_short(h, pic->frame_num, 0);
3347         }
3348     }
3349
3350     print_short_term(h);
3351     print_long_term(h);
3352     return 0;
3353 }
3354
3355 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3356     MpegEncContext * const s = &h->s;
3357     int i;
3358
3359     h->mmco_index= 0;
3360     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3361         s->broken_link= get_bits1(gb) -1;
3362         if(get_bits1(gb)){
3363             h->mmco[0].opcode= MMCO_LONG;
3364             h->mmco[0].long_arg= 0;
3365             h->mmco_index= 1;
3366         }
3367     }else{
3368         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3369             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3370                 MMCOOpcode opcode= get_ue_golomb(gb);
3371
3372                 h->mmco[i].opcode= opcode;
3373                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3374                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3375 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3376                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3377                         return -1;
3378                     }*/
3379                 }
3380                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3381                     unsigned int long_arg= get_ue_golomb(gb);
3382                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3383                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3384                         return -1;
3385                     }
3386                     h->mmco[i].long_arg= long_arg;
3387                 }
3388
3389                 if(opcode > (unsigned)MMCO_LONG){
3390                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3391                     return -1;
3392                 }
3393                 if(opcode == MMCO_END)
3394                     break;
3395             }
3396             h->mmco_index= i;
3397         }else{
3398             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3399
3400             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3401                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3402                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3403                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3404                 h->mmco_index= 1;
3405                 if (FIELD_PICTURE) {
3406                     h->mmco[0].short_pic_num *= 2;
3407                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3408                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3409                     h->mmco_index= 2;
3410                 }
3411             }
3412         }
3413     }
3414
3415     return 0;
3416 }
3417
3418 static int init_poc(H264Context *h){
3419     MpegEncContext * const s = &h->s;
3420     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3421     int field_poc[2];
3422     Picture *cur = s->current_picture_ptr;
3423
3424     h->frame_num_offset= h->prev_frame_num_offset;
3425     if(h->frame_num < h->prev_frame_num)
3426         h->frame_num_offset += max_frame_num;
3427
3428     if(h->sps.poc_type==0){
3429         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3430
3431         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3432             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3433         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3434             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3435         else
3436             h->poc_msb = h->prev_poc_msb;
3437 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3438         field_poc[0] =
3439         field_poc[1] = h->poc_msb + h->poc_lsb;
3440         if(s->picture_structure == PICT_FRAME)
3441             field_poc[1] += h->delta_poc_bottom;
3442     }else if(h->sps.poc_type==1){
3443         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3444         int i;
3445
3446         if(h->sps.poc_cycle_length != 0)
3447             abs_frame_num = h->frame_num_offset + h->frame_num;
3448         else
3449             abs_frame_num = 0;
3450
3451         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3452             abs_frame_num--;
3453
3454         expected_delta_per_poc_cycle = 0;
3455         for(i=0; i < h->sps.poc_cycle_length; i++)
3456             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3457
3458         if(abs_frame_num > 0){
3459             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3460             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3461
3462             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3463             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3464                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3465         } else
3466             expectedpoc = 0;
3467
3468         if(h->nal_ref_idc == 0)
3469             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3470
3471         field_poc[0] = expectedpoc + h->delta_poc[0];
3472         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3473
3474         if(s->picture_structure == PICT_FRAME)
3475             field_poc[1] += h->delta_poc[1];
3476     }else{
3477         int poc= 2*(h->frame_num_offset + h->frame_num);
3478
3479         if(!h->nal_ref_idc)
3480             poc--;
3481
3482         field_poc[0]= poc;
3483         field_poc[1]= poc;
3484     }
3485
3486     if(s->picture_structure != PICT_BOTTOM_FIELD)
3487         s->current_picture_ptr->field_poc[0]= field_poc[0];
3488     if(s->picture_structure != PICT_TOP_FIELD)
3489         s->current_picture_ptr->field_poc[1]= field_poc[1];
3490     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3491
3492     return 0;
3493 }
3494
3495
3496 /**
3497  * initialize scan tables
3498  */
3499 static void init_scan_tables(H264Context *h){
3500     MpegEncContext * const s = &h->s;
3501     int i;
3502     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3503         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3504         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3505     }else{
3506         for(i=0; i<16; i++){
3507 #define T(x) (x>>2) | ((x<<2) & 0xF)
3508             h->zigzag_scan[i] = T(zigzag_scan[i]);
3509             h-> field_scan[i] = T( field_scan[i]);
3510 #undef T
3511         }
3512     }
3513     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3514         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3515         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3516         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3517         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3518     }else{
3519         for(i=0; i<64; i++){
3520 #define T(x) (x>>3) | ((x&7)<<3)
3521             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3522             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3523             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3524             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3525 #undef T
3526         }
3527     }
3528     if(h->sps.transform_bypass){ //FIXME same ugly
3529         h->zigzag_scan_q0          = zigzag_scan;
3530         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3531         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3532         h->field_scan_q0           = field_scan;
3533         h->field_scan8x8_q0        = field_scan8x8;
3534         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3535     }else{
3536         h->zigzag_scan_q0          = h->zigzag_scan;
3537         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3538         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3539         h->field_scan_q0           = h->field_scan;
3540         h->field_scan8x8_q0        = h->field_scan8x8;
3541         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3542     }
3543 }
3544
3545 /**
3546  * Replicates H264 "master" context to thread contexts.
3547  */
3548 static void clone_slice(H264Context *dst, H264Context *src)
3549 {
3550     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3551     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3552     dst->s.current_picture      = src->s.current_picture;
3553     dst->s.linesize             = src->s.linesize;
3554     dst->s.uvlinesize           = src->s.uvlinesize;
3555     dst->s.first_field          = src->s.first_field;
3556
3557     dst->prev_poc_msb           = src->prev_poc_msb;
3558     dst->prev_poc_lsb           = src->prev_poc_lsb;
3559     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3560     dst->prev_frame_num         = src->prev_frame_num;
3561     dst->short_ref_count        = src->short_ref_count;
3562
3563     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3564     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3565     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3566     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3567
3568     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3569     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3570 }
3571
3572 /**
3573  * decodes a slice header.
3574  * This will also call MPV_common_init() and frame_start() as needed.
3575  *
3576  * @param h h264context
3577  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3578  *
3579  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3580  */
3581 static int decode_slice_header(H264Context *h, H264Context *h0){
3582     MpegEncContext * const s = &h->s;
3583     MpegEncContext * const s0 = &h0->s;
3584     unsigned int first_mb_in_slice;
3585     unsigned int pps_id;
3586     int num_ref_idx_active_override_flag;
3587     unsigned int slice_type, tmp, i, j;
3588     int default_ref_list_done = 0;
3589     int last_pic_structure;
3590
3591     s->dropable= h->nal_ref_idc == 0;
3592
3593     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3594         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3595         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3596     }else{
3597         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3598         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3599     }
3600
3601     first_mb_in_slice= get_ue_golomb(&s->gb);
3602
3603     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3604         h0->current_slice = 0;
3605         if (!s0->first_field)
3606             s->current_picture_ptr= NULL;
3607     }
3608
3609     slice_type= get_ue_golomb(&s->gb);
3610     if(slice_type > 9){
3611         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3612         return -1;
3613     }
3614     if(slice_type > 4){
3615         slice_type -= 5;
3616         h->slice_type_fixed=1;
3617     }else
3618         h->slice_type_fixed=0;
3619
3620     slice_type= golomb_to_pict_type[ slice_type ];
3621     if (slice_type == FF_I_TYPE
3622         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3623         default_ref_list_done = 1;
3624     }
3625     h->slice_type= slice_type;
3626     h->slice_type_nos= slice_type & 3;
3627
3628     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3629     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3630         av_log(h->s.avctx, AV_LOG_ERROR,
3631                "B picture before any references, skipping\n");
3632         return -1;
3633     }
3634
3635     pps_id= get_ue_golomb(&s->gb);
3636     if(pps_id>=MAX_PPS_COUNT){
3637         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3638         return -1;
3639     }
3640     if(!h0->pps_buffers[pps_id]) {
3641         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3642         return -1;
3643     }
3644     h->pps= *h0->pps_buffers[pps_id];
3645
3646     if(!h0->sps_buffers[h->pps.sps_id]) {
3647         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3648         return -1;
3649     }
3650     h->sps = *h0->sps_buffers[h->pps.sps_id];
3651
3652     if(h == h0 && h->dequant_coeff_pps != pps_id){
3653         h->dequant_coeff_pps = pps_id;
3654         init_dequant_tables(h);
3655     }
3656
3657     s->mb_width= h->sps.mb_width;
3658     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3659
3660     h->b_stride=  s->mb_width*4;
3661     h->b8_stride= s->mb_width*2;
3662
3663     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3664     if(h->sps.frame_mbs_only_flag)
3665         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3666     else
3667         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3668
3669     if (s->context_initialized
3670         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3671         if(h != h0)
3672             return -1;   // width / height changed during parallelized decoding
3673         free_tables(h);
3674         flush_dpb(s->avctx);
3675         MPV_common_end(s);
3676     }
3677     if (!s->context_initialized) {
3678         if(h != h0)
3679             return -1;  // we cant (re-)initialize context during parallel decoding
3680         if (MPV_common_init(s) < 0)
3681             return -1;
3682         s->first_field = 0;
3683
3684         init_scan_tables(h);
3685         alloc_tables(h);
3686
3687         for(i = 1; i < s->avctx->thread_count; i++) {
3688             H264Context *c;
3689             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3690             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3691             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3692             c->sps = h->sps;
3693             c->pps = h->pps;
3694             init_scan_tables(c);
3695             clone_tables(c, h);
3696         }
3697
3698         for(i = 0; i < s->avctx->thread_count; i++)
3699             if(context_init(h->thread_context[i]) < 0)
3700                 return -1;
3701
3702         s->avctx->width = s->width;
3703         s->avctx->height = s->height;
3704         s->avctx->sample_aspect_ratio= h->sps.sar;
3705         if(!s->avctx->sample_aspect_ratio.den)
3706             s->avctx->sample_aspect_ratio.den = 1;
3707
3708         if(h->sps.timing_info_present_flag){
3709             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3710             if(h->x264_build > 0 && h->x264_build < 44)
3711                 s->avctx->time_base.den *= 2;
3712             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3713                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3714         }
3715     }
3716
3717     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3718
3719     h->mb_mbaff = 0;
3720     h->mb_aff_frame = 0;
3721     last_pic_structure = s0->picture_structure;
3722     if(h->sps.frame_mbs_only_flag){
3723         s->picture_structure= PICT_FRAME;
3724     }else{
3725         if(get_bits1(&s->gb)) { //field_pic_flag
3726             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3727         } else {
3728             s->picture_structure= PICT_FRAME;
3729             h->mb_aff_frame = h->sps.mb_aff;
3730         }
3731     }
3732     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3733
3734     if(h0->current_slice == 0){
3735         while(h->frame_num !=  h->prev_frame_num &&
3736               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3737             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3738             frame_start(h);
3739             h->prev_frame_num++;
3740             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3741             s->current_picture_ptr->frame_num= h->prev_frame_num;
3742             execute_ref_pic_marking(h, NULL, 0);
3743         }
3744
3745         /* See if we have a decoded first field looking for a pair... */
3746         if (s0->first_field) {
3747             assert(s0->current_picture_ptr);
3748             assert(s0->current_picture_ptr->data[0]);
3749             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3750
3751             /* figure out if we have a complementary field pair */
3752             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3753                 /*
3754                  * Previous field is unmatched. Don't display it, but let it
3755                  * remain for reference if marked as such.
3756                  */
3757                 s0->current_picture_ptr = NULL;
3758                 s0->first_field = FIELD_PICTURE;
3759
3760             } else {
3761                 if (h->nal_ref_idc &&
3762                         s0->current_picture_ptr->reference &&
3763                         s0->current_picture_ptr->frame_num != h->frame_num) {
3764                     /*
3765                      * This and previous field were reference, but had
3766                      * different frame_nums. Consider this field first in
3767                      * pair. Throw away previous field except for reference
3768                      * purposes.
3769                      */
3770                     s0->first_field = 1;
3771                     s0->current_picture_ptr = NULL;
3772
3773                 } else {
3774                     /* Second field in complementary pair */
3775                     s0->first_field = 0;
3776                 }
3777             }
3778
3779         } else {
3780             /* Frame or first field in a potentially complementary pair */
3781             assert(!s0->current_picture_ptr);
3782             s0->first_field = FIELD_PICTURE;
3783         }
3784
3785         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3786             s0->first_field = 0;
3787             return -1;
3788         }
3789     }
3790     if(h != h0)
3791         clone_slice(h, h0);
3792
3793     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3794
3795     assert(s->mb_num == s->mb_width * s->mb_height);
3796     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3797        first_mb_in_slice                    >= s->mb_num){
3798         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3799         return -1;
3800     }
3801     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3802     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3803     if (s->picture_structure == PICT_BOTTOM_FIELD)
3804         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3805     assert(s->mb_y < s->mb_height);
3806
3807     if(s->picture_structure==PICT_FRAME){
3808         h->curr_pic_num=   h->frame_num;
3809         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3810     }else{
3811         h->curr_pic_num= 2*h->frame_num + 1;
3812         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3813     }
3814
3815     if(h->nal_unit_type == NAL_IDR_SLICE){
3816         get_ue_golomb(&s->gb); /* idr_pic_id */
3817     }
3818
3819     if(h->sps.poc_type==0){
3820         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3821
3822         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3823             h->delta_poc_bottom= get_se_golomb(&s->gb);
3824         }
3825     }
3826
3827     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3828         h->delta_poc[0]= get_se_golomb(&s->gb);
3829
3830         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3831             h->delta_poc[1]= get_se_golomb(&s->gb);
3832     }
3833
3834     init_poc(h);
3835
3836     if(h->pps.redundant_pic_cnt_present){
3837         h->redundant_pic_count= get_ue_golomb(&s->gb);
3838     }
3839
3840     //set defaults, might be overridden a few lines later
3841     h->ref_count[0]= h->pps.ref_count[0];
3842     h->ref_count[1]= h->pps.ref_count[1];
3843
3844     if(h->slice_type_nos != FF_I_TYPE){
3845         if(h->slice_type_nos == FF_B_TYPE){
3846             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3847         }
3848         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3849
3850         if(num_ref_idx_active_override_flag){
3851             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3852             if(h->slice_type_nos==FF_B_TYPE)
3853                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3854
3855             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3856                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3857                 h->ref_count[0]= h->ref_count[1]= 1;
3858                 return -1;
3859             }
3860         }
3861         if(h->slice_type_nos == FF_B_TYPE)
3862             h->list_count= 2;
3863         else
3864             h->list_count= 1;
3865     }else
3866         h->list_count= 0;
3867
3868     if(!default_ref_list_done){
3869         fill_default_ref_list(h);
3870     }
3871
3872     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3873         return -1;
3874
3875     if(h->slice_type_nos!=FF_I_TYPE){
3876         s->last_picture_ptr= &h->ref_list[0][0];
3877         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3878     }
3879     if(h->slice_type_nos==FF_B_TYPE){
3880         s->next_picture_ptr= &h->ref_list[1][0];
3881         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3882     }
3883
3884     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3885        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3886         pred_weight_table(h);
3887     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3888         implicit_weight_table(h);
3889     else
3890         h->use_weight = 0;
3891
3892     if(h->nal_ref_idc)
3893         decode_ref_pic_marking(h0, &s->gb);
3894
3895     if(FRAME_MBAFF)
3896         fill_mbaff_ref_list(h);
3897
3898     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3899         direct_dist_scale_factor(h);
3900     direct_ref_list_init(h);
3901
3902     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3903         tmp = get_ue_golomb(&s->gb);
3904         if(tmp > 2){
3905             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3906             return -1;
3907         }
3908         h->cabac_init_idc= tmp;
3909     }
3910
3911     h->last_qscale_diff = 0;
3912     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3913     if(tmp>51){
3914         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3915         return -1;
3916     }
3917     s->qscale= tmp;
3918     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3919     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3920     //FIXME qscale / qp ... stuff
3921     if(h->slice_type == FF_SP_TYPE){
3922         get_bits1(&s->gb); /* sp_for_switch_flag */
3923     }
3924     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3925         get_se_golomb(&s->gb); /* slice_qs_delta */
3926     }
3927
3928     h->deblocking_filter = 1;
3929     h->slice_alpha_c0_offset = 0;
3930     h->slice_beta_offset = 0;
3931     if( h->pps.deblocking_filter_parameters_present ) {
3932         tmp= get_ue_golomb(&s->gb);
3933         if(tmp > 2){
3934             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3935             return -1;
3936         }
3937         h->deblocking_filter= tmp;
3938         if(h->deblocking_filter < 2)
3939             h->deblocking_filter^= 1; // 1<->0
3940
3941         if( h->deblocking_filter ) {
3942             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3943             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3944         }
3945     }
3946
3947     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3948        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3949        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3950        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3951         h->deblocking_filter= 0;
3952
3953     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3954         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3955             /* Cheat slightly for speed:
3956                Do not bother to deblock across slices. */
3957             h->deblocking_filter = 2;
3958         } else {
3959             h0->max_contexts = 1;
3960             if(!h0->single_decode_warning) {
3961                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3962                 h0->single_decode_warning = 1;
3963             }
3964             if(h != h0)
3965                 return 1; // deblocking switched inside frame
3966         }
3967     }
3968
3969 #if 0 //FMO
3970     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3971         slice_group_change_cycle= get_bits(&s->gb, ?);
3972 #endif
3973
3974     h0->last_slice_type = slice_type;
3975     h->slice_num = ++h0->current_slice;
3976     if(h->slice_num >= MAX_SLICES){
3977         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3978     }
3979
3980     for(j=0; j<2; j++){
3981         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3982         ref2frm[0]=
3983         ref2frm[1]= -1;
3984         for(i=0; i<16; i++)
3985             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3986                           +(h->ref_list[j][i].reference&3);
3987         ref2frm[18+0]=
3988         ref2frm[18+1]= -1;
3989         for(i=16; i<48; i++)
3990             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3991                           +(h->ref_list[j][i].reference&3);
3992     }
3993
3994     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3995     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3996
3997     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3998         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3999                h->slice_num,
4000                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4001                first_mb_in_slice,
4002                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4003                pps_id, h->frame_num,
4004                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4005                h->ref_count[0], h->ref_count[1],
4006                s->qscale,
4007                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4008                h->use_weight,
4009                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4010                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4011                );
4012     }
4013
4014     return 0;
4015 }
4016
4017 /**
4018  *
4019  */
4020 static inline int get_level_prefix(GetBitContext *gb){
4021     unsigned int buf;
4022     int log;
4023
4024     OPEN_READER(re, gb);
4025     UPDATE_CACHE(re, gb);
4026     buf=GET_CACHE(re, gb);
4027
4028     log= 32 - av_log2(buf);
4029 #ifdef TRACE
4030     print_bin(buf>>(32-log), log);
4031     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4032 #endif
4033
4034     LAST_SKIP_BITS(re, gb, log);
4035     CLOSE_READER(re, gb);
4036
4037     return log-1;
4038 }
4039
4040 static inline int get_dct8x8_allowed(H264Context *h){
4041     int i;
4042     for(i=0; i<4; i++){
4043         if(!IS_SUB_8X8(h->sub_mb_type[i])
4044            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4045             return 0;
4046     }
4047     return 1;
4048 }
4049
4050 /**
4051  * decodes a residual block.
4052  * @param n block index
4053  * @param scantable scantable
4054  * @param max_coeff number of coefficients in the block
4055  * @return <0 if an error occurred
4056  */
4057 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4058     MpegEncContext * const s = &h->s;
4059     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4060     int level[16];
4061     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4062
4063     //FIXME put trailing_onex into the context
4064
4065     if(n == CHROMA_DC_BLOCK_INDEX){
4066         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4067         total_coeff= coeff_token>>2;
4068     }else{
4069         if(n == LUMA_DC_BLOCK_INDEX){
4070             total_coeff= pred_non_zero_count(h, 0);
4071             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4072             total_coeff= coeff_token>>2;
4073         }else{
4074             total_coeff= pred_non_zero_count(h, n);
4075             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4076             total_coeff= coeff_token>>2;
4077             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4078         }
4079     }
4080
4081     //FIXME set last_non_zero?
4082
4083     if(total_coeff==0)
4084         return 0;
4085     if(total_coeff > (unsigned)max_coeff) {
4086         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4087         return -1;
4088     }
4089
4090     trailing_ones= coeff_token&3;
4091     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4092     assert(total_coeff<=16);
4093
4094     i = show_bits(gb, 3);
4095     skip_bits(gb, trailing_ones);
4096     level[0] = 1-((i&4)>>1);
4097     level[1] = 1-((i&2)   );
4098     level[2] = 1-((i&1)<<1);
4099
4100     if(trailing_ones<total_coeff) {
4101         int level_code, mask;
4102         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4103         int prefix= get_level_prefix(gb);
4104
4105         //first coefficient has suffix_length equal to 0 or 1
4106         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4107             if(suffix_length)
4108                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4109             else
4110                 level_code= (prefix<<suffix_length); //part
4111         }else if(prefix==14){
4112             if(suffix_length)
4113                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4114             else
4115                 level_code= prefix + get_bits(gb, 4); //part
4116         }else{
4117             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4118             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4119             if(prefix>=16)
4120                 level_code += (1<<(prefix-3))-4096;
4121         }
4122
4123         if(trailing_ones < 3) level_code += 2;
4124
4125         suffix_length = 1;
4126         if(level_code > 5)
4127             suffix_length++;
4128         mask= -(level_code&1);
4129         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4130
4131         //remaining coefficients have suffix_length > 0
4132         for(i=trailing_ones+1;i<total_coeff;i++) {
4133             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4134             prefix = get_level_prefix(gb);
4135             if(prefix<15){
4136                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4137             }else{
4138                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4139                 if(prefix>=16)
4140                     level_code += (1<<(prefix-3))-4096;
4141             }
4142             mask= -(level_code&1);
4143             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4144             if(level_code > suffix_limit[suffix_length])
4145                 suffix_length++;
4146         }
4147     }
4148
4149     if(total_coeff == max_coeff)
4150         zeros_left=0;
4151     else{
4152         if(n == CHROMA_DC_BLOCK_INDEX)
4153             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4154         else
4155             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4156     }
4157
4158     coeff_num = zeros_left + total_coeff - 1;
4159     j = scantable[coeff_num];
4160     if(n > 24){
4161         block[j] = level[0];
4162         for(i=1;i<total_coeff;i++) {
4163             if(zeros_left <= 0)
4164                 run_before = 0;
4165             else if(zeros_left < 7){
4166                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4167             }else{
4168                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4169             }
4170             zeros_left -= run_before;
4171             coeff_num -= 1 + run_before;
4172             j= scantable[ coeff_num ];
4173
4174             block[j]= level[i];
4175         }
4176     }else{
4177         block[j] = (level[0] * qmul[j] + 32)>>6;
4178         for(i=1;i<total_coeff;i++) {
4179             if(zeros_left <= 0)
4180                 run_before = 0;
4181             else if(zeros_left < 7){
4182                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4183             }else{
4184                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4185             }
4186             zeros_left -= run_before;
4187             coeff_num -= 1 + run_before;
4188             j= scantable[ coeff_num ];
4189
4190             block[j]= (level[i] * qmul[j] + 32)>>6;
4191         }
4192     }
4193
4194     if(zeros_left<0){
4195         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4196         return -1;
4197     }
4198
4199     return 0;
4200 }
4201
4202 static void predict_field_decoding_flag(H264Context *h){
4203     MpegEncContext * const s = &h->s;
4204     const int mb_xy= h->mb_xy;
4205     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4206                 ? s->current_picture.mb_type[mb_xy-1]
4207                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4208                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4209                 : 0;
4210     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4211 }
4212
4213 /**
4214  * decodes a P_SKIP or B_SKIP macroblock
4215  */
4216 static void decode_mb_skip(H264Context *h){
4217     MpegEncContext * const s = &h->s;
4218     const int mb_xy= h->mb_xy;
4219     int mb_type=0;
4220
4221     memset(h->non_zero_count[mb_xy], 0, 16);
4222     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4223
4224     if(MB_FIELD)
4225         mb_type|= MB_TYPE_INTERLACED;
4226
4227     if( h->slice_type_nos == FF_B_TYPE )
4228     {
4229         // just for fill_caches. pred_direct_motion will set the real mb_type
4230         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4231
4232         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4233         pred_direct_motion(h, &mb_type);
4234         mb_type|= MB_TYPE_SKIP;
4235     }
4236     else
4237     {
4238         int mx, my;
4239         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4240
4241         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4242         pred_pskip_motion(h, &mx, &my);
4243         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4244         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4245     }
4246
4247     write_back_motion(h, mb_type);
4248     s->current_picture.mb_type[mb_xy]= mb_type;
4249     s->current_picture.qscale_table[mb_xy]= s->qscale;
4250     h->slice_table[ mb_xy ]= h->slice_num;
4251     h->prev_mb_skipped= 1;
4252 }
4253
/**
 * Decodes one macroblock from the CAVLC (Exp-Golomb / VLC) coded bitstream.
 *
 * Handles the skip run, the macroblock type, intra prediction modes or
 * inter reference indices and motion vectors, the coded block pattern,
 * the quantizer delta and finally the residual coefficients.
 *
 * @param h decoder context; the macroblock position is taken from
 *          h->s.mb_x / h->s.mb_y
 * @returns 0 if OK, -1 if an error is noticed in the bitstream
 */
static int decode_mb_cavlc(H264Context *h){
    MpegEncContext * const s = &h->s;
    int mb_xy;
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

    s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)

    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                down the code */
    // Skip runs are only parsed in non-intra slices.
    if(h->slice_type_nos != FF_I_TYPE){
        // -1 marks that no skip run is pending, so a new run length is read
        if(s->mb_skip_run==-1)
            s->mb_skip_run= get_ue_golomb(&s->gb);

        // a non-zero run means the current macroblock is skipped
        if (s->mb_skip_run--) {
            if(FRAME_MBAFF && (s->mb_y&1) == 0){
                // for the MB at an even row: the field flag is read when this
                // was the last skipped MB of the run, otherwise it is predicted
                if(s->mb_skip_run==0)
                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
                else
                    predict_field_decoding_flag(h);
            }
            decode_mb_skip(h);
            return 0;
        }
    }
    if(FRAME_MBAFF){
        // the field decoding flag is only read for MBs at an even row
        if( (s->mb_y&1) == 0 )
            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
    }

    h->prev_mb_skipped= 0;

    // Map the coded mb_type index to internal type flags. In B and P slices
    // indices past the inter range (23 resp. 5 entries) select an intra type.
    mb_type= get_ue_golomb(&s->gb);
    if(h->slice_type_nos == FF_B_TYPE){
        if(mb_type < 23){
            partition_count= b_mb_type_info[mb_type].partition_count;
            mb_type=         b_mb_type_info[mb_type].type;
        }else{
            mb_type -= 23;
            goto decode_intra_mb;
        }
    }else if(h->slice_type_nos == FF_P_TYPE){
        if(mb_type < 5){
            partition_count= p_mb_type_info[mb_type].partition_count;
            mb_type=         p_mb_type_info[mb_type].type;
        }else{
            mb_type -= 5;
            goto decode_intra_mb;
        }
    }else{
       assert(h->slice_type_nos == FF_I_TYPE);
        // in SI slices every non-zero index is shifted down by one
        if(h->slice_type == FF_SI_TYPE && mb_type)
            mb_type--;
decode_intra_mb:
        // i_mb_type_info is indexed with values 0..25 only
        if(mb_type > 25){
            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
            return -1;
        }
        partition_count=0;
        cbp= i_mb_type_info[mb_type].cbp;
        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
        mb_type= i_mb_type_info[mb_type].type;
    }

    if(MB_FIELD)
        mb_type |= MB_TYPE_INTERLACED;

    h->slice_table[ mb_xy ]= h->slice_num;

    if(IS_INTRA_PCM(mb_type)){
        unsigned int x;

        // We assume these blocks are very rare so we do not optimize it.
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
        // 384 bytes are read when CHROMA is set, 256 bytes otherwise.
        for(x=0; x < (CHROMA ? 384 : 256); x++){
            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
        }

        // In deblocking, the quantizer is 0
        s->current_picture.qscale_table[mb_xy]= 0;
        // All coeffs are present
        memset(h->non_zero_count[mb_xy], 16, 16);

        s->current_picture.mb_type[mb_xy]= mb_type;
        return 0;
    }

    // The reference counts are doubled while an MBAFF macroblock is parsed
    // and halved again just before the final return 0.
    // NOTE(review): the "return -1" error paths below leave ref_count doubled;
    // presumably the caller stops decoding the slice in that case -- confirm.
    if(MB_MBAFF){
        h->ref_count[0] <<= 1;
        h->ref_count[1] <<= 1;
    }

    fill_caches(h, mb_type, 0);

    //mb_pred
    if(IS_INTRA(mb_type)){
        int pred_mode;
//            init_top_left_availability(h);
        if(IS_INTRA4x4(mb_type)){
            int i;
            int di = 1;
            // with the 8x8 transform one prediction mode is coded per 8x8
            // block, so the loop below advances 4 blocks at a time
            if(dct8x8_allowed && get_bits1(&s->gb)){
                mb_type |= MB_TYPE_8x8DCT;
                di = 4;
            }

//                fill_intra4x4_pred_table(h);
            for(i=0; i<16; i+=di){
                int mode= pred_intra_mode(h, i);

                // a zero bit means the predicted mode is not used; the 3 bit
                // remainder then selects one of the other modes
                if(!get_bits1(&s->gb)){
                    const int rem_mode= get_bits(&s->gb, 3);
                    mode = rem_mode + (rem_mode >= mode);
                }

                if(di==4)
                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
                else
                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
            }
            write_back_intra_pred_mode(h);
            if( check_intra4x4_pred_mode(h) < 0)
                return -1;
        }else{
            h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
            if(h->intra16x16_pred_mode < 0)
                return -1;
        }
        if(CHROMA){
            pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
            if(pred_mode < 0)
                return -1;
            h->chroma_pred_mode= pred_mode;
        }
    }else if(partition_count==4){
        // four 8x8 partitions, each with its own sub macroblock type
        int i, j, sub_partition_count[4], list, ref[2][4];

        if(h->slice_type_nos == FF_B_TYPE){
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=13){
                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
                pred_direct_motion(h, &mb_type);
                h->ref_cache[0][scan8[4]] =
                h->ref_cache[1][scan8[4]] =
                h->ref_cache[0][scan8[12]] =
                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
            }
        }else{
            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=4){
                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
        }

        // first pass: all reference indices, list by list
        for(list=0; list<h->list_count; list++){
            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
                    if(tmp>=ref_count){
                        av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
                        return -1;
                    }
                    ref[list][i]= tmp;
                }else{
                 //FIXME
                    ref[list][i] = -1;
                }
            }
        }

        if(dct8x8_allowed)
            dct8x8_allowed = get_dct8x8_allowed(h);

        // second pass: motion vector differences, added to the prediction
        for(list=0; list<h->list_count; list++){
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) {
                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
                    continue;
                }
                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];

                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    const int sub_mb_type= h->sub_mb_type[i];
                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                    for(j=0; j<sub_partition_count[i]; j++){
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        // replicate the vector over the cache entries that
                        // the sub partition covers
                        if(IS_SUB_8X8(sub_mb_type)){
                            mv_cache[ 1 ][0]=
                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
                            mv_cache[ 1 ][1]=
                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
                        }else if(IS_SUB_8X4(sub_mb_type)){
                            mv_cache[ 1 ][0]= mx;
                            mv_cache[ 1 ][1]= my;
                        }else if(IS_SUB_4X8(sub_mb_type)){
                            mv_cache[ 8 ][0]= mx;
                            mv_cache[ 8 ][1]= my;
                        }
                        mv_cache[ 0 ][0]= mx;
                        mv_cache[ 0 ][1]= my;
                    }
                }else{
                    // this list is not used by the partition: zero its 4 vectors
                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
                    p[0] = p[1]=
                    p[8] = p[9]= 0;
                }
            }
        }
    }else if(IS_DIRECT(mb_type)){
        pred_direct_motion(h, &mb_type);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    }else{
        // 16x16, 16x8 or 8x16 partitioning: for each shape all reference
        // indices are read first, then all motion vector differences
        int list, mx, my, i;
         //FIXME we should set ref_idx_l? to 0 if we use that later ...
        if(IS_16X16(mb_type)){
            for(list=0; list<h->list_count; list++){
                    unsigned int val;
                    if(IS_DIR(mb_type, 0, list)){
                        val= get_te0_golomb(&s->gb, h->ref_count[list]);
                        if(val >= h->ref_count[list]){
                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                            return -1;
                        }
                    }else
                        val= LIST_NOT_USED&0xFF;
                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
            }
            for(list=0; list<h->list_count; list++){
                unsigned int val;
                if(IS_DIR(mb_type, 0, list)){
                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
                    mx += get_se_golomb(&s->gb);
                    my += get_se_golomb(&s->gb);
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    val= pack16to32(mx,my);
                }else
                    val=0;
                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
            }
        }
        else if(IS_16X8(mb_type)){
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
                }
            }
        }else{
            assert(IS_8X16(mb_type));
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
                }
            }
        }
    }

    if(IS_INTER(mb_type))
        write_back_motion(h, mb_type);

    // Intra16x16 macroblocks took their cbp from i_mb_type_info above;
    // all other types read a code that is mapped through a lookup table.
    if(!IS_INTRA16x16(mb_type)){
        cbp= get_ue_golomb(&s->gb);
        if(cbp > 47){
            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
            return -1;
        }

        if(CHROMA){
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
            else                     cbp= golomb_to_inter_cbp   [cbp];
        }else{
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
            else                     cbp= golomb_to_inter_cbp_gray[cbp];
        }
    }
    h->cbp = cbp;

    // for inter macroblocks with coded luma the transform size flag follows
    // the cbp (intra 4x4 macroblocks read it before the prediction modes)
    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
        if(get_bits1(&s->gb)){
            mb_type |= MB_TYPE_8x8DCT;
            h->cbp_table[mb_xy]= cbp;
        }
    }
    s->current_picture.mb_type[mb_xy]= mb_type;

    if(cbp || IS_INTRA16x16(mb_type)){
        int i8x8, i4x4, chroma_idx;
        int dquant;
        // residual data is read through the intra or the inter bit reader
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;

//        fill_non_zero_count_cache(h);

        // pick field or zigzag scan tables; the *_q0 variants are used when
        // the quantizer (before dquant is applied) is 0
        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);

        if( dquant > 25 || dquant < -26 ){
            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
            return -1;
        }

        // apply the delta and wrap the quantizer back into 0..51
        s->qscale += dquant;
        if(((unsigned)s->qscale) > 51){
            if(s->qscale<0) s->qscale+= 52;
            else            s->qscale-= 52;
        }

        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            // luma DC block first, then (if coded) 15 AC coefficients per 4x4 block
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

            assert((cbp&15) == 0 || (cbp&15) == 15);

            if(cbp&15){
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
                }
            }else{
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
            }
        }else{
            // one cbp bit per 8x8 luma block
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
                        // an 8x8 block is read as 4 calls of 16 coefficients
                        // into one buffer; the 4 counts are summed into nnz[0]
                        DCTELEM *buf = &h->mb[64*i8x8];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                return -1;
                        }
                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                    }else{
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
                    }
                }else{
                    // 8x8 block not coded: clear its 4 non zero counts
                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
                }
            }
        }

        // cbp bits 4 and 5: either one set means chroma DC is coded
        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }

        // cbp bit 5: chroma AC is coded as well
        if(cbp&0x20){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
            }
        }else{
            uint8_t * const nnz= &h->non_zero_count_cache[0];
            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
        }
    }else{
        // no residual at all: clear the luma and chroma non zero counts
        uint8_t * const nnz= &h->non_zero_count_cache[0];
        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
    }
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    write_back_non_zero_count(h);

    // undo the doubling of the reference counts done before fill_caches()
    if(MB_MBAFF){
        h->ref_count[0] >>= 1;
        h->ref_count[1] >>= 1;
    }

    return 0;
}
4740
4741 static int decode_cabac_field_decoding_flag(H264Context *h) {
4742     MpegEncContext * const s = &h->s;
4743     const int mb_x = s->mb_x;
4744     const int mb_y = s->mb_y & ~1;
4745     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4746     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4747
4748     unsigned int ctx = 0;
4749
4750     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4751         ctx += 1;
4752     }
4753     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4754         ctx += 1;
4755     }
4756
4757     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4758 }
4759
4760 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4761     uint8_t *state= &h->cabac_state[ctx_base];
4762     int mb_type;
4763
4764     if(intra_slice){
4765         MpegEncContext * const s = &h->s;
4766         const int mba_xy = h->left_mb_xy[0];
4767         const int mbb_xy = h->top_mb_xy;
4768         int ctx=0;
4769         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4770             ctx++;
4771         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4772             ctx++;
4773         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4774             return 0;   /* I4x4 */
4775         state += 2;
4776     }else{
4777         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4778             return 0;   /* I4x4 */
4779     }
4780
4781     if( get_cabac_terminate( &h->cabac ) )
4782         return 25;  /* PCM */
4783
4784     mb_type = 1; /* I16x16 */
4785     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4786     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4787         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4788     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4789     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4790     return mb_type;
4791 }
4792
4793 static int decode_cabac_mb_type( H264Context *h ) {
4794     MpegEncContext * const s = &h->s;
4795
4796     if( h->slice_type_nos == FF_I_TYPE ) {
4797         return decode_cabac_intra_mb_type(h, 3, 1);
4798     } else if( h->slice_type_nos == FF_P_TYPE ) {
4799         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4800             /* P-type */
4801             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4802                 /* P_L0_D16x16, P_8x8 */
4803                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4804             } else {
4805                 /* P_L0_D8x16, P_L0_D16x8 */
4806                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4807             }
4808         } else {
4809             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4810         }
4811     } else {
4812         const int mba_xy = h->left_mb_xy[0];
4813         const int mbb_xy = h->top_mb_xy;
4814         int ctx = 0;
4815         int bits;
4816         assert(h->slice_type_nos == FF_B_TYPE);
4817
4818         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4819             ctx++;
4820         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4821             ctx++;
4822
4823         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4824             return 0; /* B_Direct_16x16 */
4825
4826         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4827             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4828         }
4829
4830         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4831         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4832         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4833         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4834         if( bits < 8 )
4835             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4836         else if( bits == 13 ) {
4837             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4838         } else if( bits == 14 )
4839             return 11; /* B_L1_L0_8x16 */
4840         else if( bits == 15 )
4841             return 22; /* B_8x8 */
4842
4843         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4844         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4845     }
4846 }
4847
4848 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4849     MpegEncContext * const s = &h->s;
4850     int mba_xy, mbb_xy;
4851     int ctx = 0;
4852
4853     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4854         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4855         mba_xy = mb_xy - 1;
4856         if( (mb_y&1)
4857             && h->slice_table[mba_xy] == h->slice_num
4858             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4859             mba_xy += s->mb_stride;
4860         if( MB_FIELD ){
4861             mbb_xy = mb_xy - s->mb_stride;
4862             if( !(mb_y&1)
4863                 && h->slice_table[mbb_xy] == h->slice_num
4864                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4865                 mbb_xy -= s->mb_stride;
4866         }else
4867             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4868     }else{
4869         int mb_xy = h->mb_xy;
4870         mba_xy = mb_xy - 1;
4871         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4872     }
4873
4874     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4875         ctx++;
4876     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4877         ctx++;
4878
4879     if( h->slice_type_nos == FF_B_TYPE )
4880         ctx += 13;
4881     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4882 }
4883
4884 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4885     int mode = 0;
4886
4887     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4888         return pred_mode;
4889
4890     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4891     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4892     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4893
4894     if( mode >= pred_mode )
4895         return mode + 1;
4896     else
4897         return mode;
4898 }
4899
4900 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4901     const int mba_xy = h->left_mb_xy[0];
4902     const int mbb_xy = h->top_mb_xy;
4903
4904     int ctx = 0;
4905
4906     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4907     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4908         ctx++;
4909
4910     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4911         ctx++;
4912
4913     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4914         return 0;
4915
4916     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4917         return 1;
4918     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4919         return 2;
4920     else
4921         return 3;
4922 }
4923
4924 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4925     int cbp_b, cbp_a, ctx, cbp = 0;
4926
4927     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4928     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4929
4930     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4931     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4932     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4933     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4934     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4935     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4936     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4937     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4938     return cbp;
4939 }
4940 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4941     int ctx;
4942     int cbp_a, cbp_b;
4943
4944     cbp_a = (h->left_cbp>>4)&0x03;
4945     cbp_b = (h-> top_cbp>>4)&0x03;
4946
4947     ctx = 0;
4948     if( cbp_a > 0 ) ctx++;
4949     if( cbp_b > 0 ) ctx += 2;
4950     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4951         return 0;
4952
4953     ctx = 4;
4954     if( cbp_a == 2 ) ctx++;
4955     if( cbp_b == 2 ) ctx += 2;
4956     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4957 }
4958 static int decode_cabac_mb_dqp( H264Context *h) {
4959     int   ctx = 0;
4960     int   val = 0;
4961
4962     if( h->last_qscale_diff != 0 )
4963         ctx++;
4964
4965     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4966         if( ctx < 2 )
4967             ctx = 2;
4968         else
4969             ctx = 3;
4970         val++;
4971         if(val > 102) //prevent infinite loop
4972             return INT_MIN;
4973     }
4974
4975     if( val&0x01 )
4976         return (val + 1)/2;
4977     else
4978         return -(val + 1)/2;
4979 }
4980 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4981     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4982         return 0;   /* 8x8 */
4983     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4984         return 1;   /* 8x4 */
4985     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4986         return 2;   /* 4x8 */
4987     return 3;       /* 4x4 */
4988 }
4989 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4990     int type;
4991     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4992         return 0;   /* B_Direct_8x8 */
4993     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4994         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4995     type = 3;
4996     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4997         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4998             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4999         type += 4;
5000     }
5001     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5002     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5003     return type;
5004 }
5005
5006 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5007     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5008 }
5009
5010 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5011     int refa = h->ref_cache[list][scan8[n] - 1];
5012     int refb = h->ref_cache[list][scan8[n] - 8];
5013     int ref  = 0;
5014     int ctx  = 0;
5015
5016     if( h->slice_type_nos == FF_B_TYPE) {
5017         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5018             ctx++;
5019         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5020             ctx += 2;
5021     } else {
5022         if( refa > 0 )
5023             ctx++;
5024         if( refb > 0 )
5025             ctx += 2;
5026     }
5027
5028     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5029         ref++;
5030         if( ctx < 4 )
5031             ctx = 4;
5032         else
5033             ctx = 5;
5034         if(ref >= 32 /*h->ref_list[list]*/){
5035             return -1;
5036         }
5037     }
5038     return ref;
5039 }
5040
5041 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5042     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5043                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5044     int ctxbase = (l == 0) ? 40 : 47;
5045     int ctx, mvd;
5046
5047     if( amvd < 3 )
5048         ctx = 0;
5049     else if( amvd > 32 )
5050         ctx = 2;
5051     else
5052         ctx = 1;
5053
5054     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5055         return 0;
5056
5057     mvd= 1;
5058     ctx= 3;
5059     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5060         mvd++;
5061         if( ctx < 6 )
5062             ctx++;
5063     }
5064
5065     if( mvd >= 9 ) {
5066         int k = 3;
5067         while( get_cabac_bypass( &h->cabac ) ) {
5068             mvd += 1 << k;
5069             k++;
5070             if(k>24){
5071                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5072                 return INT_MIN;
5073             }
5074         }
5075         while( k-- ) {
5076             if( get_cabac_bypass( &h->cabac ) )
5077                 mvd += 1 << k;
5078         }
5079     }
5080     return get_cabac_bypass_sign( &h->cabac, -mvd );
5081 }
5082
5083 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5084     int nza, nzb;
5085     int ctx = 0;
5086
5087     if( is_dc ) {
5088         if( cat == 0 ) {
5089             nza = h->left_cbp&0x100;
5090             nzb = h-> top_cbp&0x100;
5091         } else {
5092             nza = (h->left_cbp>>(6+idx))&0x01;
5093             nzb = (h-> top_cbp>>(6+idx))&0x01;
5094         }
5095     } else {
5096         if( cat == 4 ) {
5097             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5098             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5099         } else {
5100             assert(cat == 1 || cat == 2);
5101             nza = h->non_zero_count_cache[scan8[idx] - 1];
5102             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5103         }
5104     }
5105
5106     if( nza > 0 )
5107         ctx++;
5108
5109     if( nzb > 0 )
5110         ctx += 2;
5111
5112     return ctx + 4 * cat;
5113 }
5114
5115 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5116     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5117     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5118     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5119     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5120 };
5121
5122 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5123     static const int significant_coeff_flag_offset[2][6] = {
5124       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5125       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5126     };
5127     static const int last_coeff_flag_offset[2][6] = {
5128       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5129       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5130     };
5131     static const int coeff_abs_level_m1_offset[6] = {
5132         227+0, 227+10, 227+20, 227+30, 227+39, 426
5133     };
5134     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5135       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5136         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5137         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5138        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5139       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5140         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5141         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5142         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5143     };
5144     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5145      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5146      * map node ctx => cabac ctx for level=1 */
5147     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5148     /* map node ctx => cabac ctx for level>1 */
5149     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5150     static const uint8_t coeff_abs_level_transition[2][8] = {
5151     /* update node ctx after decoding a level=1 */
5152         { 1, 2, 3, 3, 4, 5, 6, 7 },
5153     /* update node ctx after decoding a level>1 */
5154         { 4, 4, 4, 4, 5, 6, 7, 7 }
5155     };
5156
5157     int index[64];
5158
5159     int av_unused last;
5160     int coeff_count = 0;
5161     int node_ctx = 0;
5162
5163     uint8_t *significant_coeff_ctx_base;
5164     uint8_t *last_coeff_ctx_base;
5165     uint8_t *abs_level_m1_ctx_base;
5166
5167 #ifndef ARCH_X86
5168 #define CABAC_ON_STACK
5169 #endif
5170 #ifdef CABAC_ON_STACK
5171 #define CC &cc
5172     CABACContext cc;
5173     cc.range     = h->cabac.range;
5174     cc.low       = h->cabac.low;
5175     cc.bytestream= h->cabac.bytestream;
5176 #else
5177 #define CC &h->cabac
5178 #endif
5179
5180
5181     /* cat: 0-> DC 16x16  n = 0
5182      *      1-> AC 16x16  n = luma4x4idx
5183      *      2-> Luma4x4   n = luma4x4idx
5184      *      3-> DC Chroma n = iCbCr
5185      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5186      *      5-> Luma8x8   n = 4 * luma8x8idx
5187      */
5188
5189     /* read coded block flag */
5190     if( is_dc || cat != 5 ) {
5191         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5192             if( !is_dc ) {
5193                 if( cat == 4 )
5194                     h->non_zero_count_cache[scan8[16+n]] = 0;
5195                 else
5196                     h->non_zero_count_cache[scan8[n]] = 0;
5197             }
5198
5199 #ifdef CABAC_ON_STACK
5200             h->cabac.range     = cc.range     ;
5201             h->cabac.low       = cc.low       ;
5202             h->cabac.bytestream= cc.bytestream;
5203 #endif
5204             return;
5205         }
5206     }
5207
5208     significant_coeff_ctx_base = h->cabac_state
5209         + significant_coeff_flag_offset[MB_FIELD][cat];
5210     last_coeff_ctx_base = h->cabac_state
5211         + last_coeff_flag_offset[MB_FIELD][cat];
5212     abs_level_m1_ctx_base = h->cabac_state
5213         + coeff_abs_level_m1_offset[cat];
5214
5215     if( !is_dc && cat == 5 ) {
5216 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5217         for(last= 0; last < coefs; last++) { \
5218             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5219             if( get_cabac( CC, sig_ctx )) { \
5220                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5221                 index[coeff_count++] = last; \
5222                 if( get_cabac( CC, last_ctx ) ) { \
5223                     last= max_coeff; \
5224                     break; \
5225                 } \
5226             } \
5227         }\
5228         if( last == max_coeff -1 ) {\
5229             index[coeff_count++] = last;\
5230         }
5231         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5232 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5233         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5234     } else {
5235         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5236 #else
5237         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5238     } else {
5239         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5240 #endif
5241     }
5242     assert(coeff_count > 0);
5243
5244     if( is_dc ) {
5245         if( cat == 0 )
5246             h->cbp_table[h->mb_xy] |= 0x100;
5247         else
5248             h->cbp_table[h->mb_xy] |= 0x40 << n;
5249     } else {
5250         if( cat == 5 )
5251             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5252         else if( cat == 4 )
5253             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5254         else {
5255             assert( cat == 1 || cat == 2 );
5256             h->non_zero_count_cache[scan8[n]] = coeff_count;
5257         }
5258     }
5259
5260     do {
5261         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5262
5263         int j= scantable[index[--coeff_count]];
5264
5265         if( get_cabac( CC, ctx ) == 0 ) {
5266             node_ctx = coeff_abs_level_transition[0][node_ctx];
5267             if( is_dc ) {
5268                 block[j] = get_cabac_bypass_sign( CC, -1);
5269             }else{
5270                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5271             }
5272         } else {
5273             int coeff_abs = 2;
5274             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5275             node_ctx = coeff_abs_level_transition[1][node_ctx];
5276
5277             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5278                 coeff_abs++;
5279             }
5280
5281             if( coeff_abs >= 15 ) {
5282                 int j = 0;
5283                 while( get_cabac_bypass( CC ) ) {
5284                     j++;
5285                 }
5286
5287                 coeff_abs=1;
5288                 while( j-- ) {
5289                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5290                 }
5291                 coeff_abs+= 14;
5292             }
5293
5294             if( is_dc ) {
5295                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5296             }else{
5297                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5298             }
5299         }
5300     } while( coeff_count );
5301 #ifdef CABAC_ON_STACK
5302             h->cabac.range     = cc.range     ;
5303             h->cabac.low       = cc.low       ;
5304             h->cabac.bytestream= cc.bytestream;
5305 #endif
5306
5307 }
5308
5309 #ifndef CONFIG_SMALL
5310 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5311     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5312 }
5313
5314 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5315     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5316 }
5317 #endif
5318
5319 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5320 #ifdef CONFIG_SMALL
5321     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5322 #else
5323     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5324     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5325 #endif
5326 }
5327
5328 static inline void compute_mb_neighbors(H264Context *h)
5329 {
5330     MpegEncContext * const s = &h->s;
5331     const int mb_xy  = h->mb_xy;
5332     h->top_mb_xy     = mb_xy - s->mb_stride;
5333     h->left_mb_xy[0] = mb_xy - 1;
5334     if(FRAME_MBAFF){
5335         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5336         const int top_pair_xy      = pair_xy     - s->mb_stride;
5337         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5338         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5339         const int curr_mb_frame_flag = !MB_FIELD;
5340         const int bottom = (s->mb_y & 1);
5341         if (bottom
5342                 ? !curr_mb_frame_flag // bottom macroblock
5343                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5344                 ) {
5345             h->top_mb_xy -= s->mb_stride;
5346         }
5347         if (left_mb_frame_flag != curr_mb_frame_flag) {
5348             h->left_mb_xy[0] = pair_xy - 1;
5349         }
5350     } else if (FIELD_PICTURE) {
5351         h->top_mb_xy -= s->mb_stride;
5352     }
5353     return;
5354 }
5355
5356 /**
5357  * decodes a macroblock
5358  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5359  */
5360 static int decode_mb_cabac(H264Context *h) {
5361     MpegEncContext * const s = &h->s;
5362     int mb_xy;
5363     int mb_type, partition_count, cbp = 0;
5364     int dct8x8_allowed= h->pps.transform_8x8_mode;
5365
5366     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5367
5368     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5369
5370     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5371     if( h->slice_type_nos != FF_I_TYPE ) {
5372         int skip;
5373         /* a skipped mb needs the aff flag from the following mb */
5374         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5375             predict_field_decoding_flag(h);
5376         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5377             skip = h->next_mb_skipped;
5378         else
5379             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5380         /* read skip flags */
5381         if( skip ) {
5382             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5383                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5384                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5385                 if(h->next_mb_skipped)
5386                     predict_field_decoding_flag(h);
5387                 else
5388                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5389             }
5390
5391             decode_mb_skip(h);
5392
5393             h->cbp_table[mb_xy] = 0;
5394             h->chroma_pred_mode_table[mb_xy] = 0;
5395             h->last_qscale_diff = 0;
5396
5397             return 0;
5398
5399         }
5400     }
5401     if(FRAME_MBAFF){
5402         if( (s->mb_y&1) == 0 )
5403             h->mb_mbaff =
5404             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5405     }
5406
5407     h->prev_mb_skipped = 0;
5408
5409     compute_mb_neighbors(h);
5410     mb_type = decode_cabac_mb_type( h );
5411     assert(mb_type >= 0);
5412
5413     if( h->slice_type_nos == FF_B_TYPE ) {
5414         if( mb_type < 23 ){
5415             partition_count= b_mb_type_info[mb_type].partition_count;
5416             mb_type=         b_mb_type_info[mb_type].type;
5417         }else{
5418             mb_type -= 23;
5419             goto decode_intra_mb;
5420         }
5421     } else if( h->slice_type_nos == FF_P_TYPE ) {
5422         if( mb_type < 5) {
5423             partition_count= p_mb_type_info[mb_type].partition_count;
5424             mb_type=         p_mb_type_info[mb_type].type;
5425         } else {
5426             mb_type -= 5;
5427             goto decode_intra_mb;
5428         }
5429     } else {
5430         if(h->slice_type == FF_SI_TYPE && mb_type)
5431             mb_type--;
5432         assert(h->slice_type_nos == FF_I_TYPE);
5433 decode_intra_mb:
5434         partition_count = 0;
5435         cbp= i_mb_type_info[mb_type].cbp;
5436         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5437         mb_type= i_mb_type_info[mb_type].type;
5438     }
5439     if(MB_FIELD)
5440         mb_type |= MB_TYPE_INTERLACED;
5441
5442     h->slice_table[ mb_xy ]= h->slice_num;
5443
5444     if(IS_INTRA_PCM(mb_type)) {
5445         const uint8_t *ptr;
5446
5447         // We assume these blocks are very rare so we do not optimize it.
5448         // FIXME The two following lines get the bitstream position in the cabac
5449         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5450         ptr= h->cabac.bytestream;
5451         if(h->cabac.low&0x1) ptr--;
5452         if(CABAC_BITS==16){
5453             if(h->cabac.low&0x1FF) ptr--;
5454         }
5455
5456         // The pixels are stored in the same order as levels in h->mb array.
5457         memcpy(h->mb, ptr, 256); ptr+=256;
5458         if(CHROMA){
5459             memcpy(h->mb+128, ptr, 128); ptr+=128;
5460         }
5461
5462         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5463
5464         // All blocks are present
5465         h->cbp_table[mb_xy] = 0x1ef;
5466         h->chroma_pred_mode_table[mb_xy] = 0;
5467         // In deblocking, the quantizer is 0
5468         s->current_picture.qscale_table[mb_xy]= 0;
5469         // All coeffs are present
5470         memset(h->non_zero_count[mb_xy], 16, 16);
5471         s->current_picture.mb_type[mb_xy]= mb_type;
5472         h->last_qscale_diff = 0;
5473         return 0;
5474     }
5475
5476     if(MB_MBAFF){
5477         h->ref_count[0] <<= 1;
5478         h->ref_count[1] <<= 1;
5479     }
5480
5481     fill_caches(h, mb_type, 0);
5482
5483     if( IS_INTRA( mb_type ) ) {
5484         int i, pred_mode;
5485         if( IS_INTRA4x4( mb_type ) ) {
5486             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5487                 mb_type |= MB_TYPE_8x8DCT;
5488                 for( i = 0; i < 16; i+=4 ) {
5489                     int pred = pred_intra_mode( h, i );
5490                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5491                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5492                 }
5493             } else {
5494                 for( i = 0; i < 16; i++ ) {
5495                     int pred = pred_intra_mode( h, i );
5496                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5497
5498                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5499                 }
5500             }
5501             write_back_intra_pred_mode(h);
5502             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5503         } else {
5504             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5505             if( h->intra16x16_pred_mode < 0 ) return -1;
5506         }
5507         if(CHROMA){
5508             h->chroma_pred_mode_table[mb_xy] =
5509             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5510
5511             pred_mode= check_intra_pred_mode( h, pred_mode );
5512             if( pred_mode < 0 ) return -1;
5513             h->chroma_pred_mode= pred_mode;
5514         }
5515     } else if( partition_count == 4 ) {
5516         int i, j, sub_partition_count[4], list, ref[2][4];
5517
5518         if( h->slice_type_nos == FF_B_TYPE ) {
5519             for( i = 0; i < 4; i++ ) {
5520                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5521                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5522                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5523             }
5524             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5525                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5526                 pred_direct_motion(h, &mb_type);
5527                 h->ref_cache[0][scan8[4]] =
5528                 h->ref_cache[1][scan8[4]] =
5529                 h->ref_cache[0][scan8[12]] =
5530                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5531                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5532                     for( i = 0; i < 4; i++ )
5533                         if( IS_DIRECT(h->sub_mb_type[i]) )
5534                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5535                 }
5536             }
5537         } else {
5538             for( i = 0; i < 4; i++ ) {
5539                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5540                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5541                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5542             }
5543         }
5544
5545         for( list = 0; list < h->list_count; list++ ) {
5546                 for( i = 0; i < 4; i++ ) {
5547                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5548                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5549                         if( h->ref_count[list] > 1 ){
5550                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5551                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5552                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5553                                 return -1;
5554                             }
5555                         }else
5556                             ref[list][i] = 0;
5557                     } else {
5558                         ref[list][i] = -1;
5559                     }
5560                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5561                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5562                 }
5563         }
5564
5565         if(dct8x8_allowed)
5566             dct8x8_allowed = get_dct8x8_allowed(h);
5567
5568         for(list=0; list<h->list_count; list++){
5569             for(i=0; i<4; i++){
5570                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5571                 if(IS_DIRECT(h->sub_mb_type[i])){
5572                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5573                     continue;
5574                 }
5575
5576                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5577                     const int sub_mb_type= h->sub_mb_type[i];
5578                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5579                     for(j=0; j<sub_partition_count[i]; j++){
5580                         int mpx, mpy;
5581                         int mx, my;
5582                         const int index= 4*i + block_width*j;
5583                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5584                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5585                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5586
5587                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5588                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5589                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5590
5591                         if(IS_SUB_8X8(sub_mb_type)){
5592                             mv_cache[ 1 ][0]=
5593                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5594                             mv_cache[ 1 ][1]=
5595                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5596
5597                             mvd_cache[ 1 ][0]=
5598                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5599                             mvd_cache[ 1 ][1]=
5600                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5601                         }else if(IS_SUB_8X4(sub_mb_type)){
5602                             mv_cache[ 1 ][0]= mx;
5603                             mv_cache[ 1 ][1]= my;
5604
5605                             mvd_cache[ 1 ][0]= mx - mpx;
5606                             mvd_cache[ 1 ][1]= my - mpy;
5607                         }else if(IS_SUB_4X8(sub_mb_type)){
5608                             mv_cache[ 8 ][0]= mx;
5609                             mv_cache[ 8 ][1]= my;
5610
5611                             mvd_cache[ 8 ][0]= mx - mpx;
5612                             mvd_cache[ 8 ][1]= my - mpy;
5613                         }
5614                         mv_cache[ 0 ][0]= mx;
5615                         mv_cache[ 0 ][1]= my;
5616
5617                         mvd_cache[ 0 ][0]= mx - mpx;
5618                         mvd_cache[ 0 ][1]= my - mpy;
5619                     }
5620                 }else{
5621                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5622                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5623                     p[0] = p[1] = p[8] = p[9] = 0;
5624                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5625                 }
5626             }
5627         }
5628     } else if( IS_DIRECT(mb_type) ) {
5629         pred_direct_motion(h, &mb_type);
5630         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5631         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5632         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5633     } else {
5634         int list, mx, my, i, mpx, mpy;
5635         if(IS_16X16(mb_type)){
5636             for(list=0; list<h->list_count; list++){
5637                 if(IS_DIR(mb_type, 0, list)){
5638                     int ref;
5639                     if(h->ref_count[list] > 1){
5640                         ref= decode_cabac_mb_ref(h, list, 0);
5641                         if(ref >= (unsigned)h->ref_count[list]){
5642                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5643                             return -1;
5644                         }
5645                     }else
5646                         ref=0;
5647                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5648                 }else
5649                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5650             }
5651             for(list=0; list<h->list_count; list++){
5652                 if(IS_DIR(mb_type, 0, list)){
5653                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5654
5655                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5656                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5657                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5658
5659                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5660                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5661                 }else
5662                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5663             }
5664         }
5665         else if(IS_16X8(mb_type)){
5666             for(list=0; list<h->list_count; list++){
5667                     for(i=0; i<2; i++){
5668                         if(IS_DIR(mb_type, i, list)){
5669                             int ref;
5670                             if(h->ref_count[list] > 1){
5671                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5672                                 if(ref >= (unsigned)h->ref_count[list]){
5673                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5674                                     return -1;
5675                                 }
5676                             }else
5677                                 ref=0;
5678                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5679                         }else
5680                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5681                     }
5682             }
5683             for(list=0; list<h->list_count; list++){
5684                 for(i=0; i<2; i++){
5685                     if(IS_DIR(mb_type, i, list)){
5686                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5687                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5688                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5689                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5690
5691                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5692                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5693                     }else{
5694                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5695                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5696                     }
5697                 }
5698             }
5699         }else{
5700             assert(IS_8X16(mb_type));
5701             for(list=0; list<h->list_count; list++){
5702                     for(i=0; i<2; i++){
5703                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5704                             int ref;
5705                             if(h->ref_count[list] > 1){
5706                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5707                                 if(ref >= (unsigned)h->ref_count[list]){
5708                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5709                                     return -1;
5710                                 }
5711                             }else
5712                                 ref=0;
5713                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5714                         }else
5715                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5716                     }
5717             }
5718             for(list=0; list<h->list_count; list++){
5719                 for(i=0; i<2; i++){
5720                     if(IS_DIR(mb_type, i, list)){
5721                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5722                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5723                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5724
5725                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5726                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5727                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5728                     }else{
5729                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5730                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5731                     }
5732                 }
5733             }
5734         }
5735     }
5736
5737    if( IS_INTER( mb_type ) ) {
5738         h->chroma_pred_mode_table[mb_xy] = 0;
5739         write_back_motion( h, mb_type );
5740    }
5741
5742     if( !IS_INTRA16x16( mb_type ) ) {
5743         cbp  = decode_cabac_mb_cbp_luma( h );
5744         if(CHROMA)
5745             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5746     }
5747
5748     h->cbp_table[mb_xy] = h->cbp = cbp;
5749
5750     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5751         if( decode_cabac_mb_transform_size( h ) )
5752             mb_type |= MB_TYPE_8x8DCT;
5753     }
5754     s->current_picture.mb_type[mb_xy]= mb_type;
5755
5756     if( cbp || IS_INTRA16x16( mb_type ) ) {
5757         const uint8_t *scan, *scan8x8, *dc_scan;
5758         const uint32_t *qmul;
5759         int dqp;
5760
5761         if(IS_INTERLACED(mb_type)){
5762             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5763             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5764             dc_scan= luma_dc_field_scan;
5765         }else{
5766             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5767             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5768             dc_scan= luma_dc_zigzag_scan;
5769         }
5770
5771         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5772         if( dqp == INT_MIN ){
5773             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5774             return -1;
5775         }
5776         s->qscale += dqp;
5777         if(((unsigned)s->qscale) > 51){
5778             if(s->qscale<0) s->qscale+= 52;
5779             else            s->qscale-= 52;
5780         }
5781         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5782         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5783
5784         if( IS_INTRA16x16( mb_type ) ) {
5785             int i;
5786             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5787             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5788
5789             if( cbp&15 ) {
5790                 qmul = h->dequant4_coeff[0][s->qscale];
5791                 for( i = 0; i < 16; i++ ) {
5792                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5793                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5794                 }
5795             } else {
5796                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5797             }
5798         } else {
5799             int i8x8, i4x4;
5800             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5801                 if( cbp & (1<<i8x8) ) {
5802                     if( IS_8x8DCT(mb_type) ) {
5803                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5804                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5805                     } else {
5806                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5807                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5808                             const int index = 4*i8x8 + i4x4;
5809                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5810 //START_TIMER
5811                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5812 //STOP_TIMER("decode_residual")
5813                         }
5814                     }
5815                 } else {
5816                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5817                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5818                 }
5819             }
5820         }
5821
5822         if( cbp&0x30 ){
5823             int c;
5824             for( c = 0; c < 2; c++ ) {
5825                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5826                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5827             }
5828         }
5829
5830         if( cbp&0x20 ) {
5831             int c, i;
5832             for( c = 0; c < 2; c++ ) {
5833                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5834                 for( i = 0; i < 4; i++ ) {
5835                     const int index = 16 + 4 * c + i;
5836                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5837                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5838                 }
5839             }
5840         } else {
5841             uint8_t * const nnz= &h->non_zero_count_cache[0];
5842             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5843             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5844         }
5845     } else {
5846         uint8_t * const nnz= &h->non_zero_count_cache[0];
5847         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5848         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5849         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5850         h->last_qscale_diff = 0;
5851     }
5852
5853     s->current_picture.qscale_table[mb_xy]= s->qscale;
5854     write_back_non_zero_count(h);
5855
5856     if(MB_MBAFF){
5857         h->ref_count[0] >>= 1;
5858         h->ref_count[1] >>= 1;
5859     }
5860
5861     return 0;
5862 }
5863
5864
5865 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5866     int i, d;
5867     const int index_a = qp + h->slice_alpha_c0_offset;
5868     const int alpha = (alpha_table+52)[index_a];
5869     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5870
5871     if( bS[0] < 4 ) {
5872         int8_t tc[4];
5873         for(i=0; i<4; i++)
5874             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5875         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5876     } else {
5877         /* 16px edge length, because bS=4 is triggered by being at
5878          * the edge of an intra MB, so all 4 bS are the same */
5879             for( d = 0; d < 16; d++ ) {
5880                 const int p0 = pix[-1];
5881                 const int p1 = pix[-2];
5882                 const int p2 = pix[-3];
5883
5884                 const int q0 = pix[0];
5885                 const int q1 = pix[1];
5886                 const int q2 = pix[2];
5887
5888                 if( FFABS( p0 - q0 ) < alpha &&
5889                     FFABS( p1 - p0 ) < beta &&
5890                     FFABS( q1 - q0 ) < beta ) {
5891
5892                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5893                         if( FFABS( p2 - p0 ) < beta)
5894                         {
5895                             const int p3 = pix[-4];
5896                             /* p0', p1', p2' */
5897                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5898                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5899                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5900                         } else {
5901                             /* p0' */
5902                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5903                         }
5904                         if( FFABS( q2 - q0 ) < beta)
5905                         {
5906                             const int q3 = pix[3];
5907                             /* q0', q1', q2' */
5908                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5909                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5910                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5911                         } else {
5912                             /* q0' */
5913                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5914                         }
5915                     }else{
5916                         /* p0', q0' */
5917                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5918                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5919                     }
5920                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5921                 }
5922                 pix += stride;
5923             }
5924     }
5925 }
5926 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5927     int i;
5928     const int index_a = qp + h->slice_alpha_c0_offset;
5929     const int alpha = (alpha_table+52)[index_a];
5930     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5931
5932     if( bS[0] < 4 ) {
5933         int8_t tc[4];
5934         for(i=0; i<4; i++)
5935             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5936         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5937     } else {
5938         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5939     }
5940 }
5941
5942 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5943     int i;
5944     for( i = 0; i < 16; i++, pix += stride) {
5945         int index_a;
5946         int alpha;
5947         int beta;
5948
5949         int qp_index;
5950         int bS_index = (i >> 1);
5951         if (!MB_FIELD) {
5952             bS_index &= ~1;
5953             bS_index |= (i & 1);
5954         }
5955
5956         if( bS[bS_index] == 0 ) {
5957             continue;
5958         }
5959
5960         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5961         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5962         alpha = (alpha_table+52)[index_a];
5963         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5964
5965         if( bS[bS_index] < 4 ) {
5966             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5967             const int p0 = pix[-1];
5968             const int p1 = pix[-2];
5969             const int p2 = pix[-3];
5970             const int q0 = pix[0];
5971             const int q1 = pix[1];
5972             const int q2 = pix[2];
5973
5974             if( FFABS( p0 - q0 ) < alpha &&
5975                 FFABS( p1 - p0 ) < beta &&
5976                 FFABS( q1 - q0 ) < beta ) {
5977                 int tc = tc0;
5978                 int i_delta;
5979
5980                 if( FFABS( p2 - p0 ) < beta ) {
5981                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5982                     tc++;
5983                 }
5984                 if( FFABS( q2 - q0 ) < beta ) {
5985                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5986                     tc++;
5987                 }
5988
5989                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5990                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5991                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5992                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5993             }
5994         }else{
5995             const int p0 = pix[-1];
5996             const int p1 = pix[-2];
5997             const int p2 = pix[-3];
5998
5999             const int q0 = pix[0];
6000             const int q1 = pix[1];
6001             const int q2 = pix[2];
6002
6003             if( FFABS( p0 - q0 ) < alpha &&
6004                 FFABS( p1 - p0 ) < beta &&
6005                 FFABS( q1 - q0 ) < beta ) {
6006
6007                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6008                     if( FFABS( p2 - p0 ) < beta)
6009                     {
6010                         const int p3 = pix[-4];
6011                         /* p0', p1', p2' */
6012                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6013                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6014                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6015                     } else {
6016                         /* p0' */
6017                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6018                     }
6019                     if( FFABS( q2 - q0 ) < beta)
6020                     {
6021                         const int q3 = pix[3];
6022                         /* q0', q1', q2' */
6023                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6024                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6025                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6026                     } else {
6027                         /* q0' */
6028                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6029                     }
6030                 }else{
6031                     /* p0', q0' */
6032                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6033                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6034                 }
6035                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6036             }
6037         }
6038     }
6039 }
6040 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6041     int i;
6042     for( i = 0; i < 8; i++, pix += stride) {
6043         int index_a;
6044         int alpha;
6045         int beta;
6046
6047         int qp_index;
6048         int bS_index = i;
6049
6050         if( bS[bS_index] == 0 ) {
6051             continue;
6052         }
6053
6054         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6055         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6056         alpha = (alpha_table+52)[index_a];
6057         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6058
6059         if( bS[bS_index] < 4 ) {
6060             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6061             const int p0 = pix[-1];
6062             const int p1 = pix[-2];
6063             const int q0 = pix[0];
6064             const int q1 = pix[1];
6065
6066             if( FFABS( p0 - q0 ) < alpha &&
6067                 FFABS( p1 - p0 ) < beta &&
6068                 FFABS( q1 - q0 ) < beta ) {
6069                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6070
6071                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6072                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6073                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6074             }
6075         }else{
6076             const int p0 = pix[-1];
6077             const int p1 = pix[-2];
6078             const int q0 = pix[0];
6079             const int q1 = pix[1];
6080
6081             if( FFABS( p0 - q0 ) < alpha &&
6082                 FFABS( p1 - p0 ) < beta &&
6083                 FFABS( q1 - q0 ) < beta ) {
6084
6085                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6086                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6087                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6088             }
6089         }
6090     }
6091 }
6092
6093 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6094     int i, d;
6095     const int index_a = qp + h->slice_alpha_c0_offset;
6096     const int alpha = (alpha_table+52)[index_a];
6097     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6098     const int pix_next  = stride;
6099
6100     if( bS[0] < 4 ) {
6101         int8_t tc[4];
6102         for(i=0; i<4; i++)
6103             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6104         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6105     } else {
6106         /* 16px edge length, see filter_mb_edgev */
6107             for( d = 0; d < 16; d++ ) {
6108                 const int p0 = pix[-1*pix_next];
6109                 const int p1 = pix[-2*pix_next];
6110                 const int p2 = pix[-3*pix_next];
6111                 const int q0 = pix[0];
6112                 const int q1 = pix[1*pix_next];
6113                 const int q2 = pix[2*pix_next];
6114
6115                 if( FFABS( p0 - q0 ) < alpha &&
6116                     FFABS( p1 - p0 ) < beta &&
6117                     FFABS( q1 - q0 ) < beta ) {
6118
6119                     const int p3 = pix[-4*pix_next];
6120                     const int q3 = pix[ 3*pix_next];
6121
6122                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6123                         if( FFABS( p2 - p0 ) < beta) {
6124                             /* p0', p1', p2' */
6125                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6126                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6127                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6128                         } else {
6129                             /* p0' */
6130                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6131                         }
6132                         if( FFABS( q2 - q0 ) < beta) {
6133                             /* q0', q1', q2' */
6134                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6135                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6136                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6137                         } else {
6138                             /* q0' */
6139                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6140                         }
6141                     }else{
6142                         /* p0', q0' */
6143                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6144                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6145                     }
6146                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6147                 }
6148                 pix++;
6149             }
6150     }
6151 }
6152
6153 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6154     int i;
6155     const int index_a = qp + h->slice_alpha_c0_offset;
6156     const int alpha = (alpha_table+52)[index_a];
6157     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6158
6159     if( bS[0] < 4 ) {
6160         int8_t tc[4];
6161         for(i=0; i<4; i++)
6162             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6163         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6164     } else {
6165         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6166     }
6167 }
6168
/**
 * Deblock one macroblock using the simplified path, falling back to
 * filter_mb() whenever one of the preconditions checked below is not met.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      top-left luma sample of the macroblock
 * @param img_cb     top-left Cb sample of the macroblock
 * @param img_cr     top-left Cr sample of the macroblock
 * @param linesize   luma line stride in bytes
 * @param uvlinesize chroma line stride in bytes
 */
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    /* Row index treated as the first row: 1 for a bottom field, 0 otherwise. */
    int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
    int mb_xy, mb_type;
    int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;

    mb_xy = h->mb_xy;

    /* Use the generic filter when there is no left or top neighbour, no DSP
     * strength function, a chroma qp difference in the PPS, the FAST flag is
     * not set, or (deblocking_filter == 2) the top or left neighbour belongs
     * to another slice. */
    if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
        !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
       (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
                                      h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
        filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
        return;
    }
    assert(!FRAME_MBAFF);

    /* qp/qpc: this macroblock; qp0/qpc0: rounded average with the left
     * neighbour; qp1/qpc1: rounded average with the top neighbour. */
    mb_type = s->current_picture.mb_type[mb_xy];
    qp = s->current_picture.qscale_table[mb_xy];
    qp0 = s->current_picture.qscale_table[mb_xy-1];
    qp1 = s->current_picture.qscale_table[h->top_mb_xy];
    qpc = get_chroma_qp( h, 0, qp );
    qpc0 = get_chroma_qp( h, 0, qp0 );
    qpc1 = get_chroma_qp( h, 0, qp1 );
    qp0 = (qp + qp0 + 1) >> 1;
    qp1 = (qp + qp1 + 1) >> 1;
    qpc0 = (qpc + qpc0 + 1) >> 1;
    qpc1 = (qpc + qpc1 + 1) >> 1;
    /* Skip the macroblock entirely when every quantizer gives an alpha index
     * of at most 15 (presumably alpha_table is 0 there, so no edge could
     * pass the threshold test; confirm against alpha_table). */
    qp_thresh = 15 - h->slice_alpha_c0_offset;
    if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
       qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
        return;

    if( IS_INTRA(mb_type) ) {
        /* Intra macroblock: strengths are constant. The left edge uses 4,
         * internal edges use 3, and the top edge uses 3 in field pictures
         * and 4 otherwise. */
        int16_t bS4[4] = {4,4,4,4};
        int16_t bS3[4] = {3,3,3,3};
        int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
        if( IS_8x8DCT(mb_type) ) {
            /* With the 8x8 transform only edges 0 and 2 are filtered. */
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
        } else {
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
        }
        /* Chroma: two edges per direction and plane (sample offsets 0 and 4). */
        filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
        return;
    } else {
        /* bS[dir][edge][i]: dir 0 = vertical edges, dir 1 = horizontal edges.
         * bSv views each group of four int16_t strengths as one uint64_t so
         * a whole edge can be set or tested at once. */
        DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
        uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
        int edges;
        if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
            /* 8x8 transform with bits 0-2 of cbp all set: edges 0 and 2 get
             * strength 2 in both directions. Edges 1 and 3 stay unset, and
             * are not filtered on the 8x8 path below. */
            edges = 4;
            bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
        } else {
            /* Let the DSP routine derive the strengths from the non-zero
             * count, reference and motion vector caches. The exact meaning of
             * the mask and step arguments is defined by that routine. */
            int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
                             (mb_type & MB_TYPE_16x8) ? 1 : 0;
            int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
                             && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
                             ? 3 : 0;
            int step = IS_8x8DCT(mb_type) ? 2 : 1;
            edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
            s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
                                              (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
        }
        /* An intra neighbour overrides the strength of the shared edge:
         * 4 on the left, and 3 (field picture) or 4 on the top. */
        if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
            bSv[0][0] = 0x0004000400040004ULL;
        if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
            bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;

        /* FILTER(hv,dir,edge): if any strength on the edge is non-zero,
         * filter the luma edge and, for even edges, the matching Cb and Cr
         * edges. Edge 0 uses the qp averaged with the neighbour (qp0/qp1,
         * qpc0/qpc1); inner edges use the macroblock's own qp/qpc. */
#define FILTER(hv,dir,edge)\
        if(bSv[dir][edge]) {\
            filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
            if(!(edge&1)) {\
                filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
                filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
            }\
        }
        if( edges == 1 ) {
            /* Only the macroblock borders need filtering. */
            FILTER(v,0,0);
            FILTER(h,1,0);
        } else if( IS_8x8DCT(mb_type) ) {
            FILTER(v,0,0);
            FILTER(v,0,2);
            FILTER(h,1,0);
            FILTER(h,1,2);
        } else {
            FILTER(v,0,0);
            FILTER(v,0,1);
            FILTER(v,0,2);
            FILTER(v,0,3);
            FILTER(h,1,0);
            FILTER(h,1,1);
            FILTER(h,1,2);
            FILTER(h,1,3);
        }
#undef FILTER
    }
}
6282
6283
6284 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6285     MpegEncContext * const s = &h->s;
6286     int edge;
6287     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6288     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6289     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6290     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6291     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6292
6293     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6294                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6295     // how often to recheck mv-based bS when iterating between edges
6296     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6297                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6298     // how often to recheck mv-based bS when iterating along each edge
6299     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6300
6301     if (first_vertical_edge_done) {
6302         start = 1;
6303     }
6304
6305     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6306         start = 1;
6307
6308     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6309         && !IS_INTERLACED(mb_type)
6310         && IS_INTERLACED(mbm_type)
6311         ) {
6312         // This is a special case in the norm where the filtering must
6313         // be done twice (one each of the field) even if we are in a
6314         // frame macroblock.
6315         //
6316         static const int nnz_idx[4] = {4,5,6,3};
6317         unsigned int tmp_linesize   = 2 *   linesize;
6318         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6319         int mbn_xy = mb_xy - 2 * s->mb_stride;
6320         int qp;
6321         int i, j;
6322         int16_t bS[4];
6323
6324         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6325             if( IS_INTRA(mb_type) ||
6326                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6327                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6328             } else {
6329                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6330                 for( i = 0; i < 4; i++ ) {
6331                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6332                         mbn_nnz[nnz_idx[i]] != 0 )
6333                         bS[i] = 2;
6334                     else
6335                         bS[i] = 1;
6336                 }
6337             }
6338             // Do not use s->qscale as luma quantizer because it has not the same
6339             // value in IPCM macroblocks.
6340             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6341             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6342             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6343             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6344             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6345                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6346             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6347                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6348         }
6349
6350         start = 1;
6351     }
6352
6353     /* Calculate bS */
6354     for( edge = start; edge < edges; edge++ ) {
6355         /* mbn_xy: neighbor macroblock */
6356         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6357         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6358         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6359         int16_t bS[4];
6360         int qp;
6361
6362         if( (edge&1) && IS_8x8DCT(mb_type) )
6363             continue;
6364
6365         if( IS_INTRA(mb_type) ||
6366             IS_INTRA(mbn_type) ) {
6367             int value;
6368             if (edge == 0) {
6369                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6370                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6371                 ) {
6372                     value = 4;
6373                 } else {
6374                     value = 3;
6375                 }
6376             } else {
6377                 value = 3;
6378             }
6379             bS[0] = bS[1] = bS[2] = bS[3] = value;
6380         } else {
6381             int i, l;
6382             int mv_done;
6383
6384             if( edge & mask_edge ) {
6385                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6386                 mv_done = 1;
6387             }
6388             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6389                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6390                 mv_done = 1;
6391             }
6392             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6393                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6394                 int bn_idx= b_idx - (dir ? 8:1);
6395                 int v = 0;
6396
6397                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6398                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6399                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6400                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6401                 }
6402
6403                 if(h->slice_type_nos == FF_B_TYPE && v){
6404                     v=0;
6405                     for( l = 0; !v && l < 2; l++ ) {
6406                         int ln= 1-l;
6407                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6408                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6409                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6410                     }
6411                 }
6412
6413                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6414                 mv_done = 1;
6415             }
6416             else
6417                 mv_done = 0;
6418
6419             for( i = 0; i < 4; i++ ) {
6420                 int x = dir == 0 ? edge : i;
6421                 int y = dir == 0 ? i    : edge;
6422                 int b_idx= 8 + 4 + x + 8*y;
6423                 int bn_idx= b_idx - (dir ? 8:1);
6424
6425                 if( h->non_zero_count_cache[b_idx] |
6426                     h->non_zero_count_cache[bn_idx] ) {
6427                     bS[i] = 2;
6428                 }
6429                 else if(!mv_done)
6430                 {
6431                     bS[i] = 0;
6432                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6433                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6434                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6435                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6436                             bS[i] = 1;
6437                             break;
6438                         }
6439                     }
6440
6441                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6442                         bS[i] = 0;
6443                         for( l = 0; l < 2; l++ ) {
6444                             int ln= 1-l;
6445                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6446                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6447                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6448                                 bS[i] = 1;
6449                                 break;
6450                             }
6451                         }
6452                     }
6453                 }
6454             }
6455
6456             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6457                 continue;
6458         }
6459
6460         /* Filter edge */
6461         // Do not use s->qscale as luma quantizer because it has not the same
6462         // value in IPCM macroblocks.
6463         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6464         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6465         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6466         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6467         if( dir == 0 ) {
6468             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6469             if( (edge&1) == 0 ) {
6470                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6471                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6472                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6473                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6474             }
6475         } else {
6476             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6477             if( (edge&1) == 0 ) {
6478                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6479                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6480                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6481                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6482             }
6483         }
6484     }
6485 }
6486
/**
 * Runs the deblocking filter over the edges of one macroblock.
 *
 * The work is done in four steps:
 *  1. (non-MBAFF only) return early when the QP of this macroblock and of its
 *     left/top neighbours is at or below a conservative threshold;
 *  2. (CAVLC streams with 8x8 transform enabled) overwrite entries of
 *     h->non_zero_count_cache with values derived from the coded block
 *     pattern, for the top/left neighbours and for the current macroblock;
 *  3. (MBAFF only) filter the left macroblock edge here when the current and
 *     the left macroblock differ in interlaced type, using 8 bS values and
 *     two QP sets;
 *  4. hand the remaining edges to filter_mb_dir() for dir 0 and then dir 1.
 *
 * Note that step 2 modifies h->non_zero_count_cache in place.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column, combined with mb_y into mb_xy
 * @param mb_y       macroblock row
 * @param img_y      luma pointer handed to the edge filters
 *                   (presumably the top-left sample of this macroblock -- confirm with caller)
 * @param img_cb     Cb pointer handed to the chroma edge filters
 * @param img_cr     Cr pointer handed to the chroma edge filters
 * @param linesize   stride passed along with img_y
 * @param uvlinesize stride passed along with img_cb / img_cr
 */
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type = s->current_picture.mb_type[mb_xy];
    // vertical mv difference threshold forwarded to filter_mb_dir(): 2 for interlaced macroblocks, 4 otherwise
    const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
    // set to 1 when the MBAFF branch below has already filtered the left macroblock edge
    int first_vertical_edge_done = 0;
    int dir;

    //for sufficiently low qp, filtering wouldn't do anything
    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
    if(!FRAME_MBAFF){
        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
        int qp = s->current_picture.qscale_table[mb_xy];
        // the left / top averages are only tested when that neighbour exists (mb_x / mb_y non-zero)
        if(qp <= qp_thresh
           && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
           && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
            return;
        }
    }

    // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
    if(!h->pps.cabac && h->pps.transform_8x8_mode){
        int top_type, left_type[2];
        top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
        left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
        left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];

        // top neighbour: cache entries 4..7 of row 0 take cbp bits 4 and 8 of the top macroblock
        if(IS_8x8DCT(top_type)){
            h->non_zero_count_cache[4+8*0]=
            h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
            h->non_zero_count_cache[6+8*0]=
            h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
        }
        // left neighbours: cache column 3, rows 1-2 from left_mb_xy[0] and rows 3-4 from left_mb_xy[1]
        if(IS_8x8DCT(left_type[0])){
            h->non_zero_count_cache[3+8*1]=
            h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
        }
        if(IS_8x8DCT(left_type[1])){
            h->non_zero_count_cache[3+8*3]=
            h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
        }

        // current macroblock: each group of four scan8 entries takes one bit of h->cbp
        if(IS_8x8DCT(mb_type)){
            h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;

            h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;

            h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;

            h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
        }
    }

    if (FRAME_MBAFF
            // left mb is in picture
            && h->slice_table[mb_xy-1] != 0xFFFF
            // and current and left pair do not have the same interlaced type
            && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
            // and left mb is in the same slice if deblocking_filter == 2
            && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
        /* First vertical edge is different in MBAFF frames
         * There are 8 different bS to compute and 2 different Qp
         */
        // pair_xy: index of the macroblock in the even row of the current pair (mb_y with its low bit cleared)
        const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
        // the two macroblocks of the pair immediately to the left
        const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
        int16_t bS[8];
        // averaged QPs against left_mb_xy[0] / left_mb_xy[1]: luma (qp), Cb (bqp), Cr (rqp)
        int qp[2];
        int bqp[2];
        int rqp[2];
        int mb_qp, mbn0_qp, mbn1_qp;
        int i;
        first_vertical_edge_done = 1;

        if( IS_INTRA(mb_type) )
            bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
        else {
            for( i = 0; i < 8; i++ ) {
                // field macroblock: entries 0-3 use left_mb_xy[0], 4-7 use left_mb_xy[1];
                // frame macroblock: the two left macroblocks alternate with i
                int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];

                if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
                    bS[i] = 4;
                // bS 2 when either side has coefficients: current side from the NNZ cache,
                // neighbour side from its cbp (CAVLC 8x8dct) or its stored non_zero_count
                else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
                         ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
                            (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
                                                                       :
                            h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
                    bS[i] = 2;
                else
                    bS[i] = 1;
            }
        }

        // QPs are rounded averages of the current macroblock and each left macroblock;
        // chroma QPs are mapped through get_chroma_qp() before averaging
        mb_qp = s->current_picture.qscale_table[mb_xy];
        mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
        mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
        qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
        bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
        rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
        qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
        bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
        rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;

        /* Filter edge */
        tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
        { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
    }

    // Both variants call filter_mb_dir() for dir 0 then dir 1; first_vertical_edge_done
    // is only forwarded for dir 0, dir 1 always receives 0.
#ifdef CONFIG_SMALL
    for( dir = 0; dir < 2; dir++ )
        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
#else
    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
#endif
}
6613
6614 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6615     H264Context *h = *(void**)arg;
6616     MpegEncContext * const s = &h->s;
6617     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6618
6619     s->mb_skip_run= -1;
6620
6621     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6622                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6623
6624     if( h->pps.cabac ) {
6625         int i;
6626
6627         /* realign */
6628         align_get_bits( &s->gb );
6629
6630         /* init cabac */
6631         ff_init_cabac_states( &h->cabac);
6632         ff_init_cabac_decoder( &h->cabac,
6633                                s->gb.buffer + get_bits_count(&s->gb)/8,
6634                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6635         /* calculate pre-state */
6636         for( i= 0; i < 460; i++ ) {
6637             int pre;
6638             if( h->slice_type_nos == FF_I_TYPE )
6639                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6640             else
6641                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6642
6643             if( pre <= 63 )
6644                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6645             else
6646                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6647         }
6648
6649         for(;;){
6650 //START_TIMER
6651             int ret = decode_mb_cabac(h);
6652             int eos;
6653 //STOP_TIMER("decode_mb_cabac")
6654
6655             if(ret>=0) hl_decode_mb(h);
6656
6657             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6658                 s->mb_y++;
6659
6660                 if(ret>=0) ret = decode_mb_cabac(h);
6661
6662                 if(ret>=0) hl_decode_mb(h);
6663                 s->mb_y--;
6664             }
6665             eos = get_cabac_terminate( &h->cabac );
6666
6667             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6668                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6669                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6670                 return -1;
6671             }
6672
6673             if( ++s->mb_x >= s->mb_width ) {
6674                 s->mb_x = 0;
6675                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6676                 ++s->mb_y;
6677                 if(FIELD_OR_MBAFF_PICTURE) {
6678                     ++s->mb_y;
6679                 }
6680             }
6681
6682             if( eos || s->mb_y >= s->mb_height ) {
6683                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6684                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6685                 return 0;
6686             }
6687         }
6688
6689     } else {
6690         for(;;){
6691             int ret = decode_mb_cavlc(h);
6692
6693             if(ret>=0) hl_decode_mb(h);
6694
6695             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6696                 s->mb_y++;
6697                 ret = decode_mb_cavlc(h);
6698
6699                 if(ret>=0) hl_decode_mb(h);
6700                 s->mb_y--;
6701             }
6702
6703             if(ret<0){
6704                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6705                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6706
6707                 return -1;
6708             }
6709
6710             if(++s->mb_x >= s->mb_width){
6711                 s->mb_x=0;
6712                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6713                 ++s->mb_y;
6714                 if(FIELD_OR_MBAFF_PICTURE) {
6715                     ++s->mb_y;
6716                 }
6717                 if(s->mb_y >= s->mb_height){
6718                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6719
6720                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6721                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6722
6723                         return 0;
6724                     }else{
6725                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6726
6727                         return -1;
6728                     }
6729                 }
6730             }
6731
6732             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6733                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6734                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6735                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6736
6737                     return 0;
6738                 }else{
6739                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6740
6741                     return -1;
6742                 }
6743             }
6744         }
6745     }
6746
6747 #if 0
6748     for(;s->mb_y < s->mb_height; s->mb_y++){
6749         for(;s->mb_x < s->mb_width; s->mb_x++){
6750             int ret= decode_mb(h);
6751
6752             hl_decode_mb(h);
6753
6754             if(ret<0){
6755                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6756                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6757
6758                 return -1;
6759             }
6760
6761             if(++s->mb_x >= s->mb_width){
6762                 s->mb_x=0;
6763                 if(++s->mb_y >= s->mb_height){
6764                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6765                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6766
6767                         return 0;
6768                     }else{
6769                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6770
6771                         return -1;
6772                     }
6773                 }
6774             }
6775
6776             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6777                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6778                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6779
6780                     return 0;
6781                 }else{
6782                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6783
6784                     return -1;
6785                 }
6786             }
6787         }
6788         s->mb_x=0;
6789         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6790     }
6791 #endif
6792     return -1; //not reached
6793 }
6794
6795 static int decode_picture_timing(H264Context *h){
6796     MpegEncContext * const s = &h->s;
6797     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6798         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6799         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6800     }
6801     if(h->sps.pic_struct_present_flag){
6802         unsigned int i, num_clock_ts;
6803         h->sei_pic_struct = get_bits(&s->gb, 4);
6804
6805         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6806             return -1;
6807
6808         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6809
6810         for (i = 0 ; i < num_clock_ts ; i++){
6811             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6812                 unsigned int full_timestamp_flag;
6813                 skip_bits(&s->gb, 2);                 /* ct_type */
6814                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6815                 skip_bits(&s->gb, 5);                 /* counting_type */
6816                 full_timestamp_flag = get_bits(&s->gb, 1);
6817                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6818                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6819                 skip_bits(&s->gb, 8);                 /* n_frames */
6820                 if(full_timestamp_flag){
6821                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6822                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6823                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6824                 }else{
6825                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6826                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6827                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6828                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6829                             if(get_bits(&s->gb, 1))   /* hours_flag */
6830                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6831                         }
6832                     }
6833                 }
6834                 if(h->sps.time_offset_length > 0)
6835                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6836             }
6837         }
6838     }
6839     return 0;
6840 }
6841
6842 static int decode_unregistered_user_data(H264Context *h, int size){
6843     MpegEncContext * const s = &h->s;
6844     uint8_t user_data[16+256];
6845     int e, build, i;
6846
6847     if(size<16)
6848         return -1;
6849
6850     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6851         user_data[i]= get_bits(&s->gb, 8);
6852     }
6853
6854     user_data[i]= 0;
6855     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6856     if(e==1 && build>=0)
6857         h->x264_build= build;
6858
6859     if(s->avctx->debug & FF_DEBUG_BUGS)
6860         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6861
6862     for(; i<size; i++)
6863         skip_bits(&s->gb, 8);
6864
6865     return 0;
6866 }
6867
6868 static int decode_sei(H264Context *h){
6869     MpegEncContext * const s = &h->s;
6870
6871     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6872         int size, type;
6873
6874         type=0;
6875         do{
6876             type+= show_bits(&s->gb, 8);
6877         }while(get_bits(&s->gb, 8) == 255);
6878
6879         size=0;
6880         do{
6881             size+= show_bits(&s->gb, 8);
6882         }while(get_bits(&s->gb, 8) == 255);
6883
6884         switch(type){
6885         case 1: // Picture timing SEI
6886             if(decode_picture_timing(h) < 0)
6887                 return -1;
6888             break;
6889         case 5:
6890             if(decode_unregistered_user_data(h, size) < 0)
6891                 return -1;
6892             break;
6893         default:
6894             skip_bits(&s->gb, 8*size);
6895         }
6896
6897         //FIXME check bits here
6898         align_get_bits(&s->gb);
6899     }
6900
6901     return 0;
6902 }
6903
6904 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6905     MpegEncContext * const s = &h->s;
6906     int cpb_count, i;
6907     cpb_count = get_ue_golomb(&s->gb) + 1;
6908
6909     if(cpb_count > 32U){
6910         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6911         return -1;
6912     }
6913
6914     get_bits(&s->gb, 4); /* bit_rate_scale */
6915     get_bits(&s->gb, 4); /* cpb_size_scale */
6916     for(i=0; i<cpb_count; i++){
6917         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6918         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6919         get_bits1(&s->gb);     /* cbr_flag */
6920     }
6921     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6922     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6923     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6924     sps->time_offset_length = get_bits(&s->gb, 5);
6925     return 0;
6926 }
6927
6928 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6929     MpegEncContext * const s = &h->s;
6930     int aspect_ratio_info_present_flag;
6931     unsigned int aspect_ratio_idc;
6932
6933     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6934
6935     if( aspect_ratio_info_present_flag ) {
6936         aspect_ratio_idc= get_bits(&s->gb, 8);
6937         if( aspect_ratio_idc == EXTENDED_SAR ) {
6938             sps->sar.num= get_bits(&s->gb, 16);
6939             sps->sar.den= get_bits(&s->gb, 16);
6940         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6941             sps->sar=  pixel_aspect[aspect_ratio_idc];
6942         }else{
6943             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6944             return -1;
6945         }
6946     }else{
6947         sps->sar.num=
6948         sps->sar.den= 0;
6949     }
6950 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6951
6952     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6953         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6954     }
6955
6956     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6957         get_bits(&s->gb, 3);    /* video_format */
6958         get_bits1(&s->gb);      /* video_full_range_flag */
6959         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6960             get_bits(&s->gb, 8); /* colour_primaries */
6961             get_bits(&s->gb, 8); /* transfer_characteristics */
6962             get_bits(&s->gb, 8); /* matrix_coefficients */
6963         }
6964     }
6965
6966     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6967         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6968         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6969     }
6970
6971     sps->timing_info_present_flag = get_bits1(&s->gb);
6972     if(sps->timing_info_present_flag){
6973         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6974         sps->time_scale = get_bits_long(&s->gb, 32);
6975         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6976     }
6977
6978     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6979     if(sps->nal_hrd_parameters_present_flag)
6980         if(decode_hrd_parameters(h, sps) < 0)
6981             return -1;
6982     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6983     if(sps->vcl_hrd_parameters_present_flag)
6984         if(decode_hrd_parameters(h, sps) < 0)
6985             return -1;
6986     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6987         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6988     sps->pic_struct_present_flag = get_bits1(&s->gb);
6989
6990     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6991     if(sps->bitstream_restriction_flag){
6992         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6993         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6994         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6995         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6996         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6997         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6998         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6999
7000         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7001             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7002             return -1;
7003         }
7004     }
7005
7006     return 0;
7007 }
7008
7009 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7010                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7011     MpegEncContext * const s = &h->s;
7012     int i, last = 8, next = 8;
7013     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7014     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7015         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7016     else
7017     for(i=0;i<size;i++){
7018         if(next)
7019             next = (last + get_se_golomb(&s->gb)) & 0xff;
7020         if(!i && !next){ /* matrix not written, we use the preset one */
7021             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7022             break;
7023         }
7024         last = factors[scan[i]] = next ? next : last;
7025     }
7026 }
7027
7028 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7029                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7030     MpegEncContext * const s = &h->s;
7031     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7032     const uint8_t *fallback[4] = {
7033         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7034         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7035         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7036         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7037     };
7038     if(get_bits1(&s->gb)){
7039         sps->scaling_matrix_present |= is_sps;
7040         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7041         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7042         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7043         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7044         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7045         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7046         if(is_sps || pps->transform_8x8_mode){
7047             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7048             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7049         }
7050     }
7051 }
7052
7053 static inline int decode_seq_parameter_set(H264Context *h){
7054     MpegEncContext * const s = &h->s;
7055     int profile_idc, level_idc;
7056     unsigned int sps_id;
7057     int i;
7058     SPS *sps;
7059
7060     profile_idc= get_bits(&s->gb, 8);
7061     get_bits1(&s->gb);   //constraint_set0_flag
7062     get_bits1(&s->gb);   //constraint_set1_flag
7063     get_bits1(&s->gb);   //constraint_set2_flag
7064     get_bits1(&s->gb);   //constraint_set3_flag
7065     get_bits(&s->gb, 4); // reserved
7066     level_idc= get_bits(&s->gb, 8);
7067     sps_id= get_ue_golomb(&s->gb);
7068
7069     if(sps_id >= MAX_SPS_COUNT) {
7070         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7071         return -1;
7072     }
7073     sps= av_mallocz(sizeof(SPS));
7074     if(sps == NULL)
7075         return -1;
7076
7077     sps->profile_idc= profile_idc;
7078     sps->level_idc= level_idc;
7079
7080     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7081     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7082     sps->scaling_matrix_present = 0;
7083
7084     if(sps->profile_idc >= 100){ //high profile
7085         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7086         if(sps->chroma_format_idc == 3)
7087             get_bits1(&s->gb);  //residual_color_transform_flag
7088         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7089         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7090         sps->transform_bypass = get_bits1(&s->gb);
7091         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7092     }else{
7093         sps->chroma_format_idc= 1;
7094     }
7095
7096     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7097     sps->poc_type= get_ue_golomb(&s->gb);
7098
7099     if(sps->poc_type == 0){ //FIXME #define
7100         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7101     } else if(sps->poc_type == 1){//FIXME #define
7102         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7103         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7104         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7105         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7106
7107         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7108             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7109             goto fail;
7110         }
7111
7112         for(i=0; i<sps->poc_cycle_length; i++)
7113             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7114     }else if(sps->poc_type != 2){
7115         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7116         goto fail;
7117     }
7118
7119     sps->ref_frame_count= get_ue_golomb(&s->gb);
7120     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7121         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7122         goto fail;
7123     }
7124     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7125     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7126     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7127     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7128        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7129         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7130         goto fail;
7131     }
7132
7133     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7134     if(!sps->frame_mbs_only_flag)
7135         sps->mb_aff= get_bits1(&s->gb);
7136     else
7137         sps->mb_aff= 0;
7138
7139     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7140
7141 #ifndef ALLOW_INTERLACE
7142     if(sps->mb_aff)
7143         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7144 #endif
7145     sps->crop= get_bits1(&s->gb);
7146     if(sps->crop){
7147         sps->crop_left  = get_ue_golomb(&s->gb);
7148         sps->crop_right = get_ue_golomb(&s->gb);
7149         sps->crop_top   = get_ue_golomb(&s->gb);
7150         sps->crop_bottom= get_ue_golomb(&s->gb);
7151         if(sps->crop_left || sps->crop_top){
7152             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7153         }
7154         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7155             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7156         }
7157     }else{
7158         sps->crop_left  =
7159         sps->crop_right =
7160         sps->crop_top   =
7161         sps->crop_bottom= 0;
7162     }
7163
7164     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7165     if( sps->vui_parameters_present_flag )
7166         decode_vui_parameters(h, sps);
7167
7168     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7169         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7170                sps_id, sps->profile_idc, sps->level_idc,
7171                sps->poc_type,
7172                sps->ref_frame_count,
7173                sps->mb_width, sps->mb_height,
7174                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7175                sps->direct_8x8_inference_flag ? "8B8" : "",
7176                sps->crop_left, sps->crop_right,
7177                sps->crop_top, sps->crop_bottom,
7178                sps->vui_parameters_present_flag ? "VUI" : "",
7179                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7180                );
7181     }
7182     av_free(h->sps_buffers[sps_id]);
7183     h->sps_buffers[sps_id]= sps;
7184     return 0;
7185 fail:
7186     av_free(sps);
7187     return -1;
7188 }
7189
/**
 * Fill pps->chroma_qp_table[t]: entry qp holds chroma_qp[] looked up at
 * qp + index, with that sum clipped to the range 0..51.
 *
 * @param pps   picture parameter set whose table is written
 * @param t     which of the chroma QP tables to fill
 * @param index offset added to every luma qp before the lookup
 */
7190 static void
7191 build_qp_table(PPS *pps, int t, int index)
7192 {
7193     int qp = 52; /* walks the 52 entries from 51 down to 0 */
7194     while(qp-- > 0)
7195         pps->chroma_qp_table[t][qp] = chroma_qp[av_clip(qp + index, 0, 51)];
7196 }
7197
7198 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7199     MpegEncContext * const s = &h->s;
7200     unsigned int pps_id= get_ue_golomb(&s->gb);
7201     PPS *pps;
7202
7203     if(pps_id >= MAX_PPS_COUNT) {
7204         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7205         return -1;
7206     }
7207
7208     pps= av_mallocz(sizeof(PPS));
7209     if(pps == NULL)
7210         return -1;
7211     pps->sps_id= get_ue_golomb(&s->gb);
7212     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7213         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7214         goto fail;
7215     }
7216
7217     pps->cabac= get_bits1(&s->gb);
7218     pps->pic_order_present= get_bits1(&s->gb);
7219     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7220     if(pps->slice_group_count > 1 ){
7221         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7222         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7223         switch(pps->mb_slice_group_map_type){
7224         case 0:
7225 #if 0
7226 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7227 |    run_length[ i ]                                |1  |ue(v)   |
7228 #endif
7229             break;
7230         case 2:
7231 #if 0
7232 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7233 |{                                                  |   |        |
7234 |    top_left_mb[ i ]                               |1  |ue(v)   |
7235 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7236 |   }                                               |   |        |
7237 #endif
7238             break;
7239         case 3:
7240         case 4:
7241         case 5:
7242 #if 0
7243 |   slice_group_change_direction_flag               |1  |u(1)    |
7244 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7245 #endif
7246             break;
7247         case 6:
7248 #if 0
7249 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7250 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7251 |)                                                  |   |        |
7252 |    slice_group_id[ i ]                            |1  |u(v)    |
7253 #endif
7254             break;
7255         }
7256     }
7257     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7258     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7259     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7260         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7261         goto fail;
7262     }
7263
7264     pps->weighted_pred= get_bits1(&s->gb);
7265     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7266     pps->init_qp= get_se_golomb(&s->gb) + 26;
7267     pps->init_qs= get_se_golomb(&s->gb) + 26;
7268     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7269     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7270     pps->constrained_intra_pred= get_bits1(&s->gb);
7271     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7272
7273     pps->transform_8x8_mode= 0;
7274     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7275     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7276     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7277
7278     if(get_bits_count(&s->gb) < bit_length){
7279         pps->transform_8x8_mode= get_bits1(&s->gb);
7280         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7281         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7282     } else {
7283         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7284     }
7285
7286     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7287     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7288     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7289         h->pps.chroma_qp_diff= 1;
7290
7291     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7292         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7293                pps_id, pps->sps_id,
7294                pps->cabac ? "CABAC" : "CAVLC",
7295                pps->slice_group_count,
7296                pps->ref_count[0], pps->ref_count[1],
7297                pps->weighted_pred ? "weighted" : "",
7298                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7299                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7300                pps->constrained_intra_pred ? "CONSTR" : "",
7301                pps->redundant_pic_cnt_present ? "REDU" : "",
7302                pps->transform_8x8_mode ? "8x8DCT" : ""
7303                );
7304     }
7305
7306     av_free(h->pps_buffers[pps_id]);
7307     h->pps_buffers[pps_id]= pps;
7308     return 0;
7309 fail:
7310     av_free(pps);
7311     return -1;
7312 }
7313
7314 /**
7315  * Decode the pending slices: directly on the master context when there is
7316  * only one, otherwise through avctx->execute() over h->thread_context.
7317  * @param h h264 master context
7318  * @param context_count number of contexts to execute
7319  */
7320 static void execute_decode_slices(H264Context *h, int context_count){
7321     MpegEncContext * const s = &h->s;
7322     AVCodecContext * const avctx= s->avctx;
7323     H264Context *last;
7324     int n;
7325
7326     if(context_count == 1) {
7327         decode_slice(avctx, &h);
7328         return;
7329     }
7330     /* reset the error state of every context after the first */
7331     for(n = 1; n < context_count; n++) {
7332         h->thread_context[n]->s.error_recognition = avctx->error_recognition;
7333         h->thread_context[n]->s.error_count = 0;
7334     }
7335
7336     avctx->execute(avctx, (void *)decode_slice,
7337                    (void **)h->thread_context, NULL, context_count, sizeof(void*));
7338
7339     /* copy the state reached by the last context back into the master */
7340     last = h->thread_context[context_count - 1];
7341     s->mb_x = last->s.mb_x;
7342     s->mb_y = last->s.mb_y;
7343     s->dropable = last->s.dropable;
7344     s->picture_structure = last->s.picture_structure;
7345     for(n = 1; n < context_count; n++)
7346         s->error_count += h->thread_context[n]->s.error_count;
7347 }
7348
7349
/**
 * Split buf into NAL units and dispatch each one by its type.
 *
 * NAL units are delimited either by a length prefix of h->nal_length_size
 * bytes (when h->is_avc is set) or by 00 00 01 start codes. Slice NALs are
 * collected in h->thread_context[] and handed to execute_decode_slices()
 * whenever h->max_contexts of them are pending, and once more after the
 * loop for any remainder.
 *
 * @param h        h264 master context
 * @param buf      input data
 * @param buf_size size of buf in bytes
 * @return the buffer position reached (buf_index), or -1 on error
 */
7350 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7351     MpegEncContext * const s = &h->s;
7352     AVCodecContext * const avctx= s->avctx;
7353     int buf_index=0;
7354     H264Context *hx; ///< thread context
7355     int context_count = 0;
7356
7357     h->max_contexts = avctx->thread_count;
7358 #if 0
7359     int i;
7360     for(i=0; i<50; i++){
7361         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7362     }
7363 #endif
7364     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7365         h->current_slice = 0;
7366         if (!s->first_field)
7367             s->current_picture_ptr= NULL;
7368     }
7369
7370     for(;;){
7371         int consumed;
7372         int dst_length;
7373         int bit_length;
7374         const uint8_t *ptr;
7375         int i, nalsize = 0;
7376         int err;
7377
7378         if(h->is_avc) {
7379             if(buf_index >= buf_size) break;
7380             nalsize = 0;
7381             for(i = 0; i < h->nal_length_size; i++)
7382                 nalsize = (nalsize << 8) | buf[buf_index++];
                 /* compare against the remaining bytes instead of adding to
                  * buf_index, so a huge nalsize cannot overflow the sum */
7383             if(nalsize <= 1 || nalsize > buf_size - buf_index){
7384                 if(nalsize == 1){
7385                     buf_index++;
7386                     continue;
7387                 }else{
7388                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7389                     break;
7390                 }
7391             }
7392         } else {
7393             // start code prefix search
7394             for(; buf_index + 3 < buf_size; buf_index++){
7395                 // This should always succeed in the first iteration.
7396                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7397                     break;
7398             }
7399
7400             if(buf_index+3 >= buf_size) break;
7401
7402             buf_index+=3;
7403         }
7404
7405         hx = h->thread_context[context_count];
7406
7407         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7408         if (ptr==NULL || dst_length < 0){
7409             return -1;
7410         }
             /* strip trailing zero bytes; the length is tested first so that
              * ptr[-1] is never read when the payload is empty */
7411         while(dst_length > 0 && ptr[dst_length - 1] == 0)
7412             dst_length--;
7413         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7414
7415         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7416             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7417         }
7418
7419         if (h->is_avc && (nalsize != consumed)){
7420             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7421             consumed= nalsize;
7422         }
7423
7424         buf_index += consumed;
7425
7426         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7427            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7428             continue;
7429
7430       again:
7431         err = 0;
7432         switch(hx->nal_unit_type){
7433         case NAL_IDR_SLICE:
7434             if (h->nal_unit_type != NAL_IDR_SLICE) {
7435                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices\n");
7436                 return -1;
7437             }
7438             idr(h); //FIXME ensure we don't loose some frames if there is reordering
                 /* fall through: an IDR slice is then parsed like any other slice */
7439         case NAL_SLICE:
7440             init_get_bits(&hx->s.gb, ptr, bit_length);
7441             hx->intra_gb_ptr=
7442             hx->inter_gb_ptr= &hx->s.gb;
7443             hx->s.data_partitioning = 0;
7444
7445             if((err = decode_slice_header(hx, h)))
7446                break;
7447
7448             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
                 /* queue the slice for decoding unless hurry_up/skip_frame discard it */
7449             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7450                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7451                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7452                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7453                && avctx->skip_frame < AVDISCARD_ALL)
7454                 context_count++;
7455             break;
7456         case NAL_DPA:
7457             init_get_bits(&hx->s.gb, ptr, bit_length);
7458             hx->intra_gb_ptr=
7459             hx->inter_gb_ptr= NULL;
7460             hx->s.data_partitioning = 1;
7461
7462             err = decode_slice_header(hx, h);
7463             break;
7464         case NAL_DPB:
7465             init_get_bits(&hx->intra_gb, ptr, bit_length);
7466             hx->intra_gb_ptr= &hx->intra_gb;
7467             break;
7468         case NAL_DPC:
7469             init_get_bits(&hx->inter_gb, ptr, bit_length);
7470             hx->inter_gb_ptr= &hx->inter_gb;
7471
7472             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7473                && s->context_initialized
7474                && s->hurry_up < 5
7475                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7476                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7477                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7478                && avctx->skip_frame < AVDISCARD_ALL)
7479                 context_count++;
7480             break;
7481         case NAL_SEI:
7482             init_get_bits(&s->gb, ptr, bit_length);
7483             decode_sei(h);
7484             break;
7485         case NAL_SPS:
7486             init_get_bits(&s->gb, ptr, bit_length);
7487             decode_seq_parameter_set(h);
7488
7489             if(s->flags& CODEC_FLAG_LOW_DELAY)
7490                 s->low_delay=1;
7491
7492             if(avctx->has_b_frames < 2)
7493                 avctx->has_b_frames= !s->low_delay;
7494             break;
7495         case NAL_PPS:
7496             init_get_bits(&s->gb, ptr, bit_length);
7497
7498             decode_picture_parameter_set(h, bit_length);
7499
7500             break;
7501         case NAL_AUD:
7502         case NAL_END_SEQUENCE:
7503         case NAL_END_STREAM:
7504         case NAL_FILLER_DATA:
7505         case NAL_SPS_EXT:
7506         case NAL_AUXILIARY_SLICE:
7507             break;
7508         default:
7509             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7510         }
7511
             /* every context holds a pending slice: decode the batch */
7512         if(context_count == h->max_contexts) {
7513             execute_decode_slices(h, context_count);
7514             context_count = 0;
7515         }
7516
7517         if (err < 0)
7518             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7519         else if(err == 1) {
7520             /* Slice could not be decoded in parallel mode, copy down
7521              * NAL unit stuff to context 0 and restart. Note that
7522              * rbsp_buffer is not transferred, but since we no longer
7523              * run in parallel mode this should not be an issue. */
7524             h->nal_unit_type = hx->nal_unit_type;
7525             h->nal_ref_idc   = hx->nal_ref_idc;
7526             hx = h;
7527             goto again;
7528         }
7529     }
         /* decode whatever slices are still pending */
7530     if(context_count)
7531         execute_decode_slices(h, context_count);
7532     return buf_index;
7533 }
7534
7535 /**
7536  * returns the byte count to report as consumed: pos, but never 0, and buf_size once pos+10 exceeds buf_size
7537  */
7538 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7539     int consumed = pos ? pos : 1; //never report 0, to avoid infinite loops (i doubt that is needed but ...)
7540     if(consumed + 10 > buf_size)
7541         consumed = buf_size;      //close to the end: report the whole buffer as consumed
7542     return consumed;
7543 }
7544
7545 static int decode_frame(AVCodecContext *avctx,
7546                              void *data, int *data_size,
7547                              const uint8_t *buf, int buf_size)
7548 {
7549     H264Context *h = avctx->priv_data;
7550     MpegEncContext *s = &h->s;
7551     AVFrame *pict = data;
7552     int buf_index;
7553
7554     s->flags= avctx->flags;
7555     s->flags2= avctx->flags2;
7556
7557    /* end of stream, output what is still in the buffers */
7558     if (buf_size == 0) {
7559         Picture *out;
7560         int i, out_idx;
7561
7562 //FIXME factorize this with the output code below
7563         out = h->delayed_pic[0];
7564         out_idx = 0;
7565         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7566             if(h->delayed_pic[i]->poc < out->poc){
7567                 out = h->delayed_pic[i];
7568                 out_idx = i;
7569             }
7570
7571         for(i=out_idx; h->delayed_pic[i]; i++)
7572             h->delayed_pic[i] = h->delayed_pic[i+1];
7573
7574         if(out){
7575             *data_size = sizeof(AVFrame);
7576             *pict= *(AVFrame*)out;
7577         }
7578
7579         return 0;
7580     }
7581
7582     if(h->is_avc && !h->got_avcC) {
7583         int i, cnt, nalsize;
7584         unsigned char *p = avctx->extradata;
7585         if(avctx->extradata_size < 7) {
7586             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7587             return -1;
7588         }
7589         if(*p != 1) {
7590             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7591             return -1;
7592         }
7593         /* sps and pps in the avcC always have length coded with 2 bytes,
7594            so put a fake nal_length_size = 2 while parsing them */
7595         h->nal_length_size = 2;
7596         // Decode sps from avcC
7597         cnt = *(p+5) & 0x1f; // Number of sps
7598         p += 6;
7599         for (i = 0; i < cnt; i++) {
7600             nalsize = AV_RB16(p) + 2;
7601             if(decode_nal_units(h, p, nalsize) < 0) {
7602                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7603                 return -1;
7604             }
7605             p += nalsize;
7606         }
7607         // Decode pps from avcC
7608         cnt = *(p++); // Number of pps
7609         for (i = 0; i < cnt; i++) {
7610             nalsize = AV_RB16(p) + 2;
7611             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7612                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7613                 return -1;
7614             }
7615             p += nalsize;
7616         }
7617         // Now store right nal length size, that will be use to parse all other nals
7618         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7619         // Do not reparse avcC
7620         h->got_avcC = 1;
7621     }
7622
7623     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7624         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7625             return -1;
7626         h->got_avcC = 1;
7627     }
7628
7629     buf_index=decode_nal_units(h, buf, buf_size);
7630     if(buf_index < 0)
7631         return -1;
7632
7633     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7634         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7635         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7636         return -1;
7637     }
7638
7639     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7640         Picture *out = s->current_picture_ptr;
7641         Picture *cur = s->current_picture_ptr;
7642         int i, pics, cross_idr, out_of_order, out_idx;
7643
7644         s->mb_y= 0;
7645
7646         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7647         s->current_picture_ptr->pict_type= s->pict_type;
7648
7649         if(!s->dropable) {
7650             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7651             h->prev_poc_msb= h->poc_msb;
7652             h->prev_poc_lsb= h->poc_lsb;
7653         }
7654         h->prev_frame_num_offset= h->frame_num_offset;
7655         h->prev_frame_num= h->frame_num;
7656
7657         /*
7658          * FIXME: Error handling code does not seem to support interlaced
7659          * when slices span multiple rows
7660          * The ff_er_add_slice calls don't work right for bottom
7661          * fields; they cause massive erroneous error concealing
7662          * Error marking covers both fields (top and bottom).
7663          * This causes a mismatched s->error_count
7664          * and a bad error table. Further, the error count goes to
7665          * INT_MAX when called for bottom field, because mb_y is
7666          * past end by one (callers fault) and resync_mb_y != 0
7667          * causes problems for the first MB line, too.
7668          */
7669         if (!FIELD_PICTURE)
7670             ff_er_frame_end(s);
7671
7672         MPV_frame_end(s);
7673
7674         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7675             /* Wait for second field. */
7676             *data_size = 0;
7677
7678         } else {
7679             cur->repeat_pict = 0;
7680
7681             /* Signal interlacing information externally. */
7682             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7683             if(h->sps.pic_struct_present_flag){
7684                 switch (h->sei_pic_struct)
7685                 {
7686                 case SEI_PIC_STRUCT_FRAME:
7687                     cur->interlaced_frame = 0;
7688                     break;
7689                 case SEI_PIC_STRUCT_TOP_FIELD:
7690                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7691                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7692                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7693                     cur->interlaced_frame = 1;
7694                     break;
7695                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7696                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7697                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7698                     // From these hints, let the applications decide if they apply deinterlacing.
7699                     cur->repeat_pict = 1;
7700                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7701                     break;
7702                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7703                     // Force progressive here, as doubling interlaced frame is a bad idea.
7704                     cur->interlaced_frame = 0;
7705                     cur->repeat_pict = 2;
7706                     break;
7707                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7708                     cur->interlaced_frame = 0;
7709                     cur->repeat_pict = 4;
7710                     break;
7711                 }
7712             }else{
7713                 /* Derive interlacing flag from used decoding process. */
7714                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7715             }
7716
7717             if (cur->field_poc[0] != cur->field_poc[1]){
7718                 /* Derive top_field_first from field pocs. */
7719                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7720             }else{
7721                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7722                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7723                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7724                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7725                         cur->top_field_first = 1;
7726                     else
7727                         cur->top_field_first = 0;
7728                 }else{
7729                     /* Most likely progressive */
7730                     cur->top_field_first = 0;
7731                 }
7732             }
7733
7734         //FIXME do something with unavailable reference frames
7735
7736             /* Sort B-frames into display order */
7737
7738             if(h->sps.bitstream_restriction_flag
7739                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7740                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7741                 s->low_delay = 0;
7742             }
7743
7744             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7745                && !h->sps.bitstream_restriction_flag){
7746                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7747                 s->low_delay= 0;
7748             }
7749
7750             pics = 0;
7751             while(h->delayed_pic[pics]) pics++;
7752
7753             assert(pics <= MAX_DELAYED_PIC_COUNT);
7754
7755             h->delayed_pic[pics++] = cur;
7756             if(cur->reference == 0)
7757                 cur->reference = DELAYED_PIC_REF;
7758
7759             out = h->delayed_pic[0];
7760             out_idx = 0;
7761             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7762                 if(h->delayed_pic[i]->poc < out->poc){
7763                     out = h->delayed_pic[i];
7764                     out_idx = i;
7765                 }
7766             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7767
7768             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7769
7770             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7771                 { }
7772             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7773                || (s->low_delay &&
7774                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7775                  || cur->pict_type == FF_B_TYPE)))
7776             {
7777                 s->low_delay = 0;
7778                 s->avctx->has_b_frames++;
7779             }
7780
7781             if(out_of_order || pics > s->avctx->has_b_frames){
7782                 out->reference &= ~DELAYED_PIC_REF;
7783                 for(i=out_idx; h->delayed_pic[i]; i++)
7784                     h->delayed_pic[i] = h->delayed_pic[i+1];
7785             }
7786             if(!out_of_order && pics > s->avctx->has_b_frames){
7787                 *data_size = sizeof(AVFrame);
7788
7789                 h->outputed_poc = out->poc;
7790                 *pict= *(AVFrame*)out;
7791             }else{
7792                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7793             }
7794         }
7795     }
7796
7797     assert(pict->data[0] || !*data_size);
7798     ff_print_debug_info(s, pict);
7799 //printf("out %d\n", (int)pict->data[0]);
7800 #if 0 //?
7801
7802     /* Return the Picture timestamp as the frame number */
7803     /* we subtract 1 because it is added on utils.c     */
7804     avctx->frame_number = s->picture_number - 1;
7805 #endif
7806     return get_consumed_bytes(s, buf_index, buf_size);
7807 }
/* NOTE: everything up to the matching #endif is compiled out. */
7808 #if 0
     /**
      * Fill h->mb_avail[] for the current macroblock (s->mb_x, s->mb_y).
      * Entries 0..2 describe the three macroblocks in the row above
      * (mb_x-1, mb_x, mb_x+1) and entry 3 the one to the left; each is set
      * only if that macroblock exists and h->slice_table places it in the
      * slice h->slice_num. Entries 4 and 5 are the constants 1 and 0.
      */
7809 static inline void fill_mb_avail(H264Context *h){
7810     MpegEncContext * const s = &h->s;
7811     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7812
         /* the first row has no macroblocks above it */
7813     if(s->mb_y){
7814         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7815         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7816         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7817     }else{
7818         h->mb_avail[0]=
7819         h->mb_avail[1]=
7820         h->mb_avail[2]= 0;
7821     }
7822     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7823     h->mb_avail[4]= 1; //FIXME move out
7824     h->mb_avail[5]= 0; //FIXME move out
7825 }
7826 #endif
7827
7828 #ifdef TEST
7829 #undef printf
7830 #undef random
7831 #define COUNT 8000
7832 #define SIZE (COUNT*40)
7833 int main(void){
7834     int i;
7835     uint8_t temp[SIZE];
7836     PutBitContext pb;
7837     GetBitContext gb;
7838 //    int int_temp[10000];
7839     DSPContext dsp;
7840     AVCodecContext avctx;
7841
7842     dsputil_init(&dsp, &avctx);
7843
7844     init_put_bits(&pb, temp, SIZE);
7845     printf("testing unsigned exp golomb\n");
7846     for(i=0; i<COUNT; i++){
7847         START_TIMER
7848         set_ue_golomb(&pb, i);
7849         STOP_TIMER("set_ue_golomb");
7850     }
7851     flush_put_bits(&pb);
7852
7853     init_get_bits(&gb, temp, 8*SIZE);
7854     for(i=0; i<COUNT; i++){
7855         int j, s;
7856
7857         s= show_bits(&gb, 24);
7858
7859         START_TIMER
7860         j= get_ue_golomb(&gb);
7861         if(j != i){
7862             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7863 //            return -1;
7864         }
7865         STOP_TIMER("get_ue_golomb");
7866     }
7867
7868
7869     init_put_bits(&pb, temp, SIZE);
7870     printf("testing signed exp golomb\n");
7871     for(i=0; i<COUNT; i++){
7872         START_TIMER
7873         set_se_golomb(&pb, i - COUNT/2);
7874         STOP_TIMER("set_se_golomb");
7875     }
7876     flush_put_bits(&pb);
7877
7878     init_get_bits(&gb, temp, 8*SIZE);
7879     for(i=0; i<COUNT; i++){
7880         int j, s;
7881
7882         s= show_bits(&gb, 24);
7883
7884         START_TIMER
7885         j= get_se_golomb(&gb);
7886         if(j != i - COUNT/2){
7887             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7888 //            return -1;
7889         }
7890         STOP_TIMER("get_se_golomb");
7891     }
7892
7893 #if 0
7894     printf("testing 4x4 (I)DCT\n");
7895
7896     DCTELEM block[16];
7897     uint8_t src[16], ref[16];
7898     uint64_t error= 0, max_error=0;
7899
7900     for(i=0; i<COUNT; i++){
7901         int j;
7902 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7903         for(j=0; j<16; j++){
7904             ref[j]= random()%255;
7905             src[j]= random()%255;
7906         }
7907
7908         h264_diff_dct_c(block, src, ref, 4);
7909
7910         //normalize
7911         for(j=0; j<16; j++){
7912 //            printf("%d ", block[j]);
7913             block[j]= block[j]*4;
7914             if(j&1) block[j]= (block[j]*4 + 2)/5;
7915             if(j&4) block[j]= (block[j]*4 + 2)/5;
7916         }
7917 //        printf("\n");
7918
7919         s->dsp.h264_idct_add(ref, block, 4);
7920 /*        for(j=0; j<16; j++){
7921             printf("%d ", ref[j]);
7922         }
7923         printf("\n");*/
7924
7925         for(j=0; j<16; j++){
7926             int diff= FFABS(src[j] - ref[j]);
7927
7928             error+= diff*diff;
7929             max_error= FFMAX(max_error, diff);
7930         }
7931     }
7932     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7933     printf("testing quantizer\n");
7934     for(qp=0; qp<52; qp++){
7935         for(i=0; i<16; i++)
7936             src1_block[i]= src2_block[i]= random()%255;
7937
7938     }
7939     printf("Testing NAL layer\n");
7940
7941     uint8_t bitstream[COUNT];
7942     uint8_t nal[COUNT*2];
7943     H264Context h;
7944     memset(&h, 0, sizeof(H264Context));
7945
7946     for(i=0; i<COUNT; i++){
7947         int zeros= i;
7948         int nal_length;
7949         int consumed;
7950         int out_length;
7951         uint8_t *out;
7952         int j;
7953
7954         for(j=0; j<COUNT; j++){
7955             bitstream[j]= (random() % 255) + 1;
7956         }
7957
7958         for(j=0; j<zeros; j++){
7959             int pos= random() % COUNT;
7960             while(bitstream[pos] == 0){
7961                 pos++;
7962                 pos %= COUNT;
7963             }
7964             bitstream[pos]=0;
7965         }
7966
7967         START_TIMER
7968
7969         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7970         if(nal_length<0){
7971             printf("encoding failed\n");
7972             return -1;
7973         }
7974
7975         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7976
7977         STOP_TIMER("NAL")
7978
7979         if(out_length != COUNT){
7980             printf("incorrect length %d %d\n", out_length, COUNT);
7981             return -1;
7982         }
7983
7984         if(consumed != nal_length){
7985             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7986             return -1;
7987         }
7988
7989         if(memcmp(bitstream, out, COUNT)){
7990             printf("mismatch\n");
7991             return -1;
7992         }
7993     }
7994 #endif
7995
7996     printf("Testing RBSP\n");
7997
7998
7999     return 0;
8000 }
8001 #endif /* TEST */
8002
8003
8004 static av_cold int decode_end(AVCodecContext *avctx)
8005 {
8006     H264Context *h = avctx->priv_data;
8007     MpegEncContext *s = &h->s;
8008     int i;
8009
8010     av_freep(&h->rbsp_buffer[0]);
8011     av_freep(&h->rbsp_buffer[1]);
8012     free_tables(h); //FIXME cleanup init stuff perhaps
8013
8014     for(i = 0; i < MAX_SPS_COUNT; i++)
8015         av_freep(h->sps_buffers + i);
8016
8017     for(i = 0; i < MAX_PPS_COUNT; i++)
8018         av_freep(h->pps_buffers + i);
8019
8020     MPV_common_end(s);
8021
8022 //    memset(h, 0, sizeof(H264Context));
8023
8024     return 0;
8025 }
8026
8027
8028 AVCodec h264_decoder = {
8029     "h264",
8030     CODEC_TYPE_VIDEO,
8031     CODEC_ID_H264,
8032     sizeof(H264Context),
8033     decode_init,
8034     NULL,
8035     decode_end,
8036     decode_frame,
8037     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8038     .flush= flush_dpb,
8039     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8040 };
8041
8042 #include "svq3.c"