git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values where changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
1496 #if 0
1497 /**
1498  * DCT transforms the 16 dc values.
1499  * @param qp quantization parameter ??? FIXME
1500  */
1501 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1502 //    const int qmul= dequant_coeff[qp][0];
1503     int i;
1504     int temp[16]; //FIXME check if this is a good idea
1505     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1506     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1507
1508     for(i=0; i<4; i++){
1509         const int offset= y_offset[i];
1510         const int z0= block[offset+stride*0] + block[offset+stride*4];
1511         const int z1= block[offset+stride*0] - block[offset+stride*4];
1512         const int z2= block[offset+stride*1] - block[offset+stride*5];
1513         const int z3= block[offset+stride*1] + block[offset+stride*5];
1514
1515         temp[4*i+0]= z0+z3;
1516         temp[4*i+1]= z1+z2;
1517         temp[4*i+2]= z1-z2;
1518         temp[4*i+3]= z0-z3;
1519     }
1520
1521     for(i=0; i<4; i++){
1522         const int offset= x_offset[i];
1523         const int z0= temp[4*0+i] + temp[4*2+i];
1524         const int z1= temp[4*0+i] - temp[4*2+i];
1525         const int z2= temp[4*1+i] - temp[4*3+i];
1526         const int z3= temp[4*1+i] + temp[4*3+i];
1527
1528         block[stride*0 +offset]= (z0 + z3)>>1;
1529         block[stride*2 +offset]= (z1 + z2)>>1;
1530         block[stride*8 +offset]= (z1 - z2)>>1;
1531         block[stride*10+offset]= (z0 - z3)>>1;
1532     }
1533 }
1534 #endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
1560 #if 0
1561 static void chroma_dc_dct_c(DCTELEM *block){
1562     const int stride= 16*2;
1563     const int xStride= 16;
1564     int a,b,c,d,e;
1565
1566     a= block[stride*0 + xStride*0];
1567     b= block[stride*0 + xStride*1];
1568     c= block[stride*1 + xStride*0];
1569     d= block[stride*1 + xStride*1];
1570
1571     e= a-b;
1572     a= a+b;
1573     b= c-d;
1574     c= c+d;
1575
1576     block[stride*0 + xStride*0]= (a+c);
1577     block[stride*0 + xStride*1]= (e+b);
1578     block[stride*1 + xStride*0]= (a-c);
1579     block[stride*1 + xStride*1]= (e-b);
1580 }
1581 #endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593     int i;
1594     const int * const quant_table= quant_coeff[qscale];
1595     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597     const unsigned int threshold2= (threshold1<<1);
1598     int last_non_zero;
1599
1600     if(separate_dc){
1601         if(qscale<=18){
1602             //avoid overflows
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_coeff[qscale+18][0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }else{
1621             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623             const unsigned int dc_threshold2= (dc_threshold1<<1);
1624
1625             int level= block[0]*quant_table[0];
1626             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627                 if(level>0){
1628                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629                     block[0]= level;
1630                 }else{
1631                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632                     block[0]= -level;
1633                 }
1634 //                last_non_zero = i;
1635             }else{
1636                 block[0]=0;
1637             }
1638         }
1639         last_non_zero= 0;
1640         i=1;
1641     }else{
1642         last_non_zero= -1;
1643         i=0;
1644     }
1645
1646     for(; i<16; i++){
1647         const int j= scantable[i];
1648         int level= block[j]*quant_table[j];
1649
1650 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1651 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652         if(((unsigned)(level+threshold1))>threshold2){
1653             if(level>0){
1654                 level= (bias + level)>>QUANT_SHIFT;
1655                 block[j]= level;
1656             }else{
1657                 level= (bias - level)>>QUANT_SHIFT;
1658                 block[j]= -level;
1659             }
1660             last_non_zero = i;
1661         }else{
1662             block[j]=0;
1663         }
1664     }
1665
1666     return last_non_zero;
1667 }
1668
1669 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1670                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1671                            int src_x_offset, int src_y_offset,
1672                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1673     MpegEncContext * const s = &h->s;
1674     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1675     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1676     const int luma_xy= (mx&3) + ((my&3)<<2);
1677     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1678     uint8_t * src_cb, * src_cr;
1679     int extra_width= h->emu_edge_width;
1680     int extra_height= h->emu_edge_height;
1681     int emu=0;
1682     const int full_mx= mx>>2;
1683     const int full_my= my>>2;
1684     const int pic_width  = 16*s->mb_width;
1685     const int pic_height = 16*s->mb_height >> MB_FIELD;
1686
1687     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1688         return;
1689
1690     if(mx&7) extra_width -= 3;
1691     if(my&7) extra_height -= 3;
1692
1693     if(   full_mx < 0-extra_width
1694        || full_my < 0-extra_height
1695        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1696        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1697         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1698             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1699         emu=1;
1700     }
1701
1702     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1703     if(!square){
1704         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1705     }
1706
1707     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1708
1709     if(MB_FIELD){
1710         // chroma offset when predicting from a field of opposite parity
1711         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1712         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1713     }
1714     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1715     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1716
1717     if(emu){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1719             src_cb= s->edge_emu_buffer;
1720     }
1721     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1722
1723     if(emu){
1724         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1725             src_cr= s->edge_emu_buffer;
1726     }
1727     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1728 }
1729
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732                            int x_offset, int y_offset,
1733                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735                            int list0, int list1){
1736     MpegEncContext * const s = &h->s;
1737     qpel_mc_func *qpix_op=  qpix_put;
1738     h264_chroma_mc_func chroma_op= chroma_put;
1739
1740     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1741     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1742     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     x_offset += 8*s->mb_x;
1744     y_offset += 8*(s->mb_y >> MB_FIELD);
1745
1746     if(list0){
1747         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750                            qpix_op, chroma_op);
1751
1752         qpix_op=  qpix_avg;
1753         chroma_op= chroma_avg;
1754     }
1755
1756     if(list1){
1757         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760                            qpix_op, chroma_op);
1761     }
1762 }
1763
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766                            int x_offset, int y_offset,
1767                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770                            int list0, int list1){
1771     MpegEncContext * const s = &h->s;
1772
1773     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1774     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1775     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1776     x_offset += 8*s->mb_x;
1777     y_offset += 8*(s->mb_y >> MB_FIELD);
1778
1779     if(list0 && list1){
1780         /* don't optimize for luma-only case, since B-frames usually
1781          * use implicit weights => chroma too. */
1782         uint8_t *tmp_cb = s->obmc_scratchpad;
1783         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785         int refn0 = h->ref_cache[0][ scan8[n] ];
1786         int refn1 = h->ref_cache[1][ scan8[n] ];
1787
1788         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789                     dest_y, dest_cb, dest_cr,
1790                     x_offset, y_offset, qpix_put, chroma_put);
1791         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792                     tmp_y, tmp_cb, tmp_cr,
1793                     x_offset, y_offset, qpix_put, chroma_put);
1794
1795         if(h->use_weight == 2){
1796             int weight0 = h->implicit_weight[refn0][refn1];
1797             int weight1 = 64 - weight0;
1798             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1799             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801         }else{
1802             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1811         }
1812     }else{
1813         int list = list1 ? 1 : 0;
1814         int refn = h->ref_cache[list][ scan8[n] ];
1815         Picture *ref= &h->ref_list[list][refn];
1816         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818                     qpix_put, chroma_put);
1819
1820         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822         if(h->use_weight_chroma){
1823             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1827         }
1828     }
1829 }
1830
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833                            int x_offset, int y_offset,
1834                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837                            int list0, int list1){
1838     if((h->use_weight==2 && list0 && list1
1839         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840        || h->use_weight==1)
1841         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842                          x_offset, y_offset, qpix_put, chroma_put,
1843                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844     else
1845         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1847 }
1848
1849 static inline void prefetch_motion(H264Context *h, int list){
1850     /* fetch pixels for estimated mv 4 macroblocks ahead
1851      * optimized for 64byte cache lines */
1852     MpegEncContext * const s = &h->s;
1853     const int refn = h->ref_cache[list][scan8[0]];
1854     if(refn >= 0){
1855         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857         uint8_t **src= h->ref_list[list][refn].data;
1858         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1862     }
1863 }
1864
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869     MpegEncContext * const s = &h->s;
1870     const int mb_xy= h->mb_xy;
1871     const int mb_type= s->current_picture.mb_type[mb_xy];
1872
1873     assert(IS_INTER(mb_type));
1874
1875     prefetch_motion(h, 0);
1876
1877     if(IS_16X16(mb_type)){
1878         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880                 &weight_op[0], &weight_avg[0],
1881                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882     }else if(IS_16X8(mb_type)){
1883         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885                 &weight_op[1], &weight_avg[1],
1886                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889                 &weight_op[1], &weight_avg[1],
1890                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891     }else if(IS_8X16(mb_type)){
1892         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894                 &weight_op[2], &weight_avg[2],
1895                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898                 &weight_op[2], &weight_avg[2],
1899                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900     }else{
1901         int i;
1902
1903         assert(IS_8X8(mb_type));
1904
1905         for(i=0; i<4; i++){
1906             const int sub_mb_type= h->sub_mb_type[i];
1907             const int n= 4*i;
1908             int x_offset= (i&1)<<2;
1909             int y_offset= (i&2)<<1;
1910
1911             if(IS_SUB_8X8(sub_mb_type)){
1912                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914                     &weight_op[3], &weight_avg[3],
1915                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916             }else if(IS_SUB_8X4(sub_mb_type)){
1917                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919                     &weight_op[4], &weight_avg[4],
1920                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923                     &weight_op[4], &weight_avg[4],
1924                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925             }else if(IS_SUB_4X8(sub_mb_type)){
1926                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928                     &weight_op[5], &weight_avg[5],
1929                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932                     &weight_op[5], &weight_avg[5],
1933                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934             }else{
1935                 int j;
1936                 assert(IS_SUB_4X4(sub_mb_type));
1937                 for(j=0; j<4; j++){
1938                     int sub_x_offset= x_offset + 2*(j&1);
1939                     int sub_y_offset= y_offset +   (j&2);
1940                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942                         &weight_op[6], &weight_avg[6],
1943                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1944                 }
1945             }
1946         }
1947     }
1948
1949     prefetch_motion(h, 1);
1950 }
1951
1952 static av_cold void decode_init_vlc(void){
1953     static int done = 0;
1954
1955     if (!done) {
1956         int i;
1957         int offset;
1958         done = 1;
1959
1960         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1961         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1962         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1963                  &chroma_dc_coeff_token_len [0], 1, 1,
1964                  &chroma_dc_coeff_token_bits[0], 1, 1,
1965                  INIT_VLC_USE_NEW_STATIC);
1966
1967         offset = 0;
1968         for(i=0; i<4; i++){
1969             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1970             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1971             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1972                      &coeff_token_len [i][0], 1, 1,
1973                      &coeff_token_bits[i][0], 1, 1,
1974                      INIT_VLC_USE_NEW_STATIC);
1975             offset += coeff_token_vlc_tables_size[i];
1976         }
1977         /*
1978          * This is a one time safety check to make sure that
1979          * the packed static coeff_token_vlc table sizes
1980          * were initialized correctly.
1981          */
1982         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1983
1984         for(i=0; i<3; i++){
1985             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1986             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1987             init_vlc(&chroma_dc_total_zeros_vlc[i],
1988                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1989                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1990                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1991                      INIT_VLC_USE_NEW_STATIC);
1992         }
1993         for(i=0; i<15; i++){
1994             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1995             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1996             init_vlc(&total_zeros_vlc[i],
1997                      TOTAL_ZEROS_VLC_BITS, 16,
1998                      &total_zeros_len [i][0], 1, 1,
1999                      &total_zeros_bits[i][0], 1, 1,
2000                      INIT_VLC_USE_NEW_STATIC);
2001         }
2002
2003         for(i=0; i<6; i++){
2004             run_vlc[i].table = run_vlc_tables[i];
2005             run_vlc[i].table_allocated = run_vlc_tables_size;
2006             init_vlc(&run_vlc[i],
2007                      RUN_VLC_BITS, 7,
2008                      &run_len [i][0], 1, 1,
2009                      &run_bits[i][0], 1, 1,
2010                      INIT_VLC_USE_NEW_STATIC);
2011         }
2012         run7_vlc.table = run7_vlc_table,
2013         run7_vlc.table_allocated = run7_vlc_table_size;
2014         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2015                  &run_len [6][0], 1, 1,
2016                  &run_bits[6][0], 1, 1,
2017                  INIT_VLC_USE_NEW_STATIC);
2018     }
2019 }
2020
2021 static void free_tables(H264Context *h){
2022     int i;
2023     H264Context *hx;
2024     av_freep(&h->intra4x4_pred_mode);
2025     av_freep(&h->chroma_pred_mode_table);
2026     av_freep(&h->cbp_table);
2027     av_freep(&h->mvd_table[0]);
2028     av_freep(&h->mvd_table[1]);
2029     av_freep(&h->direct_table);
2030     av_freep(&h->non_zero_count);
2031     av_freep(&h->slice_table_base);
2032     h->slice_table= NULL;
2033
2034     av_freep(&h->mb2b_xy);
2035     av_freep(&h->mb2b8_xy);
2036
2037     for(i = 0; i < h->s.avctx->thread_count; i++) {
2038         hx = h->thread_context[i];
2039         if(!hx) continue;
2040         av_freep(&hx->top_borders[1]);
2041         av_freep(&hx->top_borders[0]);
2042         av_freep(&hx->s.obmc_scratchpad);
2043     }
2044 }
2045
2046 static void init_dequant8_coeff_table(H264Context *h){
2047     int i,q,x;
2048     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2049     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2050     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2051
2052     for(i=0; i<2; i++ ){
2053         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2054             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2055             break;
2056         }
2057
2058         for(q=0; q<52; q++){
2059             int shift = div6[q];
2060             int idx = rem6[q];
2061             for(x=0; x<64; x++)
2062                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2063                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2064                     h->pps.scaling_matrix8[i][x]) << shift;
2065         }
2066     }
2067 }
2068
2069 static void init_dequant4_coeff_table(H264Context *h){
2070     int i,j,q,x;
2071     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2072     for(i=0; i<6; i++ ){
2073         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2074         for(j=0; j<i; j++){
2075             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2076                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2077                 break;
2078             }
2079         }
2080         if(j<i)
2081             continue;
2082
2083         for(q=0; q<52; q++){
2084             int shift = div6[q] + 2;
2085             int idx = rem6[q];
2086             for(x=0; x<16; x++)
2087                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2088                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2089                     h->pps.scaling_matrix4[i][x]) << shift;
2090         }
2091     }
2092 }
2093
2094 static void init_dequant_tables(H264Context *h){
2095     int i,x;
2096     init_dequant4_coeff_table(h);
2097     if(h->pps.transform_8x8_mode)
2098         init_dequant8_coeff_table(h);
2099     if(h->sps.transform_bypass){
2100         for(i=0; i<6; i++)
2101             for(x=0; x<16; x++)
2102                 h->dequant4_coeff[i][0][x] = 1<<6;
2103         if(h->pps.transform_8x8_mode)
2104             for(i=0; i<2; i++)
2105                 for(x=0; x<64; x++)
2106                     h->dequant8_coeff[i][0][x] = 1<<6;
2107     }
2108 }
2109
2110
2111 /**
2112  * allocates tables.
2113  * needs width/height
2114  */
2115 static int alloc_tables(H264Context *h){
2116     MpegEncContext * const s = &h->s;
2117     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2118     int x,y;
2119
2120     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2121
2122     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2123     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2124     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2125
2126     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2127     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2128     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2129     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2130
2131     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2132     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2133
2134     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2135     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136     for(y=0; y<s->mb_height; y++){
2137         for(x=0; x<s->mb_width; x++){
2138             const int mb_xy= x + y*s->mb_stride;
2139             const int b_xy = 4*x + 4*y*h->b_stride;
2140             const int b8_xy= 2*x + 2*y*h->b8_stride;
2141
2142             h->mb2b_xy [mb_xy]= b_xy;
2143             h->mb2b8_xy[mb_xy]= b8_xy;
2144         }
2145     }
2146
2147     s->obmc_scratchpad = NULL;
2148
2149     if(!h->dequant4_coeff[0])
2150         init_dequant_tables(h);
2151
2152     return 0;
2153 fail:
2154     free_tables(h);
2155     return -1;
2156 }
2157
2158 /**
2159  * Mimic alloc_tables(), but for every context thread.
2160  */
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2163     dst->non_zero_count           = src->non_zero_count;
2164     dst->slice_table              = src->slice_table;
2165     dst->cbp_table                = src->cbp_table;
2166     dst->mb2b_xy                  = src->mb2b_xy;
2167     dst->mb2b8_xy                 = src->mb2b8_xy;
2168     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2169     dst->mvd_table[0]             = src->mvd_table[0];
2170     dst->mvd_table[1]             = src->mvd_table[1];
2171     dst->direct_table             = src->direct_table;
2172
2173     dst->s.obmc_scratchpad = NULL;
2174     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2175 }
2176
2177 /**
2178  * Init context
2179  * Allocate buffers which are not shared amongst multiple threads.
2180  */
2181 static int context_init(H264Context *h){
2182     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2183     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2184
2185     return 0;
2186 fail:
2187     return -1; // free_tables will clean up for us
2188 }
2189
2190 static av_cold void common_init(H264Context *h){
2191     MpegEncContext * const s = &h->s;
2192
2193     s->width = s->avctx->width;
2194     s->height = s->avctx->height;
2195     s->codec_id= s->avctx->codec->id;
2196
2197     ff_h264_pred_init(&h->hpc, s->codec_id);
2198
2199     h->dequant_coeff_pps= -1;
2200     s->unrestricted_mv=1;
2201     s->decode=1; //FIXME
2202
2203     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2204     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2205 }
2206
2207 static av_cold int decode_init(AVCodecContext *avctx){
2208     H264Context *h= avctx->priv_data;
2209     MpegEncContext * const s = &h->s;
2210
2211     MPV_decode_defaults(s);
2212
2213     s->avctx = avctx;
2214     common_init(h);
2215
2216     s->out_format = FMT_H264;
2217     s->workaround_bugs= avctx->workaround_bugs;
2218
2219     // set defaults
2220 //    s->decode_mb= ff_h263_decode_mb;
2221     s->quarter_sample = 1;
2222     s->low_delay= 1;
2223
2224     if(avctx->codec_id == CODEC_ID_SVQ3)
2225         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2226     else
2227         avctx->pix_fmt= PIX_FMT_YUV420P;
2228
2229     decode_init_vlc();
2230
2231     if(avctx->extradata_size > 0 && avctx->extradata &&
2232        *(char *)avctx->extradata == 1){
2233         h->is_avc = 1;
2234         h->got_avcC = 0;
2235     } else {
2236         h->is_avc = 0;
2237     }
2238
2239     h->thread_context[0] = h;
2240     h->outputed_poc = INT_MIN;
2241     h->prev_poc_msb= 1<<16;
2242     return 0;
2243 }
2244
2245 static int frame_start(H264Context *h){
2246     MpegEncContext * const s = &h->s;
2247     int i;
2248
2249     if(MPV_frame_start(s, s->avctx) < 0)
2250         return -1;
2251     ff_er_frame_start(s);
2252     /*
2253      * MPV_frame_start uses pict_type to derive key_frame.
2254      * This is incorrect for H.264; IDR markings must be used.
2255      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2256      * See decode_nal_units().
2257      */
2258     s->current_picture_ptr->key_frame= 0;
2259
2260     assert(s->linesize && s->uvlinesize);
2261
2262     for(i=0; i<16; i++){
2263         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2265     }
2266     for(i=0; i<4; i++){
2267         h->block_offset[16+i]=
2268         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269         h->block_offset[24+16+i]=
2270         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2271     }
2272
2273     /* can't be in alloc_tables because linesize isn't known there.
2274      * FIXME: redo bipred weight to not require extra buffer? */
2275     for(i = 0; i < s->avctx->thread_count; i++)
2276         if(!h->thread_context[i]->s.obmc_scratchpad)
2277             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2278
2279     /* some macroblocks will be accessed before they're available */
2280     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2282
2283 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2284
2285     // We mark the current picture as non-reference after allocating it, so
2286     // that if we break out due to an error it can be released automatically
2287     // in the next MPV_frame_start().
2288     // SVQ3 as well as most other codecs have only last/next/current and thus
2289     // get released even with set reference, besides SVQ3 and others do not
2290     // mark frames as reference later "naturally".
2291     if(s->codec_id != CODEC_ID_SVQ3)
2292         s->current_picture_ptr->reference= 0;
2293
2294     s->current_picture_ptr->field_poc[0]=
2295     s->current_picture_ptr->field_poc[1]= INT_MAX;
2296     assert(s->current_picture_ptr->long_ref==0);
2297
2298     return 0;
2299 }
2300
2301 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2302     MpegEncContext * const s = &h->s;
2303     int i;
2304     int step    = 1;
2305     int offset  = 1;
2306     int uvoffset= 1;
2307     int top_idx = 1;
2308     int skiplast= 0;
2309
2310     src_y  -=   linesize;
2311     src_cb -= uvlinesize;
2312     src_cr -= uvlinesize;
2313
2314     if(!simple && FRAME_MBAFF){
2315         if(s->mb_y&1){
2316             offset  = MB_MBAFF ? 1 : 17;
2317             uvoffset= MB_MBAFF ? 1 : 9;
2318             if(!MB_MBAFF){
2319                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2320                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2321                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2322                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2323                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2324                 }
2325             }
2326         }else{
2327             if(!MB_MBAFF){
2328                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2329                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2330                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2331                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2332                 }
2333                 skiplast= 1;
2334             }
2335             offset  =
2336             uvoffset=
2337             top_idx = MB_MBAFF ? 0 : 1;
2338         }
2339         step= MB_MBAFF ? 2 : 1;
2340     }
2341
2342     // There are two lines saved, the line above the the top macroblock of a pair,
2343     // and the line above the bottom macroblock
2344     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2345     for(i=1; i<17 - skiplast; i++){
2346         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2347     }
2348
2349     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2350     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2351
2352     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2353         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2354         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2355         for(i=1; i<9 - skiplast; i++){
2356             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2357             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2358         }
2359         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2360         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2361     }
2362 }
2363
2364 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2365     MpegEncContext * const s = &h->s;
2366     int temp8, i;
2367     uint64_t temp64;
2368     int deblock_left;
2369     int deblock_top;
2370     int mb_xy;
2371     int step    = 1;
2372     int offset  = 1;
2373     int uvoffset= 1;
2374     int top_idx = 1;
2375
2376     if(!simple && FRAME_MBAFF){
2377         if(s->mb_y&1){
2378             offset  = MB_MBAFF ? 1 : 17;
2379             uvoffset= MB_MBAFF ? 1 : 9;
2380         }else{
2381             offset  =
2382             uvoffset=
2383             top_idx = MB_MBAFF ? 0 : 1;
2384         }
2385         step= MB_MBAFF ? 2 : 1;
2386     }
2387
2388     if(h->deblocking_filter == 2) {
2389         mb_xy = h->mb_xy;
2390         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2391         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2392     } else {
2393         deblock_left = (s->mb_x > 0);
2394         deblock_top =  (s->mb_y > !!MB_FIELD);
2395     }
2396
2397     src_y  -=   linesize + 1;
2398     src_cb -= uvlinesize + 1;
2399     src_cr -= uvlinesize + 1;
2400
2401 #define XCHG(a,b,t,xchg)\
2402 t= a;\
2403 if(xchg)\
2404     a= b;\
2405 b= t;
2406
2407     if(deblock_left){
2408         for(i = !deblock_top; i<16; i++){
2409             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2410         }
2411         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2412     }
2413
2414     if(deblock_top){
2415         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2416         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2417         if(s->mb_x+1 < s->mb_width){
2418             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2419         }
2420     }
2421
2422     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2423         if(deblock_left){
2424             for(i = !deblock_top; i<8; i++){
2425                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2426                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2427             }
2428             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2429             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2430         }
2431         if(deblock_top){
2432             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2433             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2434         }
2435     }
2436 }
2437
2438 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2439     MpegEncContext * const s = &h->s;
2440     const int mb_x= s->mb_x;
2441     const int mb_y= s->mb_y;
2442     const int mb_xy= h->mb_xy;
2443     const int mb_type= s->current_picture.mb_type[mb_xy];
2444     uint8_t  *dest_y, *dest_cb, *dest_cr;
2445     int linesize, uvlinesize /*dct_offset*/;
2446     int i;
2447     int *block_offset = &h->block_offset[0];
2448     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2449     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2450     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2451
2452     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2453     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2454     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2455
2456     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2457     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2458
2459     if (!simple && MB_FIELD) {
2460         linesize   = h->mb_linesize   = s->linesize * 2;
2461         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2462         block_offset = &h->block_offset[24];
2463         if(mb_y&1){ //FIXME move out of this function?
2464             dest_y -= s->linesize*15;
2465             dest_cb-= s->uvlinesize*7;
2466             dest_cr-= s->uvlinesize*7;
2467         }
2468         if(FRAME_MBAFF) {
2469             int list;
2470             for(list=0; list<h->list_count; list++){
2471                 if(!USES_LIST(mb_type, list))
2472                     continue;
2473                 if(IS_16X16(mb_type)){
2474                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2475                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2476                 }else{
2477                     for(i=0; i<16; i+=4){
2478                         int ref = h->ref_cache[list][scan8[i]];
2479                         if(ref >= 0)
2480                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2481                     }
2482                 }
2483             }
2484         }
2485     } else {
2486         linesize   = h->mb_linesize   = s->linesize;
2487         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2488 //        dct_offset = s->linesize * 16;
2489     }
2490
2491     if(transform_bypass){
2492         idct_dc_add =
2493         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2494     }else if(IS_8x8DCT(mb_type)){
2495         idct_dc_add = s->dsp.h264_idct8_dc_add;
2496         idct_add = s->dsp.h264_idct8_add;
2497     }else{
2498         idct_dc_add = s->dsp.h264_idct_dc_add;
2499         idct_add = s->dsp.h264_idct_add;
2500     }
2501
2502     if (!simple && IS_INTRA_PCM(mb_type)) {
2503         for (i=0; i<16; i++) {
2504             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2505         }
2506         for (i=0; i<8; i++) {
2507             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2508             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2509         }
2510     } else {
2511         if(IS_INTRA(mb_type)){
2512             if(h->deblocking_filter)
2513                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2514
2515             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2516                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2517                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2518             }
2519
2520             if(IS_INTRA4x4(mb_type)){
2521                 if(simple || !s->encoding){
2522                     if(IS_8x8DCT(mb_type)){
2523                         for(i=0; i<16; i+=4){
2524                             uint8_t * const ptr= dest_y + block_offset[i];
2525                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2526                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2527                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2528                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2529                             if(nnz){
2530                                 if(nnz == 1 && h->mb[i*16])
2531                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2532                                 else
2533                                     idct_add(ptr, h->mb + i*16, linesize);
2534                             }
2535                         }
2536                     }else
2537                     for(i=0; i<16; i++){
2538                         uint8_t * const ptr= dest_y + block_offset[i];
2539                         uint8_t *topright;
2540                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2541                         int nnz, tr;
2542
2543                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2544                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2545                             assert(mb_y || linesize <= block_offset[i]);
2546                             if(!topright_avail){
2547                                 tr= ptr[3 - linesize]*0x01010101;
2548                                 topright= (uint8_t*) &tr;
2549                             }else
2550                                 topright= ptr + 4 - linesize;
2551                         }else
2552                             topright= NULL;
2553
2554                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2555                         nnz = h->non_zero_count_cache[ scan8[i] ];
2556                         if(nnz){
2557                             if(is_h264){
2558                                 if(nnz == 1 && h->mb[i*16])
2559                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2560                                 else
2561                                     idct_add(ptr, h->mb + i*16, linesize);
2562                             }else
2563                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2564                         }
2565                     }
2566                 }
2567             }else{
2568                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2569                 if(is_h264){
2570                     if(!transform_bypass)
2571                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2572                 }else
2573                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2574             }
2575             if(h->deblocking_filter)
2576                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2577         }else if(is_h264){
2578             hl_motion(h, dest_y, dest_cb, dest_cr,
2579                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2580                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2581                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2582         }
2583
2584
2585         if(!IS_INTRA4x4(mb_type)){
2586             if(is_h264){
2587                 if(IS_INTRA16x16(mb_type)){
2588                     for(i=0; i<16; i++){
2589                         if(h->non_zero_count_cache[ scan8[i] ])
2590                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2591                         else if(h->mb[i*16])
2592                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2593                     }
2594                 }else{
2595                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2596                     for(i=0; i<16; i+=di){
2597                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2598                         if(nnz){
2599                             if(nnz==1 && h->mb[i*16])
2600                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2601                             else
2602                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2603                         }
2604                     }
2605                 }
2606             }else{
2607                 for(i=0; i<16; i++){
2608                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2609                         uint8_t * const ptr= dest_y + block_offset[i];
2610                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2611                     }
2612                 }
2613             }
2614         }
2615
2616         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2617             uint8_t *dest[2] = {dest_cb, dest_cr};
2618             if(transform_bypass){
2619                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2620             }else{
2621                 idct_add = s->dsp.h264_idct_add;
2622                 idct_dc_add = s->dsp.h264_idct_dc_add;
2623                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2624                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2625             }
2626             if(is_h264){
2627                 for(i=16; i<16+8; i++){
2628                     if(h->non_zero_count_cache[ scan8[i] ])
2629                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2630                     else if(h->mb[i*16])
2631                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2632                 }
2633             }else{
2634                 for(i=16; i<16+8; i++){
2635                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2636                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2637                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2638                     }
2639                 }
2640             }
2641         }
2642     }
2643     if(h->deblocking_filter) {
2644         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2645         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2646         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2647         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2648         if (!simple && FRAME_MBAFF) {
2649             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2650         } else {
2651             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2652         }
2653     }
2654 }
2655
2656 /**
2657  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2658  */
2659 static void hl_decode_mb_simple(H264Context *h){
2660     hl_decode_mb_internal(h, 1);
2661 }
2662
2663 /**
2664  * Process a macroblock; this handles edge cases, such as interlacing.
2665  */
2666 static void av_noinline hl_decode_mb_complex(H264Context *h){
2667     hl_decode_mb_internal(h, 0);
2668 }
2669
2670 static void hl_decode_mb(H264Context *h){
2671     MpegEncContext * const s = &h->s;
2672     const int mb_xy= h->mb_xy;
2673     const int mb_type= s->current_picture.mb_type[mb_xy];
2674     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2675                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2676
2677     if(ENABLE_H264_ENCODER && !s->decode)
2678         return;
2679
2680     if (is_complex)
2681         hl_decode_mb_complex(h);
2682     else hl_decode_mb_simple(h);
2683 }
2684
2685 static void pic_as_field(Picture *pic, const int parity){
2686     int i;
2687     for (i = 0; i < 4; ++i) {
2688         if (parity == PICT_BOTTOM_FIELD)
2689             pic->data[i] += pic->linesize[i];
2690         pic->reference = parity;
2691         pic->linesize[i] *= 2;
2692     }
2693     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2694 }
2695
2696 static int split_field_copy(Picture *dest, Picture *src,
2697                             int parity, int id_add){
2698     int match = !!(src->reference & parity);
2699
2700     if (match) {
2701         *dest = *src;
2702         if(parity != PICT_FRAME){
2703             pic_as_field(dest, parity);
2704             dest->pic_id *= 2;
2705             dest->pic_id += id_add;
2706         }
2707     }
2708
2709     return match;
2710 }
2711
2712 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2713     int i[2]={0};
2714     int index=0;
2715
2716     while(i[0]<len || i[1]<len){
2717         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2718             i[0]++;
2719         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2720             i[1]++;
2721         if(i[0] < len){
2722             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2723             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2724         }
2725         if(i[1] < len){
2726             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2727             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2728         }
2729     }
2730
2731     return index;
2732 }
2733
2734 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2735     int i, best_poc;
2736     int out_i= 0;
2737
2738     for(;;){
2739         best_poc= dir ? INT_MIN : INT_MAX;
2740
2741         for(i=0; i<len; i++){
2742             const int poc= src[i]->poc;
2743             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2744                 best_poc= poc;
2745                 sorted[out_i]= src[i];
2746             }
2747         }
2748         if(best_poc == (dir ? INT_MIN : INT_MAX))
2749             break;
2750         limit= sorted[out_i++]->poc - dir;
2751     }
2752     return out_i;
2753 }
2754
2755 /**
2756  * fills the default_ref_list.
2757  */
2758 static int fill_default_ref_list(H264Context *h){
2759     MpegEncContext * const s = &h->s;
2760     int i, len;
2761
2762     if(h->slice_type_nos==FF_B_TYPE){
2763         Picture *sorted[32];
2764         int cur_poc, list;
2765         int lens[2];
2766
2767         if(FIELD_PICTURE)
2768             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2769         else
2770             cur_poc= s->current_picture_ptr->poc;
2771
2772         for(list= 0; list<2; list++){
2773             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2774             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2775             assert(len<=32);
2776             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2777             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2778             assert(len<=32);
2779
2780             if(len < h->ref_count[list])
2781                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2782             lens[list]= len;
2783         }
2784
2785         if(lens[0] == lens[1] && lens[1] > 1){
2786             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2787             if(i == lens[0])
2788                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2789         }
2790     }else{
2791         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2792         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2793         assert(len <= 32);
2794         if(len < h->ref_count[0])
2795             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2796     }
2797 #ifdef TRACE
2798     for (i=0; i<h->ref_count[0]; i++) {
2799         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2800     }
2801     if(h->slice_type_nos==FF_B_TYPE){
2802         for (i=0; i<h->ref_count[1]; i++) {
2803             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2804         }
2805     }
2806 #endif
2807     return 0;
2808 }
2809
2810 static void print_short_term(H264Context *h);
2811 static void print_long_term(H264Context *h);
2812
2813 /**
2814  * Extract structure information about the picture described by pic_num in
2815  * the current decoding context (frame or field). Note that pic_num is
2816  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2817  * @param pic_num picture number for which to extract structure information
2818  * @param structure one of PICT_XXX describing structure of picture
2819  *                      with pic_num
2820  * @return frame number (short term) or long term index of picture
2821  *         described by pic_num
2822  */
2823 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2824     MpegEncContext * const s = &h->s;
2825
2826     *structure = s->picture_structure;
2827     if(FIELD_PICTURE){
2828         if (!(pic_num & 1))
2829             /* opposite field */
2830             *structure ^= PICT_FRAME;
2831         pic_num >>= 1;
2832     }
2833
2834     return pic_num;
2835 }
2836
2837 static int decode_ref_pic_list_reordering(H264Context *h){
2838     MpegEncContext * const s = &h->s;
2839     int list, index, pic_structure;
2840
2841     print_short_term(h);
2842     print_long_term(h);
2843
2844     for(list=0; list<h->list_count; list++){
2845         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2846
2847         if(get_bits1(&s->gb)){
2848             int pred= h->curr_pic_num;
2849
2850             for(index=0; ; index++){
2851                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2852                 unsigned int pic_id;
2853                 int i;
2854                 Picture *ref = NULL;
2855
2856                 if(reordering_of_pic_nums_idc==3)
2857                     break;
2858
2859                 if(index >= h->ref_count[list]){
2860                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2861                     return -1;
2862                 }
2863
2864                 if(reordering_of_pic_nums_idc<3){
2865                     if(reordering_of_pic_nums_idc<2){
2866                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2867                         int frame_num;
2868
2869                         if(abs_diff_pic_num > h->max_pic_num){
2870                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2871                             return -1;
2872                         }
2873
2874                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2875                         else                                pred+= abs_diff_pic_num;
2876                         pred &= h->max_pic_num - 1;
2877
2878                         frame_num = pic_num_extract(h, pred, &pic_structure);
2879
2880                         for(i= h->short_ref_count-1; i>=0; i--){
2881                             ref = h->short_ref[i];
2882                             assert(ref->reference);
2883                             assert(!ref->long_ref);
2884                             if(
2885                                    ref->frame_num == frame_num &&
2886                                    (ref->reference & pic_structure)
2887                               )
2888                                 break;
2889                         }
2890                         if(i>=0)
2891                             ref->pic_id= pred;
2892                     }else{
2893                         int long_idx;
2894                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2895
2896                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2897
2898                         if(long_idx>31){
2899                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2900                             return -1;
2901                         }
2902                         ref = h->long_ref[long_idx];
2903                         assert(!(ref && !ref->reference));
2904                         if(ref && (ref->reference & pic_structure)){
2905                             ref->pic_id= pic_id;
2906                             assert(ref->long_ref);
2907                             i=0;
2908                         }else{
2909                             i=-1;
2910                         }
2911                     }
2912
2913                     if (i < 0) {
2914                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2915                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2916                     } else {
2917                         for(i=index; i+1<h->ref_count[list]; i++){
2918                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2919                                 break;
2920                         }
2921                         for(; i > index; i--){
2922                             h->ref_list[list][i]= h->ref_list[list][i-1];
2923                         }
2924                         h->ref_list[list][index]= *ref;
2925                         if (FIELD_PICTURE){
2926                             pic_as_field(&h->ref_list[list][index], pic_structure);
2927                         }
2928                     }
2929                 }else{
2930                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2931                     return -1;
2932                 }
2933             }
2934         }
2935     }
2936     for(list=0; list<h->list_count; list++){
2937         for(index= 0; index < h->ref_count[list]; index++){
2938             if(!h->ref_list[list][index].data[0]){
2939                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2940                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2941             }
2942         }
2943     }
2944
2945     return 0;
2946 }
2947
2948 static void fill_mbaff_ref_list(H264Context *h){
2949     int list, i, j;
2950     for(list=0; list<2; list++){ //FIXME try list_count
2951         for(i=0; i<h->ref_count[list]; i++){
2952             Picture *frame = &h->ref_list[list][i];
2953             Picture *field = &h->ref_list[list][16+2*i];
2954             field[0] = *frame;
2955             for(j=0; j<3; j++)
2956                 field[0].linesize[j] <<= 1;
2957             field[0].reference = PICT_TOP_FIELD;
2958             field[0].poc= field[0].field_poc[0];
2959             field[1] = field[0];
2960             for(j=0; j<3; j++)
2961                 field[1].data[j] += frame->linesize[j];
2962             field[1].reference = PICT_BOTTOM_FIELD;
2963             field[1].poc= field[1].field_poc[1];
2964
2965             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2966             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2967             for(j=0; j<2; j++){
2968                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2969                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2970             }
2971         }
2972     }
2973     for(j=0; j<h->ref_count[1]; j++){
2974         for(i=0; i<h->ref_count[0]; i++)
2975             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2976         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2977         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2978     }
2979 }
2980
2981 static int pred_weight_table(H264Context *h){
2982     MpegEncContext * const s = &h->s;
2983     int list, i;
2984     int luma_def, chroma_def;
2985
2986     h->use_weight= 0;
2987     h->use_weight_chroma= 0;
2988     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2989     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2990     luma_def = 1<<h->luma_log2_weight_denom;
2991     chroma_def = 1<<h->chroma_log2_weight_denom;
2992
2993     for(list=0; list<2; list++){
2994         for(i=0; i<h->ref_count[list]; i++){
2995             int luma_weight_flag, chroma_weight_flag;
2996
2997             luma_weight_flag= get_bits1(&s->gb);
2998             if(luma_weight_flag){
2999                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3000                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3001                 if(   h->luma_weight[list][i] != luma_def
3002                    || h->luma_offset[list][i] != 0)
3003                     h->use_weight= 1;
3004             }else{
3005                 h->luma_weight[list][i]= luma_def;
3006                 h->luma_offset[list][i]= 0;
3007             }
3008
3009             if(CHROMA){
3010                 chroma_weight_flag= get_bits1(&s->gb);
3011                 if(chroma_weight_flag){
3012                     int j;
3013                     for(j=0; j<2; j++){
3014                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3015                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3016                         if(   h->chroma_weight[list][i][j] != chroma_def
3017                         || h->chroma_offset[list][i][j] != 0)
3018                             h->use_weight_chroma= 1;
3019                     }
3020                 }else{
3021                     int j;
3022                     for(j=0; j<2; j++){
3023                         h->chroma_weight[list][i][j]= chroma_def;
3024                         h->chroma_offset[list][i][j]= 0;
3025                     }
3026                 }
3027             }
3028         }
3029         if(h->slice_type_nos != FF_B_TYPE) break;
3030     }
3031     h->use_weight= h->use_weight || h->use_weight_chroma;
3032     return 0;
3033 }
3034
3035 static void implicit_weight_table(H264Context *h){
3036     MpegEncContext * const s = &h->s;
3037     int ref0, ref1;
3038     int cur_poc = s->current_picture_ptr->poc;
3039
3040     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3041        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3042         h->use_weight= 0;
3043         h->use_weight_chroma= 0;
3044         return;
3045     }
3046
3047     h->use_weight= 2;
3048     h->use_weight_chroma= 2;
3049     h->luma_log2_weight_denom= 5;
3050     h->chroma_log2_weight_denom= 5;
3051
3052     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3053         int poc0 = h->ref_list[0][ref0].poc;
3054         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3055             int poc1 = h->ref_list[1][ref1].poc;
3056             int td = av_clip(poc1 - poc0, -128, 127);
3057             if(td){
3058                 int tb = av_clip(cur_poc - poc0, -128, 127);
3059                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3060                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3061                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3062                     h->implicit_weight[ref0][ref1] = 32;
3063                 else
3064                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3065             }else
3066                 h->implicit_weight[ref0][ref1] = 32;
3067         }
3068     }
3069 }
3070
3071 /**
3072  * Mark a picture as no longer needed for reference. The refmask
3073  * argument allows unreferencing of individual fields or the whole frame.
3074  * If the picture becomes entirely unreferenced, but is being held for
3075  * display purposes, it is marked as such.
3076  * @param refmask mask of fields to unreference; the mask is bitwise
3077  *                anded with the reference marking of pic
3078  * @return non-zero if pic becomes entirely unreferenced (except possibly
3079  *         for display purposes) zero if one of the fields remains in
3080  *         reference
3081  */
3082 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3083     int i;
3084     if (pic->reference &= refmask) {
3085         return 0;
3086     } else {
3087         for(i = 0; h->delayed_pic[i]; i++)
3088             if(pic == h->delayed_pic[i]){
3089                 pic->reference=DELAYED_PIC_REF;
3090                 break;
3091             }
3092         return 1;
3093     }
3094 }
3095
3096 /**
3097  * instantaneous decoder refresh.
3098  */
3099 static void idr(H264Context *h){
3100     int i;
3101
3102     for(i=0; i<16; i++){
3103         remove_long(h, i, 0);
3104     }
3105     assert(h->long_ref_count==0);
3106
3107     for(i=0; i<h->short_ref_count; i++){
3108         unreference_pic(h, h->short_ref[i], 0);
3109         h->short_ref[i]= NULL;
3110     }
3111     h->short_ref_count=0;
3112     h->prev_frame_num= 0;
3113     h->prev_frame_num_offset= 0;
3114     h->prev_poc_msb=
3115     h->prev_poc_lsb= 0;
3116 }
3117
3118 /* forget old pics after a seek */
3119 static void flush_dpb(AVCodecContext *avctx){
3120     H264Context *h= avctx->priv_data;
3121     int i;
3122     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3123         if(h->delayed_pic[i])
3124             h->delayed_pic[i]->reference= 0;
3125         h->delayed_pic[i]= NULL;
3126     }
3127     h->outputed_poc= INT_MIN;
3128     idr(h);
3129     if(h->s.current_picture_ptr)
3130         h->s.current_picture_ptr->reference= 0;
3131     h->s.first_field= 0;
3132     ff_mpeg_flush(avctx);
3133 }
3134
3135 /**
3136  * Find a Picture in the short term reference list by frame number.
3137  * @param frame_num frame number to search for
3138  * @param idx the index into h->short_ref where returned picture is found
3139  *            undefined if no picture found.
3140  * @return pointer to the found picture, or NULL if no pic with the provided
3141  *                 frame number is found
3142  */
3143 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3144     MpegEncContext * const s = &h->s;
3145     int i;
3146
3147     for(i=0; i<h->short_ref_count; i++){
3148         Picture *pic= h->short_ref[i];
3149         if(s->avctx->debug&FF_DEBUG_MMCO)
3150             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3151         if(pic->frame_num == frame_num) {
3152             *idx = i;
3153             return pic;
3154         }
3155     }
3156     return NULL;
3157 }
3158
3159 /**
3160  * Remove a picture from the short term reference list by its index in
3161  * that list.  This does no checking on the provided index; it is assumed
3162  * to be valid. Other list entries are shifted down.
3163  * @param i index into h->short_ref of picture to remove.
3164  */
3165 static void remove_short_at_index(H264Context *h, int i){
3166     assert(i >= 0 && i < h->short_ref_count);
3167     h->short_ref[i]= NULL;
3168     if (--h->short_ref_count)
3169         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3170 }
3171
3172 /**
3173  *
3174  * @return the removed picture or NULL if an error occurs
3175  */
3176 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3177     MpegEncContext * const s = &h->s;
3178     Picture *pic;
3179     int i;
3180
3181     if(s->avctx->debug&FF_DEBUG_MMCO)
3182         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3183
3184     pic = find_short(h, frame_num, &i);
3185     if (pic){
3186         if(unreference_pic(h, pic, ref_mask))
3187         remove_short_at_index(h, i);
3188     }
3189
3190     return pic;
3191 }
3192
3193 /**
3194  * Remove a picture from the long term reference list by its index in
3195  * that list.
3196  * @return the removed picture or NULL if an error occurs
3197  */
3198 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3199     Picture *pic;
3200
3201     pic= h->long_ref[i];
3202     if (pic){
3203         if(unreference_pic(h, pic, ref_mask)){
3204             assert(h->long_ref[i]->long_ref == 1);
3205             h->long_ref[i]->long_ref= 0;
3206             h->long_ref[i]= NULL;
3207             h->long_ref_count--;
3208         }
3209     }
3210
3211     return pic;
3212 }
3213
3214 /**
3215  * print short term list
3216  */
3217 static void print_short_term(H264Context *h) {
3218     uint32_t i;
3219     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3220         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3221         for(i=0; i<h->short_ref_count; i++){
3222             Picture *pic= h->short_ref[i];
3223             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3224         }
3225     }
3226 }
3227
3228 /**
3229  * print long term list
3230  */
3231 static void print_long_term(H264Context *h) {
3232     uint32_t i;
3233     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3234         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3235         for(i = 0; i < 16; i++){
3236             Picture *pic= h->long_ref[i];
3237             if (pic) {
3238                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3239             }
3240         }
3241     }
3242 }
3243
3244 /**
3245  * Executes the reference picture marking (memory management control operations).
3246  */
3247 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3248     MpegEncContext * const s = &h->s;
3249     int i, j;
3250     int current_ref_assigned=0;
3251     Picture *pic;
3252
3253     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3254         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3255
3256     for(i=0; i<mmco_count; i++){
3257         int structure, frame_num;
3258         if(s->avctx->debug&FF_DEBUG_MMCO)
3259             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3260
3261         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3262            || mmco[i].opcode == MMCO_SHORT2LONG){
3263             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3264             pic = find_short(h, frame_num, &j);
3265             if(!pic){
3266                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3267                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3268                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3269                 continue;
3270             }
3271         }
3272
3273         switch(mmco[i].opcode){
3274         case MMCO_SHORT2UNUSED:
3275             if(s->avctx->debug&FF_DEBUG_MMCO)
3276                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3277             remove_short(h, frame_num, structure ^ PICT_FRAME);
3278             break;
3279         case MMCO_SHORT2LONG:
3280                 if (h->long_ref[mmco[i].long_arg] != pic)
3281                     remove_long(h, mmco[i].long_arg, 0);
3282
3283                 remove_short_at_index(h, j);
3284                 h->long_ref[ mmco[i].long_arg ]= pic;
3285                 if (h->long_ref[ mmco[i].long_arg ]){
3286                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3287                     h->long_ref_count++;
3288                 }
3289             break;
3290         case MMCO_LONG2UNUSED:
3291             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3292             pic = h->long_ref[j];
3293             if (pic) {
3294                 remove_long(h, j, structure ^ PICT_FRAME);
3295             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3296                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3297             break;
3298         case MMCO_LONG:
3299                     // Comment below left from previous code as it is an interresting note.
3300                     /* First field in pair is in short term list or
3301                      * at a different long term index.
3302                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3303                      * Report the problem and keep the pair where it is,
3304                      * and mark this field valid.
3305                      */
3306
3307             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3308                 remove_long(h, mmco[i].long_arg, 0);
3309
3310                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3311                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312                 h->long_ref_count++;
3313             }
3314
3315             s->current_picture_ptr->reference |= s->picture_structure;
3316             current_ref_assigned=1;
3317             break;
3318         case MMCO_SET_MAX_LONG:
3319             assert(mmco[i].long_arg <= 16);
3320             // just remove the long term which index is greater than new max
3321             for(j = mmco[i].long_arg; j<16; j++){
3322                 remove_long(h, j, 0);
3323             }
3324             break;
3325         case MMCO_RESET:
3326             while(h->short_ref_count){
3327                 remove_short(h, h->short_ref[0]->frame_num, 0);
3328             }
3329             for(j = 0; j < 16; j++) {
3330                 remove_long(h, j, 0);
3331             }
3332             s->current_picture_ptr->poc=
3333             s->current_picture_ptr->field_poc[0]=
3334             s->current_picture_ptr->field_poc[1]=
3335             h->poc_lsb=
3336             h->poc_msb=
3337             h->frame_num=
3338             s->current_picture_ptr->frame_num= 0;
3339             break;
3340         default: assert(0);
3341         }
3342     }
3343
3344     if (!current_ref_assigned) {
3345         /* Second field of complementary field pair; the first field of
3346          * which is already referenced. If short referenced, it
3347          * should be first entry in short_ref. If not, it must exist
3348          * in long_ref; trying to put it on the short list here is an
3349          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3350          */
3351         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3352             /* Just mark the second field valid */
3353             s->current_picture_ptr->reference = PICT_FRAME;
3354         } else if (s->current_picture_ptr->long_ref) {
3355             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3356                                              "assignment for second field "
3357                                              "in complementary field pair "
3358                                              "(first field is long term)\n");
3359         } else {
3360             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3361             if(pic){
3362                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3363             }
3364
3365             if(h->short_ref_count)
3366                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3367
3368             h->short_ref[0]= s->current_picture_ptr;
3369             h->short_ref_count++;
3370             s->current_picture_ptr->reference |= s->picture_structure;
3371         }
3372     }
3373
3374     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3375
3376         /* We have too many reference frames, probably due to corrupted
3377          * stream. Need to discard one frame. Prevents overrun of the
3378          * short_ref and long_ref buffers.
3379          */
3380         av_log(h->s.avctx, AV_LOG_ERROR,
3381                "number of reference frames exceeds max (probably "
3382                "corrupt input), discarding one\n");
3383
3384         if (h->long_ref_count && !h->short_ref_count) {
3385             for (i = 0; i < 16; ++i)
3386                 if (h->long_ref[i])
3387                     break;
3388
3389             assert(i < 16);
3390             remove_long(h, i, 0);
3391         } else {
3392             pic = h->short_ref[h->short_ref_count - 1];
3393             remove_short(h, pic->frame_num, 0);
3394         }
3395     }
3396
3397     print_short_term(h);
3398     print_long_term(h);
3399     return 0;
3400 }
3401
3402 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3403     MpegEncContext * const s = &h->s;
3404     int i;
3405
3406     h->mmco_index= 0;
3407     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3408         s->broken_link= get_bits1(gb) -1;
3409         if(get_bits1(gb)){
3410             h->mmco[0].opcode= MMCO_LONG;
3411             h->mmco[0].long_arg= 0;
3412             h->mmco_index= 1;
3413         }
3414     }else{
3415         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3416             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3417                 MMCOOpcode opcode= get_ue_golomb(gb);
3418
3419                 h->mmco[i].opcode= opcode;
3420                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3421                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3422 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3423                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3424                         return -1;
3425                     }*/
3426                 }
3427                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3428                     unsigned int long_arg= get_ue_golomb(gb);
3429                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3430                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3431                         return -1;
3432                     }
3433                     h->mmco[i].long_arg= long_arg;
3434                 }
3435
3436                 if(opcode > (unsigned)MMCO_LONG){
3437                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3438                     return -1;
3439                 }
3440                 if(opcode == MMCO_END)
3441                     break;
3442             }
3443             h->mmco_index= i;
3444         }else{
3445             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3446
3447             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3448                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3449                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3450                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3451                 h->mmco_index= 1;
3452                 if (FIELD_PICTURE) {
3453                     h->mmco[0].short_pic_num *= 2;
3454                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3455                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3456                     h->mmco_index= 2;
3457                 }
3458             }
3459         }
3460     }
3461
3462     return 0;
3463 }
3464
3465 static int init_poc(H264Context *h){
3466     MpegEncContext * const s = &h->s;
3467     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3468     int field_poc[2];
3469     Picture *cur = s->current_picture_ptr;
3470
3471     h->frame_num_offset= h->prev_frame_num_offset;
3472     if(h->frame_num < h->prev_frame_num)
3473         h->frame_num_offset += max_frame_num;
3474
3475     if(h->sps.poc_type==0){
3476         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3477
3478         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3479             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3480         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3481             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3482         else
3483             h->poc_msb = h->prev_poc_msb;
3484 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3485         field_poc[0] =
3486         field_poc[1] = h->poc_msb + h->poc_lsb;
3487         if(s->picture_structure == PICT_FRAME)
3488             field_poc[1] += h->delta_poc_bottom;
3489     }else if(h->sps.poc_type==1){
3490         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3491         int i;
3492
3493         if(h->sps.poc_cycle_length != 0)
3494             abs_frame_num = h->frame_num_offset + h->frame_num;
3495         else
3496             abs_frame_num = 0;
3497
3498         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3499             abs_frame_num--;
3500
3501         expected_delta_per_poc_cycle = 0;
3502         for(i=0; i < h->sps.poc_cycle_length; i++)
3503             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3504
3505         if(abs_frame_num > 0){
3506             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3507             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3508
3509             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3510             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3511                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3512         } else
3513             expectedpoc = 0;
3514
3515         if(h->nal_ref_idc == 0)
3516             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3517
3518         field_poc[0] = expectedpoc + h->delta_poc[0];
3519         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3520
3521         if(s->picture_structure == PICT_FRAME)
3522             field_poc[1] += h->delta_poc[1];
3523     }else{
3524         int poc= 2*(h->frame_num_offset + h->frame_num);
3525
3526         if(!h->nal_ref_idc)
3527             poc--;
3528
3529         field_poc[0]= poc;
3530         field_poc[1]= poc;
3531     }
3532
3533     if(s->picture_structure != PICT_BOTTOM_FIELD)
3534         s->current_picture_ptr->field_poc[0]= field_poc[0];
3535     if(s->picture_structure != PICT_TOP_FIELD)
3536         s->current_picture_ptr->field_poc[1]= field_poc[1];
3537     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3538
3539     return 0;
3540 }
3541
3542
3543 /**
3544  * initialize scan tables
3545  */
3546 static void init_scan_tables(H264Context *h){
3547     MpegEncContext * const s = &h->s;
3548     int i;
3549     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3550         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3551         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3552     }else{
3553         for(i=0; i<16; i++){
3554 #define T(x) (x>>2) | ((x<<2) & 0xF)
3555             h->zigzag_scan[i] = T(zigzag_scan[i]);
3556             h-> field_scan[i] = T( field_scan[i]);
3557 #undef T
3558         }
3559     }
3560     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3561         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3562         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3563         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3564         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3565     }else{
3566         for(i=0; i<64; i++){
3567 #define T(x) (x>>3) | ((x&7)<<3)
3568             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3569             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3570             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3571             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3572 #undef T
3573         }
3574     }
3575     if(h->sps.transform_bypass){ //FIXME same ugly
3576         h->zigzag_scan_q0          = zigzag_scan;
3577         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3578         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3579         h->field_scan_q0           = field_scan;
3580         h->field_scan8x8_q0        = field_scan8x8;
3581         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3582     }else{
3583         h->zigzag_scan_q0          = h->zigzag_scan;
3584         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3585         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3586         h->field_scan_q0           = h->field_scan;
3587         h->field_scan8x8_q0        = h->field_scan8x8;
3588         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3589     }
3590 }
3591
3592 /**
3593  * Replicates H264 "master" context to thread contexts.
3594  */
3595 static void clone_slice(H264Context *dst, H264Context *src)
3596 {
3597     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3598     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3599     dst->s.current_picture      = src->s.current_picture;
3600     dst->s.linesize             = src->s.linesize;
3601     dst->s.uvlinesize           = src->s.uvlinesize;
3602     dst->s.first_field          = src->s.first_field;
3603
3604     dst->prev_poc_msb           = src->prev_poc_msb;
3605     dst->prev_poc_lsb           = src->prev_poc_lsb;
3606     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3607     dst->prev_frame_num         = src->prev_frame_num;
3608     dst->short_ref_count        = src->short_ref_count;
3609
3610     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3611     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3612     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3613     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3614
3615     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3616     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3617 }
3618
3619 /**
3620  * decodes a slice header.
3621  * This will also call MPV_common_init() and frame_start() as needed.
3622  *
3623  * @param h h264context
3624  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3625  *
3626  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3627  */
3628 static int decode_slice_header(H264Context *h, H264Context *h0){
3629     MpegEncContext * const s = &h->s;
3630     MpegEncContext * const s0 = &h0->s;
3631     unsigned int first_mb_in_slice;
3632     unsigned int pps_id;
3633     int num_ref_idx_active_override_flag;
3634     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3635     unsigned int slice_type, tmp, i, j;
3636     int default_ref_list_done = 0;
3637     int last_pic_structure;
3638
3639     s->dropable= h->nal_ref_idc == 0;
3640
3641     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3642         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3643         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3644     }else{
3645         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3646         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3647     }
3648
3649     first_mb_in_slice= get_ue_golomb(&s->gb);
3650
3651     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3652         h0->current_slice = 0;
3653         if (!s0->first_field)
3654             s->current_picture_ptr= NULL;
3655     }
3656
3657     slice_type= get_ue_golomb(&s->gb);
3658     if(slice_type > 9){
3659         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3660         return -1;
3661     }
3662     if(slice_type > 4){
3663         slice_type -= 5;
3664         h->slice_type_fixed=1;
3665     }else
3666         h->slice_type_fixed=0;
3667
3668     slice_type= slice_type_map[ slice_type ];
3669     if (slice_type == FF_I_TYPE
3670         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3671         default_ref_list_done = 1;
3672     }
3673     h->slice_type= slice_type;
3674     h->slice_type_nos= slice_type & 3;
3675
3676     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3677     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3678         av_log(h->s.avctx, AV_LOG_ERROR,
3679                "B picture before any references, skipping\n");
3680         return -1;
3681     }
3682
3683     pps_id= get_ue_golomb(&s->gb);
3684     if(pps_id>=MAX_PPS_COUNT){
3685         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3686         return -1;
3687     }
3688     if(!h0->pps_buffers[pps_id]) {
3689         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3690         return -1;
3691     }
3692     h->pps= *h0->pps_buffers[pps_id];
3693
3694     if(!h0->sps_buffers[h->pps.sps_id]) {
3695         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3696         return -1;
3697     }
3698     h->sps = *h0->sps_buffers[h->pps.sps_id];
3699
3700     if(h == h0 && h->dequant_coeff_pps != pps_id){
3701         h->dequant_coeff_pps = pps_id;
3702         init_dequant_tables(h);
3703     }
3704
3705     s->mb_width= h->sps.mb_width;
3706     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3707
3708     h->b_stride=  s->mb_width*4;
3709     h->b8_stride= s->mb_width*2;
3710
3711     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3712     if(h->sps.frame_mbs_only_flag)
3713         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3714     else
3715         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3716
3717     if (s->context_initialized
3718         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3719         if(h != h0)
3720             return -1;   // width / height changed during parallelized decoding
3721         free_tables(h);
3722         MPV_common_end(s);
3723     }
3724     if (!s->context_initialized) {
3725         if(h != h0)
3726             return -1;  // we cant (re-)initialize context during parallel decoding
3727         if (MPV_common_init(s) < 0)
3728             return -1;
3729         s->first_field = 0;
3730
3731         init_scan_tables(h);
3732         alloc_tables(h);
3733
3734         for(i = 1; i < s->avctx->thread_count; i++) {
3735             H264Context *c;
3736             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3737             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3738             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3739             c->sps = h->sps;
3740             c->pps = h->pps;
3741             init_scan_tables(c);
3742             clone_tables(c, h);
3743         }
3744
3745         for(i = 0; i < s->avctx->thread_count; i++)
3746             if(context_init(h->thread_context[i]) < 0)
3747                 return -1;
3748
3749         s->avctx->width = s->width;
3750         s->avctx->height = s->height;
3751         s->avctx->sample_aspect_ratio= h->sps.sar;
3752         if(!s->avctx->sample_aspect_ratio.den)
3753             s->avctx->sample_aspect_ratio.den = 1;
3754
3755         if(h->sps.timing_info_present_flag){
3756             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3757             if(h->x264_build > 0 && h->x264_build < 44)
3758                 s->avctx->time_base.den *= 2;
3759             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3760                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3761         }
3762     }
3763
3764     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3765
3766     h->mb_mbaff = 0;
3767     h->mb_aff_frame = 0;
3768     last_pic_structure = s0->picture_structure;
3769     if(h->sps.frame_mbs_only_flag){
3770         s->picture_structure= PICT_FRAME;
3771     }else{
3772         if(get_bits1(&s->gb)) { //field_pic_flag
3773             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3774         } else {
3775             s->picture_structure= PICT_FRAME;
3776             h->mb_aff_frame = h->sps.mb_aff;
3777         }
3778     }
3779     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3780
3781     if(h0->current_slice == 0){
3782         while(h->frame_num !=  h->prev_frame_num &&
3783               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3784             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3785             frame_start(h);
3786             h->prev_frame_num++;
3787             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3788             s->current_picture_ptr->frame_num= h->prev_frame_num;
3789             execute_ref_pic_marking(h, NULL, 0);
3790         }
3791
3792         /* See if we have a decoded first field looking for a pair... */
3793         if (s0->first_field) {
3794             assert(s0->current_picture_ptr);
3795             assert(s0->current_picture_ptr->data[0]);
3796             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3797
3798             /* figure out if we have a complementary field pair */
3799             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3800                 /*
3801                  * Previous field is unmatched. Don't display it, but let it
3802                  * remain for reference if marked as such.
3803                  */
3804                 s0->current_picture_ptr = NULL;
3805                 s0->first_field = FIELD_PICTURE;
3806
3807             } else {
3808                 if (h->nal_ref_idc &&
3809                         s0->current_picture_ptr->reference &&
3810                         s0->current_picture_ptr->frame_num != h->frame_num) {
3811                     /*
3812                      * This and previous field were reference, but had
3813                      * different frame_nums. Consider this field first in
3814                      * pair. Throw away previous field except for reference
3815                      * purposes.
3816                      */
3817                     s0->first_field = 1;
3818                     s0->current_picture_ptr = NULL;
3819
3820                 } else {
3821                     /* Second field in complementary pair */
3822                     s0->first_field = 0;
3823                 }
3824             }
3825
3826         } else {
3827             /* Frame or first field in a potentially complementary pair */
3828             assert(!s0->current_picture_ptr);
3829             s0->first_field = FIELD_PICTURE;
3830         }
3831
3832         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3833             s0->first_field = 0;
3834             return -1;
3835         }
3836     }
3837     if(h != h0)
3838         clone_slice(h, h0);
3839
3840     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3841
3842     assert(s->mb_num == s->mb_width * s->mb_height);
3843     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3844        first_mb_in_slice                    >= s->mb_num){
3845         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3846         return -1;
3847     }
3848     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3849     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3850     if (s->picture_structure == PICT_BOTTOM_FIELD)
3851         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3852     assert(s->mb_y < s->mb_height);
3853
3854     if(s->picture_structure==PICT_FRAME){
3855         h->curr_pic_num=   h->frame_num;
3856         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3857     }else{
3858         h->curr_pic_num= 2*h->frame_num + 1;
3859         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3860     }
3861
3862     if(h->nal_unit_type == NAL_IDR_SLICE){
3863         get_ue_golomb(&s->gb); /* idr_pic_id */
3864     }
3865
3866     if(h->sps.poc_type==0){
3867         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3868
3869         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3870             h->delta_poc_bottom= get_se_golomb(&s->gb);
3871         }
3872     }
3873
3874     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3875         h->delta_poc[0]= get_se_golomb(&s->gb);
3876
3877         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3878             h->delta_poc[1]= get_se_golomb(&s->gb);
3879     }
3880
3881     init_poc(h);
3882
3883     if(h->pps.redundant_pic_cnt_present){
3884         h->redundant_pic_count= get_ue_golomb(&s->gb);
3885     }
3886
3887     //set defaults, might be overridden a few lines later
3888     h->ref_count[0]= h->pps.ref_count[0];
3889     h->ref_count[1]= h->pps.ref_count[1];
3890
3891     if(h->slice_type_nos != FF_I_TYPE){
3892         if(h->slice_type_nos == FF_B_TYPE){
3893             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3894         }
3895         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3896
3897         if(num_ref_idx_active_override_flag){
3898             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3899             if(h->slice_type_nos==FF_B_TYPE)
3900                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3901
3902             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3903                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3904                 h->ref_count[0]= h->ref_count[1]= 1;
3905                 return -1;
3906             }
3907         }
3908         if(h->slice_type_nos == FF_B_TYPE)
3909             h->list_count= 2;
3910         else
3911             h->list_count= 1;
3912     }else
3913         h->list_count= 0;
3914
3915     if(!default_ref_list_done){
3916         fill_default_ref_list(h);
3917     }
3918
3919     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3920         return -1;
3921
3922     if(h->slice_type_nos!=FF_I_TYPE){
3923         s->last_picture_ptr= &h->ref_list[0][0];
3924         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3925     }
3926     if(h->slice_type_nos==FF_B_TYPE){
3927         s->next_picture_ptr= &h->ref_list[1][0];
3928         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3929     }
3930
3931     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3932        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3933         pred_weight_table(h);
3934     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3935         implicit_weight_table(h);
3936     else
3937         h->use_weight = 0;
3938
3939     if(h->nal_ref_idc)
3940         decode_ref_pic_marking(h0, &s->gb);
3941
3942     if(FRAME_MBAFF)
3943         fill_mbaff_ref_list(h);
3944
3945     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3946         direct_dist_scale_factor(h);
3947     direct_ref_list_init(h);
3948
3949     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3950         tmp = get_ue_golomb(&s->gb);
3951         if(tmp > 2){
3952             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3953             return -1;
3954         }
3955         h->cabac_init_idc= tmp;
3956     }
3957
3958     h->last_qscale_diff = 0;
3959     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3960     if(tmp>51){
3961         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3962         return -1;
3963     }
3964     s->qscale= tmp;
3965     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3966     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3967     //FIXME qscale / qp ... stuff
3968     if(h->slice_type == FF_SP_TYPE){
3969         get_bits1(&s->gb); /* sp_for_switch_flag */
3970     }
3971     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3972         get_se_golomb(&s->gb); /* slice_qs_delta */
3973     }
3974
3975     h->deblocking_filter = 1;
3976     h->slice_alpha_c0_offset = 0;
3977     h->slice_beta_offset = 0;
3978     if( h->pps.deblocking_filter_parameters_present ) {
3979         tmp= get_ue_golomb(&s->gb);
3980         if(tmp > 2){
3981             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3982             return -1;
3983         }
3984         h->deblocking_filter= tmp;
3985         if(h->deblocking_filter < 2)
3986             h->deblocking_filter^= 1; // 1<->0
3987
3988         if( h->deblocking_filter ) {
3989             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3990             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3991         }
3992     }
3993
3994     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3995        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3996        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3997        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3998         h->deblocking_filter= 0;
3999
4000     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4001         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4002             /* Cheat slightly for speed:
4003                Do not bother to deblock across slices. */
4004             h->deblocking_filter = 2;
4005         } else {
4006             h0->max_contexts = 1;
4007             if(!h0->single_decode_warning) {
4008                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4009                 h0->single_decode_warning = 1;
4010             }
4011             if(h != h0)
4012                 return 1; // deblocking switched inside frame
4013         }
4014     }
4015
4016 #if 0 //FMO
4017     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4018         slice_group_change_cycle= get_bits(&s->gb, ?);
4019 #endif
4020
4021     h0->last_slice_type = slice_type;
4022     h->slice_num = ++h0->current_slice;
4023     if(h->slice_num >= MAX_SLICES){
4024         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4025     }
4026
4027     for(j=0; j<2; j++){
4028         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4029         ref2frm[0]=
4030         ref2frm[1]= -1;
4031         for(i=0; i<16; i++)
4032             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4033                           +(h->ref_list[j][i].reference&3);
4034         ref2frm[18+0]=
4035         ref2frm[18+1]= -1;
4036         for(i=16; i<48; i++)
4037             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4038                           +(h->ref_list[j][i].reference&3);
4039     }
4040
4041     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4042     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4043
4044     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4045         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4046                h->slice_num,
4047                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4048                first_mb_in_slice,
4049                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4050                pps_id, h->frame_num,
4051                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4052                h->ref_count[0], h->ref_count[1],
4053                s->qscale,
4054                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4055                h->use_weight,
4056                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4057                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4058                );
4059     }
4060
4061     return 0;
4062 }
4063
4064 /**
4065  *
4066  */
4067 static inline int get_level_prefix(GetBitContext *gb){
4068     unsigned int buf;
4069     int log;
4070
4071     OPEN_READER(re, gb);
4072     UPDATE_CACHE(re, gb);
4073     buf=GET_CACHE(re, gb);
4074
4075     log= 32 - av_log2(buf);
4076 #ifdef TRACE
4077     print_bin(buf>>(32-log), log);
4078     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4079 #endif
4080
4081     LAST_SKIP_BITS(re, gb, log);
4082     CLOSE_READER(re, gb);
4083
4084     return log-1;
4085 }
4086
4087 static inline int get_dct8x8_allowed(H264Context *h){
4088     int i;
4089     for(i=0; i<4; i++){
4090         if(!IS_SUB_8X8(h->sub_mb_type[i])
4091            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4092             return 0;
4093     }
4094     return 1;
4095 }
4096
4097 /**
4098  * decodes a residual block.
4099  * @param n block index
4100  * @param scantable scantable
4101  * @param max_coeff number of coefficients in the block
4102  * @return <0 if an error occurred
4103  */
4104 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4105     MpegEncContext * const s = &h->s;
4106     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4107     int level[16];
4108     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4109
4110     //FIXME put trailing_onex into the context
4111
4112     if(n == CHROMA_DC_BLOCK_INDEX){
4113         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4114         total_coeff= coeff_token>>2;
4115     }else{
4116         if(n == LUMA_DC_BLOCK_INDEX){
4117             total_coeff= pred_non_zero_count(h, 0);
4118             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4119             total_coeff= coeff_token>>2;
4120         }else{
4121             total_coeff= pred_non_zero_count(h, n);
4122             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4123             total_coeff= coeff_token>>2;
4124             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4125         }
4126     }
4127
4128     //FIXME set last_non_zero?
4129
4130     if(total_coeff==0)
4131         return 0;
4132     if(total_coeff > (unsigned)max_coeff) {
4133         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4134         return -1;
4135     }
4136
4137     trailing_ones= coeff_token&3;
4138     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4139     assert(total_coeff<=16);
4140
4141     for(i=0; i<trailing_ones; i++){
4142         level[i]= 1 - 2*get_bits1(gb);
4143     }
4144
4145     if(i<total_coeff) {
4146         int level_code, mask;
4147         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4148         int prefix= get_level_prefix(gb);
4149
4150         //first coefficient has suffix_length equal to 0 or 1
4151         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4152             if(suffix_length)
4153                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4154             else
4155                 level_code= (prefix<<suffix_length); //part
4156         }else if(prefix==14){
4157             if(suffix_length)
4158                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4159             else
4160                 level_code= prefix + get_bits(gb, 4); //part
4161         }else{
4162             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4163             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4164             if(prefix>=16)
4165                 level_code += (1<<(prefix-3))-4096;
4166         }
4167
4168         if(trailing_ones < 3) level_code += 2;
4169
4170         suffix_length = 1;
4171         if(level_code > 5)
4172             suffix_length++;
4173         mask= -(level_code&1);
4174         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4175         i++;
4176
4177         //remaining coefficients have suffix_length > 0
4178         for(;i<total_coeff;i++) {
4179             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4180             prefix = get_level_prefix(gb);
4181             if(prefix<15){
4182                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4183             }else{
4184                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4185                 if(prefix>=16)
4186                     level_code += (1<<(prefix-3))-4096;
4187             }
4188             mask= -(level_code&1);
4189             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4190             if(level_code > suffix_limit[suffix_length])
4191                 suffix_length++;
4192         }
4193     }
4194
4195     if(total_coeff == max_coeff)
4196         zeros_left=0;
4197     else{
4198         if(n == CHROMA_DC_BLOCK_INDEX)
4199             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4200         else
4201             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4202     }
4203
4204     coeff_num = zeros_left + total_coeff - 1;
4205     j = scantable[coeff_num];
4206     if(n > 24){
4207         block[j] = level[0];
4208         for(i=1;i<total_coeff;i++) {
4209             if(zeros_left <= 0)
4210                 run_before = 0;
4211             else if(zeros_left < 7){
4212                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4213             }else{
4214                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4215             }
4216             zeros_left -= run_before;
4217             coeff_num -= 1 + run_before;
4218             j= scantable[ coeff_num ];
4219
4220             block[j]= level[i];
4221         }
4222     }else{
4223         block[j] = (level[0] * qmul[j] + 32)>>6;
4224         for(i=1;i<total_coeff;i++) {
4225             if(zeros_left <= 0)
4226                 run_before = 0;
4227             else if(zeros_left < 7){
4228                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4229             }else{
4230                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4231             }
4232             zeros_left -= run_before;
4233             coeff_num -= 1 + run_before;
4234             j= scantable[ coeff_num ];
4235
4236             block[j]= (level[i] * qmul[j] + 32)>>6;
4237         }
4238     }
4239
4240     if(zeros_left<0){
4241         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4242         return -1;
4243     }
4244
4245     return 0;
4246 }
4247
4248 static void predict_field_decoding_flag(H264Context *h){
4249     MpegEncContext * const s = &h->s;
4250     const int mb_xy= h->mb_xy;
4251     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4252                 ? s->current_picture.mb_type[mb_xy-1]
4253                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4254                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4255                 : 0;
4256     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4257 }
4258
4259 /**
4260  * decodes a P_SKIP or B_SKIP macroblock
4261  */
4262 static void decode_mb_skip(H264Context *h){
4263     MpegEncContext * const s = &h->s;
4264     const int mb_xy= h->mb_xy;
4265     int mb_type=0;
4266
4267     memset(h->non_zero_count[mb_xy], 0, 16);
4268     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4269
4270     if(MB_FIELD)
4271         mb_type|= MB_TYPE_INTERLACED;
4272
4273     if( h->slice_type_nos == FF_B_TYPE )
4274     {
4275         // just for fill_caches. pred_direct_motion will set the real mb_type
4276         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4277
4278         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4279         pred_direct_motion(h, &mb_type);
4280         mb_type|= MB_TYPE_SKIP;
4281     }
4282     else
4283     {
4284         int mx, my;
4285         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4286
4287         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4288         pred_pskip_motion(h, &mx, &my);
4289         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4290         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4291     }
4292
4293     write_back_motion(h, mb_type);
4294     s->current_picture.mb_type[mb_xy]= mb_type;
4295     s->current_picture.qscale_table[mb_xy]= s->qscale;
4296     h->slice_table[ mb_xy ]= h->slice_num;
4297     h->prev_mb_skipped= 1;
4298 }
4299
4300 /**
4301  * decodes a macroblock
4302  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4303  */
4304 static int decode_mb_cavlc(H264Context *h){
4305     MpegEncContext * const s = &h->s;
4306     int mb_xy;
4307     int partition_count;
4308     unsigned int mb_type, cbp;
4309     int dct8x8_allowed= h->pps.transform_8x8_mode;
4310
4311     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4312
4313     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4314
4315     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4316     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4317                 down the code */
4318     if(h->slice_type_nos != FF_I_TYPE){
4319         if(s->mb_skip_run==-1)
4320             s->mb_skip_run= get_ue_golomb(&s->gb);
4321
4322         if (s->mb_skip_run--) {
4323             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4324                 if(s->mb_skip_run==0)
4325                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4326                 else
4327                     predict_field_decoding_flag(h);
4328             }
4329             decode_mb_skip(h);
4330             return 0;
4331         }
4332     }
4333     if(FRAME_MBAFF){
4334         if( (s->mb_y&1) == 0 )
4335             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4336     }
4337
4338     h->prev_mb_skipped= 0;
4339
4340     mb_type= get_ue_golomb(&s->gb);
4341     if(h->slice_type_nos == FF_B_TYPE){
4342         if(mb_type < 23){
4343             partition_count= b_mb_type_info[mb_type].partition_count;
4344             mb_type=         b_mb_type_info[mb_type].type;
4345         }else{
4346             mb_type -= 23;
4347             goto decode_intra_mb;
4348         }
4349     }else if(h->slice_type_nos == FF_P_TYPE){
4350         if(mb_type < 5){
4351             partition_count= p_mb_type_info[mb_type].partition_count;
4352             mb_type=         p_mb_type_info[mb_type].type;
4353         }else{
4354             mb_type -= 5;
4355             goto decode_intra_mb;
4356         }
4357     }else{
4358        assert(h->slice_type_nos == FF_I_TYPE);
4359         if(h->slice_type == FF_SI_TYPE && mb_type)
4360             mb_type--;
4361 decode_intra_mb:
4362         if(mb_type > 25){
4363             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4364             return -1;
4365         }
4366         partition_count=0;
4367         cbp= i_mb_type_info[mb_type].cbp;
4368         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4369         mb_type= i_mb_type_info[mb_type].type;
4370     }
4371
4372     if(MB_FIELD)
4373         mb_type |= MB_TYPE_INTERLACED;
4374
4375     h->slice_table[ mb_xy ]= h->slice_num;
4376
4377     if(IS_INTRA_PCM(mb_type)){
4378         unsigned int x;
4379
4380         // We assume these blocks are very rare so we do not optimize it.
4381         align_get_bits(&s->gb);
4382
4383         // The pixels are stored in the same order as levels in h->mb array.
4384         for(x=0; x < (CHROMA ? 384 : 256); x++){
4385             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4386         }
4387
4388         // In deblocking, the quantizer is 0
4389         s->current_picture.qscale_table[mb_xy]= 0;
4390         // All coeffs are present
4391         memset(h->non_zero_count[mb_xy], 16, 16);
4392
4393         s->current_picture.mb_type[mb_xy]= mb_type;
4394         return 0;
4395     }
4396
4397     if(MB_MBAFF){
4398         h->ref_count[0] <<= 1;
4399         h->ref_count[1] <<= 1;
4400     }
4401
4402     fill_caches(h, mb_type, 0);
4403
4404     //mb_pred
4405     if(IS_INTRA(mb_type)){
4406         int pred_mode;
4407 //            init_top_left_availability(h);
4408         if(IS_INTRA4x4(mb_type)){
4409             int i;
4410             int di = 1;
4411             if(dct8x8_allowed && get_bits1(&s->gb)){
4412                 mb_type |= MB_TYPE_8x8DCT;
4413                 di = 4;
4414             }
4415
4416 //                fill_intra4x4_pred_table(h);
4417             for(i=0; i<16; i+=di){
4418                 int mode= pred_intra_mode(h, i);
4419
4420                 if(!get_bits1(&s->gb)){
4421                     const int rem_mode= get_bits(&s->gb, 3);
4422                     mode = rem_mode + (rem_mode >= mode);
4423                 }
4424
4425                 if(di==4)
4426                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4427                 else
4428                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4429             }
4430             write_back_intra_pred_mode(h);
4431             if( check_intra4x4_pred_mode(h) < 0)
4432                 return -1;
4433         }else{
4434             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4435             if(h->intra16x16_pred_mode < 0)
4436                 return -1;
4437         }
4438         if(CHROMA){
4439             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4440             if(pred_mode < 0)
4441                 return -1;
4442             h->chroma_pred_mode= pred_mode;
4443         }
4444     }else if(partition_count==4){
4445         int i, j, sub_partition_count[4], list, ref[2][4];
4446
4447         if(h->slice_type_nos == FF_B_TYPE){
4448             for(i=0; i<4; i++){
4449                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4450                 if(h->sub_mb_type[i] >=13){
4451                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4452                     return -1;
4453                 }
4454                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4455                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4456             }
4457             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4458                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4459                 pred_direct_motion(h, &mb_type);
4460                 h->ref_cache[0][scan8[4]] =
4461                 h->ref_cache[1][scan8[4]] =
4462                 h->ref_cache[0][scan8[12]] =
4463                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4464             }
4465         }else{
4466             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4467             for(i=0; i<4; i++){
4468                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4469                 if(h->sub_mb_type[i] >=4){
4470                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4471                     return -1;
4472                 }
4473                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4474                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4475             }
4476         }
4477
4478         for(list=0; list<h->list_count; list++){
4479             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4480             for(i=0; i<4; i++){
4481                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4482                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4483                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4484                     if(tmp>=ref_count){
4485                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4486                         return -1;
4487                     }
4488                     ref[list][i]= tmp;
4489                 }else{
4490                  //FIXME
4491                     ref[list][i] = -1;
4492                 }
4493             }
4494         }
4495
4496         if(dct8x8_allowed)
4497             dct8x8_allowed = get_dct8x8_allowed(h);
4498
4499         for(list=0; list<h->list_count; list++){
4500             for(i=0; i<4; i++){
4501                 if(IS_DIRECT(h->sub_mb_type[i])) {
4502                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4503                     continue;
4504                 }
4505                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4506                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4507
4508                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4509                     const int sub_mb_type= h->sub_mb_type[i];
4510                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4511                     for(j=0; j<sub_partition_count[i]; j++){
4512                         int mx, my;
4513                         const int index= 4*i + block_width*j;
4514                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4515                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4516                         mx += get_se_golomb(&s->gb);
4517                         my += get_se_golomb(&s->gb);
4518                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4519
4520                         if(IS_SUB_8X8(sub_mb_type)){
4521                             mv_cache[ 1 ][0]=
4522                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4523                             mv_cache[ 1 ][1]=
4524                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4525                         }else if(IS_SUB_8X4(sub_mb_type)){
4526                             mv_cache[ 1 ][0]= mx;
4527                             mv_cache[ 1 ][1]= my;
4528                         }else if(IS_SUB_4X8(sub_mb_type)){
4529                             mv_cache[ 8 ][0]= mx;
4530                             mv_cache[ 8 ][1]= my;
4531                         }
4532                         mv_cache[ 0 ][0]= mx;
4533                         mv_cache[ 0 ][1]= my;
4534                     }
4535                 }else{
4536                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4537                     p[0] = p[1]=
4538                     p[8] = p[9]= 0;
4539                 }
4540             }
4541         }
4542     }else if(IS_DIRECT(mb_type)){
4543         pred_direct_motion(h, &mb_type);
4544         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4545     }else{
4546         int list, mx, my, i;
4547          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4548         if(IS_16X16(mb_type)){
4549             for(list=0; list<h->list_count; list++){
4550                     unsigned int val;
4551                     if(IS_DIR(mb_type, 0, list)){
4552                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4553                         if(val >= h->ref_count[list]){
4554                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4555                             return -1;
4556                         }
4557                     }else
4558                         val= LIST_NOT_USED&0xFF;
4559                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4560             }
4561             for(list=0; list<h->list_count; list++){
4562                 unsigned int val;
4563                 if(IS_DIR(mb_type, 0, list)){
4564                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4565                     mx += get_se_golomb(&s->gb);
4566                     my += get_se_golomb(&s->gb);
4567                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4568
4569                     val= pack16to32(mx,my);
4570                 }else
4571                     val=0;
4572                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4573             }
4574         }
4575         else if(IS_16X8(mb_type)){
4576             for(list=0; list<h->list_count; list++){
4577                     for(i=0; i<2; i++){
4578                         unsigned int val;
4579                         if(IS_DIR(mb_type, i, list)){
4580                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4581                             if(val >= h->ref_count[list]){
4582                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4583                                 return -1;
4584                             }
4585                         }else
4586                             val= LIST_NOT_USED&0xFF;
4587                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4588                     }
4589             }
4590             for(list=0; list<h->list_count; list++){
4591                 for(i=0; i<2; i++){
4592                     unsigned int val;
4593                     if(IS_DIR(mb_type, i, list)){
4594                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4595                         mx += get_se_golomb(&s->gb);
4596                         my += get_se_golomb(&s->gb);
4597                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4598
4599                         val= pack16to32(mx,my);
4600                     }else
4601                         val=0;
4602                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4603                 }
4604             }
4605         }else{
4606             assert(IS_8X16(mb_type));
4607             for(list=0; list<h->list_count; list++){
4608                     for(i=0; i<2; i++){
4609                         unsigned int val;
4610                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4611                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4612                             if(val >= h->ref_count[list]){
4613                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4614                                 return -1;
4615                             }
4616                         }else
4617                             val= LIST_NOT_USED&0xFF;
4618                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4619                     }
4620             }
4621             for(list=0; list<h->list_count; list++){
4622                 for(i=0; i<2; i++){
4623                     unsigned int val;
4624                     if(IS_DIR(mb_type, i, list)){
4625                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4626                         mx += get_se_golomb(&s->gb);
4627                         my += get_se_golomb(&s->gb);
4628                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4629
4630                         val= pack16to32(mx,my);
4631                     }else
4632                         val=0;
4633                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4634                 }
4635             }
4636         }
4637     }
4638
4639     if(IS_INTER(mb_type))
4640         write_back_motion(h, mb_type);
4641
4642     if(!IS_INTRA16x16(mb_type)){
4643         cbp= get_ue_golomb(&s->gb);
4644         if(cbp > 47){
4645             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4646             return -1;
4647         }
4648
4649         if(CHROMA){
4650             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4651             else                     cbp= golomb_to_inter_cbp   [cbp];
4652         }else{
4653             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4654             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4655         }
4656     }
4657     h->cbp = cbp;
4658
4659     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4660         if(get_bits1(&s->gb)){
4661             mb_type |= MB_TYPE_8x8DCT;
4662             h->cbp_table[mb_xy]= cbp;
4663         }
4664     }
4665     s->current_picture.mb_type[mb_xy]= mb_type;
4666
4667     if(cbp || IS_INTRA16x16(mb_type)){
4668         int i8x8, i4x4, chroma_idx;
4669         int dquant;
4670         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4671         const uint8_t *scan, *scan8x8, *dc_scan;
4672
4673 //        fill_non_zero_count_cache(h);
4674
4675         if(IS_INTERLACED(mb_type)){
4676             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4677             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4678             dc_scan= luma_dc_field_scan;
4679         }else{
4680             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4681             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4682             dc_scan= luma_dc_zigzag_scan;
4683         }
4684
4685         dquant= get_se_golomb(&s->gb);
4686
4687         if( dquant > 25 || dquant < -26 ){
4688             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4689             return -1;
4690         }
4691
4692         s->qscale += dquant;
4693         if(((unsigned)s->qscale) > 51){
4694             if(s->qscale<0) s->qscale+= 52;
4695             else            s->qscale-= 52;
4696         }
4697
4698         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4699         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4700         if(IS_INTRA16x16(mb_type)){
4701             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4702                 return -1; //FIXME continue if partitioned and other return -1 too
4703             }
4704
4705             assert((cbp&15) == 0 || (cbp&15) == 15);
4706
4707             if(cbp&15){
4708                 for(i8x8=0; i8x8<4; i8x8++){
4709                     for(i4x4=0; i4x4<4; i4x4++){
4710                         const int index= i4x4 + 4*i8x8;
4711                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4712                             return -1;
4713                         }
4714                     }
4715                 }
4716             }else{
4717                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4718             }
4719         }else{
4720             for(i8x8=0; i8x8<4; i8x8++){
4721                 if(cbp & (1<<i8x8)){
4722                     if(IS_8x8DCT(mb_type)){
4723                         DCTELEM *buf = &h->mb[64*i8x8];
4724                         uint8_t *nnz;
4725                         for(i4x4=0; i4x4<4; i4x4++){
4726                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4727                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4728                                 return -1;
4729                         }
4730                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4731                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4732                     }else{
4733                         for(i4x4=0; i4x4<4; i4x4++){
4734                             const int index= i4x4 + 4*i8x8;
4735
4736                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4737                                 return -1;
4738                             }
4739                         }
4740                     }
4741                 }else{
4742                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4743                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4744                 }
4745             }
4746         }
4747
4748         if(cbp&0x30){
4749             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4750                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4751                     return -1;
4752                 }
4753         }
4754
4755         if(cbp&0x20){
4756             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4757                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4758                 for(i4x4=0; i4x4<4; i4x4++){
4759                     const int index= 16 + 4*chroma_idx + i4x4;
4760                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4761                         return -1;
4762                     }
4763                 }
4764             }
4765         }else{
4766             uint8_t * const nnz= &h->non_zero_count_cache[0];
4767             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4768             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4769         }
4770     }else{
4771         uint8_t * const nnz= &h->non_zero_count_cache[0];
4772         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4773         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4774         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4775     }
4776     s->current_picture.qscale_table[mb_xy]= s->qscale;
4777     write_back_non_zero_count(h);
4778
4779     if(MB_MBAFF){
4780         h->ref_count[0] >>= 1;
4781         h->ref_count[1] >>= 1;
4782     }
4783
4784     return 0;
4785 }
4786
4787 static int decode_cabac_field_decoding_flag(H264Context *h) {
4788     MpegEncContext * const s = &h->s;
4789     const int mb_x = s->mb_x;
4790     const int mb_y = s->mb_y & ~1;
4791     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4792     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4793
4794     unsigned int ctx = 0;
4795
4796     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4797         ctx += 1;
4798     }
4799     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4800         ctx += 1;
4801     }
4802
4803     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4804 }
4805
4806 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4807     uint8_t *state= &h->cabac_state[ctx_base];
4808     int mb_type;
4809
4810     if(intra_slice){
4811         MpegEncContext * const s = &h->s;
4812         const int mba_xy = h->left_mb_xy[0];
4813         const int mbb_xy = h->top_mb_xy;
4814         int ctx=0;
4815         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4816             ctx++;
4817         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4818             ctx++;
4819         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4820             return 0;   /* I4x4 */
4821         state += 2;
4822     }else{
4823         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4824             return 0;   /* I4x4 */
4825     }
4826
4827     if( get_cabac_terminate( &h->cabac ) )
4828         return 25;  /* PCM */
4829
4830     mb_type = 1; /* I16x16 */
4831     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4832     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4833         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4834     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4835     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4836     return mb_type;
4837 }
4838
4839 static int decode_cabac_mb_type( H264Context *h ) {
4840     MpegEncContext * const s = &h->s;
4841
4842     if( h->slice_type_nos == FF_I_TYPE ) {
4843         return decode_cabac_intra_mb_type(h, 3, 1);
4844     } else if( h->slice_type_nos == FF_P_TYPE ) {
4845         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4846             /* P-type */
4847             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4848                 /* P_L0_D16x16, P_8x8 */
4849                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4850             } else {
4851                 /* P_L0_D8x16, P_L0_D16x8 */
4852                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4853             }
4854         } else {
4855             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4856         }
4857     } else if( h->slice_type_nos == FF_B_TYPE ) {
4858         const int mba_xy = h->left_mb_xy[0];
4859         const int mbb_xy = h->top_mb_xy;
4860         int ctx = 0;
4861         int bits;
4862
4863         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4864             ctx++;
4865         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4866             ctx++;
4867
4868         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4869             return 0; /* B_Direct_16x16 */
4870
4871         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4872             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4873         }
4874
4875         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4876         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4877         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4878         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4879         if( bits < 8 )
4880             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4881         else if( bits == 13 ) {
4882             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4883         } else if( bits == 14 )
4884             return 11; /* B_L1_L0_8x16 */
4885         else if( bits == 15 )
4886             return 22; /* B_8x8 */
4887
4888         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4889         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4890     } else {
4891         /* TODO SI/SP frames? */
4892         return -1;
4893     }
4894 }
4895
4896 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4897     MpegEncContext * const s = &h->s;
4898     int mba_xy, mbb_xy;
4899     int ctx = 0;
4900
4901     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4902         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4903         mba_xy = mb_xy - 1;
4904         if( (mb_y&1)
4905             && h->slice_table[mba_xy] == h->slice_num
4906             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4907             mba_xy += s->mb_stride;
4908         if( MB_FIELD ){
4909             mbb_xy = mb_xy - s->mb_stride;
4910             if( !(mb_y&1)
4911                 && h->slice_table[mbb_xy] == h->slice_num
4912                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4913                 mbb_xy -= s->mb_stride;
4914         }else
4915             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4916     }else{
4917         int mb_xy = h->mb_xy;
4918         mba_xy = mb_xy - 1;
4919         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4920     }
4921
4922     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4923         ctx++;
4924     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4925         ctx++;
4926
4927     if( h->slice_type_nos == FF_B_TYPE )
4928         ctx += 13;
4929     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4930 }
4931
4932 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4933     int mode = 0;
4934
4935     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4936         return pred_mode;
4937
4938     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4939     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4940     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4941
4942     if( mode >= pred_mode )
4943         return mode + 1;
4944     else
4945         return mode;
4946 }
4947
4948 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4949     const int mba_xy = h->left_mb_xy[0];
4950     const int mbb_xy = h->top_mb_xy;
4951
4952     int ctx = 0;
4953
4954     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4955     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4956         ctx++;
4957
4958     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4959         ctx++;
4960
4961     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4962         return 0;
4963
4964     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4965         return 1;
4966     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4967         return 2;
4968     else
4969         return 3;
4970 }
4971
4972 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4973     int cbp_b, cbp_a, ctx, cbp = 0;
4974
4975     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4976     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4977
4978     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4979     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4980     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4981     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4982     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4983     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4984     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4985     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4986     return cbp;
4987 }
4988 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4989     int ctx;
4990     int cbp_a, cbp_b;
4991
4992     cbp_a = (h->left_cbp>>4)&0x03;
4993     cbp_b = (h-> top_cbp>>4)&0x03;
4994
4995     ctx = 0;
4996     if( cbp_a > 0 ) ctx++;
4997     if( cbp_b > 0 ) ctx += 2;
4998     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4999         return 0;
5000
5001     ctx = 4;
5002     if( cbp_a == 2 ) ctx++;
5003     if( cbp_b == 2 ) ctx += 2;
5004     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5005 }
5006 static int decode_cabac_mb_dqp( H264Context *h) {
5007     int   ctx = 0;
5008     int   val = 0;
5009
5010     if( h->last_qscale_diff != 0 )
5011         ctx++;
5012
5013     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5014         if( ctx < 2 )
5015             ctx = 2;
5016         else
5017             ctx = 3;
5018         val++;
5019         if(val > 102) //prevent infinite loop
5020             return INT_MIN;
5021     }
5022
5023     if( val&0x01 )
5024         return (val + 1)/2;
5025     else
5026         return -(val + 1)/2;
5027 }
5028 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5029     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5030         return 0;   /* 8x8 */
5031     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5032         return 1;   /* 8x4 */
5033     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5034         return 2;   /* 4x8 */
5035     return 3;       /* 4x4 */
5036 }
5037 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5038     int type;
5039     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5040         return 0;   /* B_Direct_8x8 */
5041     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5042         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5043     type = 3;
5044     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5045         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5046             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5047         type += 4;
5048     }
5049     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5050     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5051     return type;
5052 }
5053
5054 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5055     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5056 }
5057
5058 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5059     int refa = h->ref_cache[list][scan8[n] - 1];
5060     int refb = h->ref_cache[list][scan8[n] - 8];
5061     int ref  = 0;
5062     int ctx  = 0;
5063
5064     if( h->slice_type_nos == FF_B_TYPE) {
5065         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5066             ctx++;
5067         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5068             ctx += 2;
5069     } else {
5070         if( refa > 0 )
5071             ctx++;
5072         if( refb > 0 )
5073             ctx += 2;
5074     }
5075
5076     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5077         ref++;
5078         if( ctx < 4 )
5079             ctx = 4;
5080         else
5081             ctx = 5;
5082         if(ref >= 32 /*h->ref_list[list]*/){
5083             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5084             return 0; //FIXME we should return -1 and check the return everywhere
5085         }
5086     }
5087     return ref;
5088 }
5089
5090 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5091     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5092                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5093     int ctxbase = (l == 0) ? 40 : 47;
5094     int ctx, mvd;
5095
5096     if( amvd < 3 )
5097         ctx = 0;
5098     else if( amvd > 32 )
5099         ctx = 2;
5100     else
5101         ctx = 1;
5102
5103     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5104         return 0;
5105
5106     mvd= 1;
5107     ctx= 3;
5108     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5109         mvd++;
5110         if( ctx < 6 )
5111             ctx++;
5112     }
5113
5114     if( mvd >= 9 ) {
5115         int k = 3;
5116         while( get_cabac_bypass( &h->cabac ) ) {
5117             mvd += 1 << k;
5118             k++;
5119             if(k>24){
5120                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5121                 return INT_MIN;
5122             }
5123         }
5124         while( k-- ) {
5125             if( get_cabac_bypass( &h->cabac ) )
5126                 mvd += 1 << k;
5127         }
5128     }
5129     return get_cabac_bypass_sign( &h->cabac, -mvd );
5130 }
5131
5132 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5133     int nza, nzb;
5134     int ctx = 0;
5135
5136     if( is_dc ) {
5137         if( cat == 0 ) {
5138             nza = h->left_cbp&0x100;
5139             nzb = h-> top_cbp&0x100;
5140         } else {
5141             nza = (h->left_cbp>>(6+idx))&0x01;
5142             nzb = (h-> top_cbp>>(6+idx))&0x01;
5143         }
5144     } else {
5145         if( cat == 4 ) {
5146             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5147             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5148         } else {
5149             assert(cat == 1 || cat == 2);
5150             nza = h->non_zero_count_cache[scan8[idx] - 1];
5151             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5152         }
5153     }
5154
5155     if( nza > 0 )
5156         ctx++;
5157
5158     if( nzb > 0 )
5159         ctx += 2;
5160
5161     return ctx + 4 * cat;
5162 }
5163
5164 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5165     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5166     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5167     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5168     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5169 };
5170
5171 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5172     static const int significant_coeff_flag_offset[2][6] = {
5173       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5174       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5175     };
5176     static const int last_coeff_flag_offset[2][6] = {
5177       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5178       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5179     };
5180     static const int coeff_abs_level_m1_offset[6] = {
5181         227+0, 227+10, 227+20, 227+30, 227+39, 426
5182     };
5183     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5184       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5185         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5186         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5187        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5188       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5189         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5190         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5191         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5192     };
5193     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5194      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5195      * map node ctx => cabac ctx for level=1 */
5196     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5197     /* map node ctx => cabac ctx for level>1 */
5198     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5199     static const uint8_t coeff_abs_level_transition[2][8] = {
5200     /* update node ctx after decoding a level=1 */
5201         { 1, 2, 3, 3, 4, 5, 6, 7 },
5202     /* update node ctx after decoding a level>1 */
5203         { 4, 4, 4, 4, 5, 6, 7, 7 }
5204     };
5205
5206     int index[64];
5207
5208     int av_unused last;
5209     int coeff_count = 0;
5210     int node_ctx = 0;
5211
5212     uint8_t *significant_coeff_ctx_base;
5213     uint8_t *last_coeff_ctx_base;
5214     uint8_t *abs_level_m1_ctx_base;
5215
5216 #ifndef ARCH_X86
5217 #define CABAC_ON_STACK
5218 #endif
5219 #ifdef CABAC_ON_STACK
5220 #define CC &cc
5221     CABACContext cc;
5222     cc.range     = h->cabac.range;
5223     cc.low       = h->cabac.low;
5224     cc.bytestream= h->cabac.bytestream;
5225 #else
5226 #define CC &h->cabac
5227 #endif
5228
5229
5230     /* cat: 0-> DC 16x16  n = 0
5231      *      1-> AC 16x16  n = luma4x4idx
5232      *      2-> Luma4x4   n = luma4x4idx
5233      *      3-> DC Chroma n = iCbCr
5234      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5235      *      5-> Luma8x8   n = 4 * luma8x8idx
5236      */
5237
5238     /* read coded block flag */
5239     if( is_dc || cat != 5 ) {
5240         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5241             if( !is_dc ) {
5242                 if( cat == 4 )
5243                     h->non_zero_count_cache[scan8[16+n]] = 0;
5244                 else
5245                     h->non_zero_count_cache[scan8[n]] = 0;
5246             }
5247
5248 #ifdef CABAC_ON_STACK
5249             h->cabac.range     = cc.range     ;
5250             h->cabac.low       = cc.low       ;
5251             h->cabac.bytestream= cc.bytestream;
5252 #endif
5253             return;
5254         }
5255     }
5256
5257     significant_coeff_ctx_base = h->cabac_state
5258         + significant_coeff_flag_offset[MB_FIELD][cat];
5259     last_coeff_ctx_base = h->cabac_state
5260         + last_coeff_flag_offset[MB_FIELD][cat];
5261     abs_level_m1_ctx_base = h->cabac_state
5262         + coeff_abs_level_m1_offset[cat];
5263
5264     if( !is_dc && cat == 5 ) {
5265 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5266         for(last= 0; last < coefs; last++) { \
5267             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5268             if( get_cabac( CC, sig_ctx )) { \
5269                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5270                 index[coeff_count++] = last; \
5271                 if( get_cabac( CC, last_ctx ) ) { \
5272                     last= max_coeff; \
5273                     break; \
5274                 } \
5275             } \
5276         }\
5277         if( last == max_coeff -1 ) {\
5278             index[coeff_count++] = last;\
5279         }
5280         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5281 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5282         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5283     } else {
5284         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5285 #else
5286         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5287     } else {
5288         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5289 #endif
5290     }
5291     assert(coeff_count > 0);
5292
5293     if( is_dc ) {
5294         if( cat == 0 )
5295             h->cbp_table[h->mb_xy] |= 0x100;
5296         else
5297             h->cbp_table[h->mb_xy] |= 0x40 << n;
5298     } else {
5299         if( cat == 5 )
5300             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5301         else if( cat == 4 )
5302             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5303         else {
5304             assert( cat == 1 || cat == 2 );
5305             h->non_zero_count_cache[scan8[n]] = coeff_count;
5306         }
5307     }
5308
5309     do {
5310         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5311
5312         int j= scantable[index[--coeff_count]];
5313
5314         if( get_cabac( CC, ctx ) == 0 ) {
5315             node_ctx = coeff_abs_level_transition[0][node_ctx];
5316             if( is_dc ) {
5317                 block[j] = get_cabac_bypass_sign( CC, -1);
5318             }else{
5319                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5320             }
5321         } else {
5322             int coeff_abs = 2;
5323             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5324             node_ctx = coeff_abs_level_transition[1][node_ctx];
5325
5326             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5327                 coeff_abs++;
5328             }
5329
5330             if( coeff_abs >= 15 ) {
5331                 int j = 0;
5332                 while( get_cabac_bypass( CC ) ) {
5333                     j++;
5334                 }
5335
5336                 coeff_abs=1;
5337                 while( j-- ) {
5338                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5339                 }
5340                 coeff_abs+= 14;
5341             }
5342
5343             if( is_dc ) {
5344                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5345             }else{
5346                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5347             }
5348         }
5349     } while( coeff_count );
5350 #ifdef CABAC_ON_STACK
5351             h->cabac.range     = cc.range     ;
5352             h->cabac.low       = cc.low       ;
5353             h->cabac.bytestream= cc.bytestream;
5354 #endif
5355
5356 }
5357
5358 #ifndef CONFIG_SMALL
5359 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5360     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5361 }
5362
5363 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5364     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5365 }
5366 #endif
5367
5368 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5369 #ifdef CONFIG_SMALL
5370     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5371 #else
5372     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5373     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5374 #endif
5375 }
5376
5377 static inline void compute_mb_neighbors(H264Context *h)
5378 {
5379     MpegEncContext * const s = &h->s;
5380     const int mb_xy  = h->mb_xy;
5381     h->top_mb_xy     = mb_xy - s->mb_stride;
5382     h->left_mb_xy[0] = mb_xy - 1;
5383     if(FRAME_MBAFF){
5384         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5385         const int top_pair_xy      = pair_xy     - s->mb_stride;
5386         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5387         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5388         const int curr_mb_frame_flag = !MB_FIELD;
5389         const int bottom = (s->mb_y & 1);
5390         if (bottom
5391                 ? !curr_mb_frame_flag // bottom macroblock
5392                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5393                 ) {
5394             h->top_mb_xy -= s->mb_stride;
5395         }
5396         if (left_mb_frame_flag != curr_mb_frame_flag) {
5397             h->left_mb_xy[0] = pair_xy - 1;
5398         }
5399     } else if (FIELD_PICTURE) {
5400         h->top_mb_xy -= s->mb_stride;
5401     }
5402     return;
5403 }
5404
5405 /**
5406  * decodes a macroblock
5407  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5408  */
5409 static int decode_mb_cabac(H264Context *h) {
5410     MpegEncContext * const s = &h->s;
5411     int mb_xy;
5412     int mb_type, partition_count, cbp = 0;
5413     int dct8x8_allowed= h->pps.transform_8x8_mode;
5414
5415     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5416
5417     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5418
5419     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5420     if( h->slice_type_nos != FF_I_TYPE ) {
5421         int skip;
5422         /* a skipped mb needs the aff flag from the following mb */
5423         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5424             predict_field_decoding_flag(h);
5425         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5426             skip = h->next_mb_skipped;
5427         else
5428             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5429         /* read skip flags */
5430         if( skip ) {
5431             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5432                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5433                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5434                 if(h->next_mb_skipped)
5435                     predict_field_decoding_flag(h);
5436                 else
5437                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5438             }
5439
5440             decode_mb_skip(h);
5441
5442             h->cbp_table[mb_xy] = 0;
5443             h->chroma_pred_mode_table[mb_xy] = 0;
5444             h->last_qscale_diff = 0;
5445
5446             return 0;
5447
5448         }
5449     }
5450     if(FRAME_MBAFF){
5451         if( (s->mb_y&1) == 0 )
5452             h->mb_mbaff =
5453             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5454     }
5455
5456     h->prev_mb_skipped = 0;
5457
5458     compute_mb_neighbors(h);
5459     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5460         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5461         return -1;
5462     }
5463
5464     if( h->slice_type_nos == FF_B_TYPE ) {
5465         if( mb_type < 23 ){
5466             partition_count= b_mb_type_info[mb_type].partition_count;
5467             mb_type=         b_mb_type_info[mb_type].type;
5468         }else{
5469             mb_type -= 23;
5470             goto decode_intra_mb;
5471         }
5472     } else if( h->slice_type_nos == FF_P_TYPE ) {
5473         if( mb_type < 5) {
5474             partition_count= p_mb_type_info[mb_type].partition_count;
5475             mb_type=         p_mb_type_info[mb_type].type;
5476         } else {
5477             mb_type -= 5;
5478             goto decode_intra_mb;
5479         }
5480     } else {
5481         if(h->slice_type == FF_SI_TYPE && mb_type)
5482             mb_type--;
5483         assert(h->slice_type_nos == FF_I_TYPE);
5484 decode_intra_mb:
5485         partition_count = 0;
5486         cbp= i_mb_type_info[mb_type].cbp;
5487         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5488         mb_type= i_mb_type_info[mb_type].type;
5489     }
5490     if(MB_FIELD)
5491         mb_type |= MB_TYPE_INTERLACED;
5492
5493     h->slice_table[ mb_xy ]= h->slice_num;
5494
5495     if(IS_INTRA_PCM(mb_type)) {
5496         const uint8_t *ptr;
5497
5498         // We assume these blocks are very rare so we do not optimize it.
5499         // FIXME The two following lines get the bitstream position in the cabac
5500         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5501         ptr= h->cabac.bytestream;
5502         if(h->cabac.low&0x1) ptr--;
5503         if(CABAC_BITS==16){
5504             if(h->cabac.low&0x1FF) ptr--;
5505         }
5506
5507         // The pixels are stored in the same order as levels in h->mb array.
5508         memcpy(h->mb, ptr, 256); ptr+=256;
5509         if(CHROMA){
5510             memcpy(h->mb+128, ptr, 128); ptr+=128;
5511         }
5512
5513         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5514
5515         // All blocks are present
5516         h->cbp_table[mb_xy] = 0x1ef;
5517         h->chroma_pred_mode_table[mb_xy] = 0;
5518         // In deblocking, the quantizer is 0
5519         s->current_picture.qscale_table[mb_xy]= 0;
5520         // All coeffs are present
5521         memset(h->non_zero_count[mb_xy], 16, 16);
5522         s->current_picture.mb_type[mb_xy]= mb_type;
5523         h->last_qscale_diff = 0;
5524         return 0;
5525     }
5526
5527     if(MB_MBAFF){
5528         h->ref_count[0] <<= 1;
5529         h->ref_count[1] <<= 1;
5530     }
5531
5532     fill_caches(h, mb_type, 0);
5533
5534     if( IS_INTRA( mb_type ) ) {
5535         int i, pred_mode;
5536         if( IS_INTRA4x4( mb_type ) ) {
5537             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5538                 mb_type |= MB_TYPE_8x8DCT;
5539                 for( i = 0; i < 16; i+=4 ) {
5540                     int pred = pred_intra_mode( h, i );
5541                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5542                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5543                 }
5544             } else {
5545                 for( i = 0; i < 16; i++ ) {
5546                     int pred = pred_intra_mode( h, i );
5547                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5548
5549                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5550                 }
5551             }
5552             write_back_intra_pred_mode(h);
5553             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5554         } else {
5555             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5556             if( h->intra16x16_pred_mode < 0 ) return -1;
5557         }
5558         if(CHROMA){
5559             h->chroma_pred_mode_table[mb_xy] =
5560             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5561
5562             pred_mode= check_intra_pred_mode( h, pred_mode );
5563             if( pred_mode < 0 ) return -1;
5564             h->chroma_pred_mode= pred_mode;
5565         }
5566     } else if( partition_count == 4 ) {
5567         int i, j, sub_partition_count[4], list, ref[2][4];
5568
5569         if( h->slice_type_nos == FF_B_TYPE ) {
5570             for( i = 0; i < 4; i++ ) {
5571                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5572                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5573                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5574             }
5575             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5576                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5577                 pred_direct_motion(h, &mb_type);
5578                 h->ref_cache[0][scan8[4]] =
5579                 h->ref_cache[1][scan8[4]] =
5580                 h->ref_cache[0][scan8[12]] =
5581                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5582                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5583                     for( i = 0; i < 4; i++ )
5584                         if( IS_DIRECT(h->sub_mb_type[i]) )
5585                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5586                 }
5587             }
5588         } else {
5589             for( i = 0; i < 4; i++ ) {
5590                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5591                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5592                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5593             }
5594         }
5595
5596         for( list = 0; list < h->list_count; list++ ) {
5597                 for( i = 0; i < 4; i++ ) {
5598                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5599                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5600                         if( h->ref_count[list] > 1 )
5601                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5602                         else
5603                             ref[list][i] = 0;
5604                     } else {
5605                         ref[list][i] = -1;
5606                     }
5607                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5608                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5609                 }
5610         }
5611
5612         if(dct8x8_allowed)
5613             dct8x8_allowed = get_dct8x8_allowed(h);
5614
5615         for(list=0; list<h->list_count; list++){
5616             for(i=0; i<4; i++){
5617                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5618                 if(IS_DIRECT(h->sub_mb_type[i])){
5619                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5620                     continue;
5621                 }
5622
5623                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5624                     const int sub_mb_type= h->sub_mb_type[i];
5625                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5626                     for(j=0; j<sub_partition_count[i]; j++){
5627                         int mpx, mpy;
5628                         int mx, my;
5629                         const int index= 4*i + block_width*j;
5630                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5631                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5632                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5633
5634                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5635                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5636                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5637
5638                         if(IS_SUB_8X8(sub_mb_type)){
5639                             mv_cache[ 1 ][0]=
5640                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5641                             mv_cache[ 1 ][1]=
5642                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5643
5644                             mvd_cache[ 1 ][0]=
5645                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5646                             mvd_cache[ 1 ][1]=
5647                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5648                         }else if(IS_SUB_8X4(sub_mb_type)){
5649                             mv_cache[ 1 ][0]= mx;
5650                             mv_cache[ 1 ][1]= my;
5651
5652                             mvd_cache[ 1 ][0]= mx - mpx;
5653                             mvd_cache[ 1 ][1]= my - mpy;
5654                         }else if(IS_SUB_4X8(sub_mb_type)){
5655                             mv_cache[ 8 ][0]= mx;
5656                             mv_cache[ 8 ][1]= my;
5657
5658                             mvd_cache[ 8 ][0]= mx - mpx;
5659                             mvd_cache[ 8 ][1]= my - mpy;
5660                         }
5661                         mv_cache[ 0 ][0]= mx;
5662                         mv_cache[ 0 ][1]= my;
5663
5664                         mvd_cache[ 0 ][0]= mx - mpx;
5665                         mvd_cache[ 0 ][1]= my - mpy;
5666                     }
5667                 }else{
5668                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5669                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5670                     p[0] = p[1] = p[8] = p[9] = 0;
5671                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5672                 }
5673             }
5674         }
5675     } else if( IS_DIRECT(mb_type) ) {
5676         pred_direct_motion(h, &mb_type);
5677         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5678         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5679         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5680     } else {
5681         int list, mx, my, i, mpx, mpy;
5682         if(IS_16X16(mb_type)){
5683             for(list=0; list<h->list_count; list++){
5684                 if(IS_DIR(mb_type, 0, list)){
5685                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5686                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5687                 }else
5688                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5689             }
5690             for(list=0; list<h->list_count; list++){
5691                 if(IS_DIR(mb_type, 0, list)){
5692                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5693
5694                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5695                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5696                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5697
5698                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5699                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5700                 }else
5701                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5702             }
5703         }
5704         else if(IS_16X8(mb_type)){
5705             for(list=0; list<h->list_count; list++){
5706                     for(i=0; i<2; i++){
5707                         if(IS_DIR(mb_type, i, list)){
5708                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5709                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5710                         }else
5711                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5712                     }
5713             }
5714             for(list=0; list<h->list_count; list++){
5715                 for(i=0; i<2; i++){
5716                     if(IS_DIR(mb_type, i, list)){
5717                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5718                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5719                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5720                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5721
5722                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5723                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5724                     }else{
5725                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5726                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5727                     }
5728                 }
5729             }
5730         }else{
5731             assert(IS_8X16(mb_type));
5732             for(list=0; list<h->list_count; list++){
5733                     for(i=0; i<2; i++){
5734                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5735                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5736                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5737                         }else
5738                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5739                     }
5740             }
5741             for(list=0; list<h->list_count; list++){
5742                 for(i=0; i<2; i++){
5743                     if(IS_DIR(mb_type, i, list)){
5744                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5745                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5746                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5747
5748                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5749                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5750                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5751                     }else{
5752                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5753                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5754                     }
5755                 }
5756             }
5757         }
5758     }
5759
5760    if( IS_INTER( mb_type ) ) {
5761         h->chroma_pred_mode_table[mb_xy] = 0;
5762         write_back_motion( h, mb_type );
5763    }
5764
5765     if( !IS_INTRA16x16( mb_type ) ) {
5766         cbp  = decode_cabac_mb_cbp_luma( h );
5767         if(CHROMA)
5768             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5769     }
5770
5771     h->cbp_table[mb_xy] = h->cbp = cbp;
5772
5773     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5774         if( decode_cabac_mb_transform_size( h ) )
5775             mb_type |= MB_TYPE_8x8DCT;
5776     }
5777     s->current_picture.mb_type[mb_xy]= mb_type;
5778
5779     if( cbp || IS_INTRA16x16( mb_type ) ) {
5780         const uint8_t *scan, *scan8x8, *dc_scan;
5781         const uint32_t *qmul;
5782         int dqp;
5783
5784         if(IS_INTERLACED(mb_type)){
5785             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5786             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5787             dc_scan= luma_dc_field_scan;
5788         }else{
5789             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5790             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5791             dc_scan= luma_dc_zigzag_scan;
5792         }
5793
5794         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5795         if( dqp == INT_MIN ){
5796             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5797             return -1;
5798         }
5799         s->qscale += dqp;
5800         if(((unsigned)s->qscale) > 51){
5801             if(s->qscale<0) s->qscale+= 52;
5802             else            s->qscale-= 52;
5803         }
5804         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5805         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5806
5807         if( IS_INTRA16x16( mb_type ) ) {
5808             int i;
5809             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5810             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5811
5812             if( cbp&15 ) {
5813                 qmul = h->dequant4_coeff[0][s->qscale];
5814                 for( i = 0; i < 16; i++ ) {
5815                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5816                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5817                 }
5818             } else {
5819                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5820             }
5821         } else {
5822             int i8x8, i4x4;
5823             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5824                 if( cbp & (1<<i8x8) ) {
5825                     if( IS_8x8DCT(mb_type) ) {
5826                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5827                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5828                     } else {
5829                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5830                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5831                             const int index = 4*i8x8 + i4x4;
5832                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5833 //START_TIMER
5834                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5835 //STOP_TIMER("decode_residual")
5836                         }
5837                     }
5838                 } else {
5839                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5840                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5841                 }
5842             }
5843         }
5844
5845         if( cbp&0x30 ){
5846             int c;
5847             for( c = 0; c < 2; c++ ) {
5848                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5849                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5850             }
5851         }
5852
5853         if( cbp&0x20 ) {
5854             int c, i;
5855             for( c = 0; c < 2; c++ ) {
5856                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5857                 for( i = 0; i < 4; i++ ) {
5858                     const int index = 16 + 4 * c + i;
5859                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5860                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5861                 }
5862             }
5863         } else {
5864             uint8_t * const nnz= &h->non_zero_count_cache[0];
5865             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5866             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5867         }
5868     } else {
5869         uint8_t * const nnz= &h->non_zero_count_cache[0];
5870         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5871         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5872         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5873         h->last_qscale_diff = 0;
5874     }
5875
5876     s->current_picture.qscale_table[mb_xy]= s->qscale;
5877     write_back_non_zero_count(h);
5878
5879     if(MB_MBAFF){
5880         h->ref_count[0] >>= 1;
5881         h->ref_count[1] >>= 1;
5882     }
5883
5884     return 0;
5885 }
5886
5887
5888 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5889     int i, d;
5890     const int index_a = qp + h->slice_alpha_c0_offset;
5891     const int alpha = (alpha_table+52)[index_a];
5892     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5893
5894     if( bS[0] < 4 ) {
5895         int8_t tc[4];
5896         for(i=0; i<4; i++)
5897             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5898         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5899     } else {
5900         /* 16px edge length, because bS=4 is triggered by being at
5901          * the edge of an intra MB, so all 4 bS are the same */
5902             for( d = 0; d < 16; d++ ) {
5903                 const int p0 = pix[-1];
5904                 const int p1 = pix[-2];
5905                 const int p2 = pix[-3];
5906
5907                 const int q0 = pix[0];
5908                 const int q1 = pix[1];
5909                 const int q2 = pix[2];
5910
5911                 if( FFABS( p0 - q0 ) < alpha &&
5912                     FFABS( p1 - p0 ) < beta &&
5913                     FFABS( q1 - q0 ) < beta ) {
5914
5915                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5916                         if( FFABS( p2 - p0 ) < beta)
5917                         {
5918                             const int p3 = pix[-4];
5919                             /* p0', p1', p2' */
5920                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5921                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5922                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5923                         } else {
5924                             /* p0' */
5925                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5926                         }
5927                         if( FFABS( q2 - q0 ) < beta)
5928                         {
5929                             const int q3 = pix[3];
5930                             /* q0', q1', q2' */
5931                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5932                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5933                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5934                         } else {
5935                             /* q0' */
5936                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5937                         }
5938                     }else{
5939                         /* p0', q0' */
5940                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5941                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5942                     }
5943                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5944                 }
5945                 pix += stride;
5946             }
5947     }
5948 }
5949 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5950     int i;
5951     const int index_a = qp + h->slice_alpha_c0_offset;
5952     const int alpha = (alpha_table+52)[index_a];
5953     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5954
5955     if( bS[0] < 4 ) {
5956         int8_t tc[4];
5957         for(i=0; i<4; i++)
5958             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5959         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5960     } else {
5961         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5962     }
5963 }
5964
5965 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5966     int i;
5967     for( i = 0; i < 16; i++, pix += stride) {
5968         int index_a;
5969         int alpha;
5970         int beta;
5971
5972         int qp_index;
5973         int bS_index = (i >> 1);
5974         if (!MB_FIELD) {
5975             bS_index &= ~1;
5976             bS_index |= (i & 1);
5977         }
5978
5979         if( bS[bS_index] == 0 ) {
5980             continue;
5981         }
5982
5983         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5984         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5985         alpha = (alpha_table+52)[index_a];
5986         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5987
5988         if( bS[bS_index] < 4 ) {
5989             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5990             const int p0 = pix[-1];
5991             const int p1 = pix[-2];
5992             const int p2 = pix[-3];
5993             const int q0 = pix[0];
5994             const int q1 = pix[1];
5995             const int q2 = pix[2];
5996
5997             if( FFABS( p0 - q0 ) < alpha &&
5998                 FFABS( p1 - p0 ) < beta &&
5999                 FFABS( q1 - q0 ) < beta ) {
6000                 int tc = tc0;
6001                 int i_delta;
6002
6003                 if( FFABS( p2 - p0 ) < beta ) {
6004                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6005                     tc++;
6006                 }
6007                 if( FFABS( q2 - q0 ) < beta ) {
6008                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6009                     tc++;
6010                 }
6011
6012                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6013                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6014                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6015                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6016             }
6017         }else{
6018             const int p0 = pix[-1];
6019             const int p1 = pix[-2];
6020             const int p2 = pix[-3];
6021
6022             const int q0 = pix[0];
6023             const int q1 = pix[1];
6024             const int q2 = pix[2];
6025
6026             if( FFABS( p0 - q0 ) < alpha &&
6027                 FFABS( p1 - p0 ) < beta &&
6028                 FFABS( q1 - q0 ) < beta ) {
6029
6030                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6031                     if( FFABS( p2 - p0 ) < beta)
6032                     {
6033                         const int p3 = pix[-4];
6034                         /* p0', p1', p2' */
6035                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6036                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6037                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6038                     } else {
6039                         /* p0' */
6040                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6041                     }
6042                     if( FFABS( q2 - q0 ) < beta)
6043                     {
6044                         const int q3 = pix[3];
6045                         /* q0', q1', q2' */
6046                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6047                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6048                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6049                     } else {
6050                         /* q0' */
6051                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6052                     }
6053                 }else{
6054                     /* p0', q0' */
6055                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6056                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6057                 }
6058                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6059             }
6060         }
6061     }
6062 }
6063 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6064     int i;
6065     for( i = 0; i < 8; i++, pix += stride) {
6066         int index_a;
6067         int alpha;
6068         int beta;
6069
6070         int qp_index;
6071         int bS_index = i;
6072
6073         if( bS[bS_index] == 0 ) {
6074             continue;
6075         }
6076
6077         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6078         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6079         alpha = (alpha_table+52)[index_a];
6080         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6081
6082         if( bS[bS_index] < 4 ) {
6083             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6084             const int p0 = pix[-1];
6085             const int p1 = pix[-2];
6086             const int q0 = pix[0];
6087             const int q1 = pix[1];
6088
6089             if( FFABS( p0 - q0 ) < alpha &&
6090                 FFABS( p1 - p0 ) < beta &&
6091                 FFABS( q1 - q0 ) < beta ) {
6092                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6093
6094                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6095                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6096                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6097             }
6098         }else{
6099             const int p0 = pix[-1];
6100             const int p1 = pix[-2];
6101             const int q0 = pix[0];
6102             const int q1 = pix[1];
6103
6104             if( FFABS( p0 - q0 ) < alpha &&
6105                 FFABS( p1 - p0 ) < beta &&
6106                 FFABS( q1 - q0 ) < beta ) {
6107
6108                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6109                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6110                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6111             }
6112         }
6113     }
6114 }
6115
6116 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6117     int i, d;
6118     const int index_a = qp + h->slice_alpha_c0_offset;
6119     const int alpha = (alpha_table+52)[index_a];
6120     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6121     const int pix_next  = stride;
6122
6123     if( bS[0] < 4 ) {
6124         int8_t tc[4];
6125         for(i=0; i<4; i++)
6126             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6127         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6128     } else {
6129         /* 16px edge length, see filter_mb_edgev */
6130             for( d = 0; d < 16; d++ ) {
6131                 const int p0 = pix[-1*pix_next];
6132                 const int p1 = pix[-2*pix_next];
6133                 const int p2 = pix[-3*pix_next];
6134                 const int q0 = pix[0];
6135                 const int q1 = pix[1*pix_next];
6136                 const int q2 = pix[2*pix_next];
6137
6138                 if( FFABS( p0 - q0 ) < alpha &&
6139                     FFABS( p1 - p0 ) < beta &&
6140                     FFABS( q1 - q0 ) < beta ) {
6141
6142                     const int p3 = pix[-4*pix_next];
6143                     const int q3 = pix[ 3*pix_next];
6144
6145                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6146                         if( FFABS( p2 - p0 ) < beta) {
6147                             /* p0', p1', p2' */
6148                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6149                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6150                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6151                         } else {
6152                             /* p0' */
6153                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6154                         }
6155                         if( FFABS( q2 - q0 ) < beta) {
6156                             /* q0', q1', q2' */
6157                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6158                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6159                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6160                         } else {
6161                             /* q0' */
6162                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6163                         }
6164                     }else{
6165                         /* p0', q0' */
6166                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6167                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6168                     }
6169                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6170                 }
6171                 pix++;
6172             }
6173     }
6174 }
6175
6176 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6177     int i;
6178     const int index_a = qp + h->slice_alpha_c0_offset;
6179     const int alpha = (alpha_table+52)[index_a];
6180     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6181
6182     if( bS[0] < 4 ) {
6183         int8_t tc[4];
6184         for(i=0; i<4; i++)
6185             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6186         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6187     } else {
6188         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6189     }
6190 }
6191
6192 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6193     MpegEncContext * const s = &h->s;
6194     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6195     int mb_xy, mb_type;
6196     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6197
6198     mb_xy = h->mb_xy;
6199
6200     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6201 1 ||
6202        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6203                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6204         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6205         return;
6206     }
6207     assert(!FRAME_MBAFF);
6208
6209     mb_type = s->current_picture.mb_type[mb_xy];
6210     qp = s->current_picture.qscale_table[mb_xy];
6211     qp0 = s->current_picture.qscale_table[mb_xy-1];
6212     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6213     qpc = get_chroma_qp( h, 0, qp );
6214     qpc0 = get_chroma_qp( h, 0, qp0 );
6215     qpc1 = get_chroma_qp( h, 0, qp1 );
6216     qp0 = (qp + qp0 + 1) >> 1;
6217     qp1 = (qp + qp1 + 1) >> 1;
6218     qpc0 = (qpc + qpc0 + 1) >> 1;
6219     qpc1 = (qpc + qpc1 + 1) >> 1;
6220     qp_thresh = 15 - h->slice_alpha_c0_offset;
6221     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6222        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6223         return;
6224
6225     if( IS_INTRA(mb_type) ) {
6226         int16_t bS4[4] = {4,4,4,4};
6227         int16_t bS3[4] = {3,3,3,3};
6228         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6229         if( IS_8x8DCT(mb_type) ) {
6230             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6231             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6232             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6233             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6234         } else {
6235             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6236             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6237             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6238             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6239             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6240             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6241             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6242             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6243         }
6244         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6245         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6246         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6247         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6248         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6249         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6250         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6251         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6252         return;
6253     } else {
6254         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6255         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6256         int edges;
6257         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6258             edges = 4;
6259             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6260         } else {
6261             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6262                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6263             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6264                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6265                              ? 3 : 0;
6266             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6267             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6268             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6269                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6270         }
6271         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6272             bSv[0][0] = 0x0004000400040004ULL;
6273         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6274             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6275
6276 #define FILTER(hv,dir,edge)\
6277         if(bSv[dir][edge]) {\
6278             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6279             if(!(edge&1)) {\
6280                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6281                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6282             }\
6283         }
6284         if( edges == 1 ) {
6285             FILTER(v,0,0);
6286             FILTER(h,1,0);
6287         } else if( IS_8x8DCT(mb_type) ) {
6288             FILTER(v,0,0);
6289             FILTER(v,0,2);
6290             FILTER(h,1,0);
6291             FILTER(h,1,2);
6292         } else {
6293             FILTER(v,0,0);
6294             FILTER(v,0,1);
6295             FILTER(v,0,2);
6296             FILTER(v,0,3);
6297             FILTER(h,1,0);
6298             FILTER(h,1,1);
6299             FILTER(h,1,2);
6300             FILTER(h,1,3);
6301         }
6302 #undef FILTER
6303     }
6304 }
6305
6306 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6307     MpegEncContext * const s = &h->s;
6308     const int mb_xy= mb_x + mb_y*s->mb_stride;
6309     const int mb_type = s->current_picture.mb_type[mb_xy];
6310     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6311     int first_vertical_edge_done = 0;
6312     int dir;
6313
6314     //for sufficiently low qp, filtering wouldn't do anything
6315     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6316     if(!FRAME_MBAFF){
6317         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6318         int qp = s->current_picture.qscale_table[mb_xy];
6319         if(qp <= qp_thresh
6320            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6321            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6322             return;
6323         }
6324     }
6325
6326     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6327     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6328         int top_type, left_type[2];
6329         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6330         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6331         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6332
6333         if(IS_8x8DCT(top_type)){
6334             h->non_zero_count_cache[4+8*0]=
6335             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6336             h->non_zero_count_cache[6+8*0]=
6337             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6338         }
6339         if(IS_8x8DCT(left_type[0])){
6340             h->non_zero_count_cache[3+8*1]=
6341             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6342         }
6343         if(IS_8x8DCT(left_type[1])){
6344             h->non_zero_count_cache[3+8*3]=
6345             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6346         }
6347
6348         if(IS_8x8DCT(mb_type)){
6349             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6350             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6351
6352             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6353             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6354
6355             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6356             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6357
6358             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6359             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6360         }
6361     }
6362
6363     if (FRAME_MBAFF
6364             // left mb is in picture
6365             && h->slice_table[mb_xy-1] != 0xFFFF
6366             // and current and left pair do not have the same interlaced type
6367             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6368             // and left mb is in the same slice if deblocking_filter == 2
6369             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6370         /* First vertical edge is different in MBAFF frames
6371          * There are 8 different bS to compute and 2 different Qp
6372          */
6373         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6374         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6375         int16_t bS[8];
6376         int qp[2];
6377         int bqp[2];
6378         int rqp[2];
6379         int mb_qp, mbn0_qp, mbn1_qp;
6380         int i;
6381         first_vertical_edge_done = 1;
6382
6383         if( IS_INTRA(mb_type) )
6384             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6385         else {
6386             for( i = 0; i < 8; i++ ) {
6387                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6388
6389                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6390                     bS[i] = 4;
6391                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6392                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6393                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6394                                                                        :
6395                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6396                     bS[i] = 2;
6397                 else
6398                     bS[i] = 1;
6399             }
6400         }
6401
6402         mb_qp = s->current_picture.qscale_table[mb_xy];
6403         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6404         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6405         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6406         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6407                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6408         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6409                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6410         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6411         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6412                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6413         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6414                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6415
6416         /* Filter edge */
6417         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6418         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6419         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6420         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6421         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6422     }
6423     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6424     for( dir = 0; dir < 2; dir++ )
6425     {
6426         int edge;
6427         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6428         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6429         int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6430         int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6431         int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6432
6433         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6434                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6435         // how often to recheck mv-based bS when iterating between edges
6436         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6437                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6438         // how often to recheck mv-based bS when iterating along each edge
6439         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6440
6441         if (first_vertical_edge_done) {
6442             start = 1;
6443             first_vertical_edge_done = 0;
6444         }
6445
6446         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6447             start = 1;
6448
6449         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6450             && !IS_INTERLACED(mb_type)
6451             && IS_INTERLACED(mbm_type)
6452             ) {
6453             // This is a special case in the norm where the filtering must
6454             // be done twice (one each of the field) even if we are in a
6455             // frame macroblock.
6456             //
6457             static const int nnz_idx[4] = {4,5,6,3};
6458             unsigned int tmp_linesize   = 2 *   linesize;
6459             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6460             int mbn_xy = mb_xy - 2 * s->mb_stride;
6461             int qp;
6462             int i, j;
6463             int16_t bS[4];
6464
6465             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6466                 if( IS_INTRA(mb_type) ||
6467                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6468                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6469                 } else {
6470                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6471                     for( i = 0; i < 4; i++ ) {
6472                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6473                             mbn_nnz[nnz_idx[i]] != 0 )
6474                             bS[i] = 2;
6475                         else
6476                             bS[i] = 1;
6477                     }
6478                 }
6479                 // Do not use s->qscale as luma quantizer because it has not the same
6480                 // value in IPCM macroblocks.
6481                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6482                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6483                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6484                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6485                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6486                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6487                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6488                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6489             }
6490
6491             start = 1;
6492         }
6493
6494         /* Calculate bS */
6495         for( edge = start; edge < edges; edge++ ) {
6496             /* mbn_xy: neighbor macroblock */
6497             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6498             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6499             int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6500             int16_t bS[4];
6501             int qp;
6502
6503             if( (edge&1) && IS_8x8DCT(mb_type) )
6504                 continue;
6505
6506             if( IS_INTRA(mb_type) ||
6507                 IS_INTRA(mbn_type) ) {
6508                 int value;
6509                 if (edge == 0) {
6510                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6511                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6512                     ) {
6513                         value = 4;
6514                     } else {
6515                         value = 3;
6516                     }
6517                 } else {
6518                     value = 3;
6519                 }
6520                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6521             } else {
6522                 int i, l;
6523                 int mv_done;
6524
6525                 if( edge & mask_edge ) {
6526                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6527                     mv_done = 1;
6528                 }
6529                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6530                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6531                     mv_done = 1;
6532                 }
6533                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6534                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6535                     int bn_idx= b_idx - (dir ? 8:1);
6536                     int v = 0;
6537
6538                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6539                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6540                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6541                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6542                     }
6543
6544                     if(h->slice_type_nos == FF_B_TYPE && v){
6545                         v=0;
6546                         for( l = 0; !v && l < 2; l++ ) {
6547                             int ln= 1-l;
6548                             v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6549                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6550                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6551                         }
6552                     }
6553
6554                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6555                     mv_done = 1;
6556                 }
6557                 else
6558                     mv_done = 0;
6559
6560                 for( i = 0; i < 4; i++ ) {
6561                     int x = dir == 0 ? edge : i;
6562                     int y = dir == 0 ? i    : edge;
6563                     int b_idx= 8 + 4 + x + 8*y;
6564                     int bn_idx= b_idx - (dir ? 8:1);
6565
6566                     if( h->non_zero_count_cache[b_idx] != 0 ||
6567                         h->non_zero_count_cache[bn_idx] != 0 ) {
6568                         bS[i] = 2;
6569                     }
6570                     else if(!mv_done)
6571                     {
6572                         bS[i] = 0;
6573                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6574                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6575                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6576                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6577                                 bS[i] = 1;
6578                                 break;
6579                             }
6580                         }
6581
6582                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6583                             bS[i] = 0;
6584                             for( l = 0; l < 2; l++ ) {
6585                                 int ln= 1-l;
6586                                 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6587                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6588                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6589                                     bS[i] = 1;
6590                                     break;
6591                                 }
6592                             }
6593                         }
6594                     }
6595                 }
6596
6597                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6598                     continue;
6599             }
6600
6601             /* Filter edge */
6602             // Do not use s->qscale as luma quantizer because it has not the same
6603             // value in IPCM macroblocks.
6604             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6605             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6606             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6607             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6608             if( dir == 0 ) {
6609                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6610                 if( (edge&1) == 0 ) {
6611                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6612                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6613                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6614                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6615                 }
6616             } else {
6617                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6618                 if( (edge&1) == 0 ) {
6619                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6620                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6621                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6622                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6623                 }
6624             }
6625         }
6626     }
6627 }
6628
6629 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6630     H264Context *h = *(void**)arg;
6631     MpegEncContext * const s = &h->s;
6632     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6633
6634     s->mb_skip_run= -1;
6635
6636     if( h->pps.cabac ) {
6637         int i;
6638
6639         /* realign */
6640         align_get_bits( &s->gb );
6641
6642         /* init cabac */
6643         ff_init_cabac_states( &h->cabac);
6644         ff_init_cabac_decoder( &h->cabac,
6645                                s->gb.buffer + get_bits_count(&s->gb)/8,
6646                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6647         /* calculate pre-state */
6648         for( i= 0; i < 460; i++ ) {
6649             int pre;
6650             if( h->slice_type_nos == FF_I_TYPE )
6651                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6652             else
6653                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6654
6655             if( pre <= 63 )
6656                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6657             else
6658                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6659         }
6660
6661         for(;;){
6662 //START_TIMER
6663             int ret = decode_mb_cabac(h);
6664             int eos;
6665 //STOP_TIMER("decode_mb_cabac")
6666
6667             if(ret>=0) hl_decode_mb(h);
6668
6669             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6670                 s->mb_y++;
6671
6672                 if(ret>=0) ret = decode_mb_cabac(h);
6673
6674                 if(ret>=0) hl_decode_mb(h);
6675                 s->mb_y--;
6676             }
6677             eos = get_cabac_terminate( &h->cabac );
6678
6679             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6680                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6681                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6682                 return -1;
6683             }
6684
6685             if( ++s->mb_x >= s->mb_width ) {
6686                 s->mb_x = 0;
6687                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6688                 ++s->mb_y;
6689                 if(FIELD_OR_MBAFF_PICTURE) {
6690                     ++s->mb_y;
6691                 }
6692             }
6693
6694             if( eos || s->mb_y >= s->mb_height ) {
6695                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6696                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6697                 return 0;
6698             }
6699         }
6700
6701     } else {
6702         for(;;){
6703             int ret = decode_mb_cavlc(h);
6704
6705             if(ret>=0) hl_decode_mb(h);
6706
6707             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6708                 s->mb_y++;
6709                 ret = decode_mb_cavlc(h);
6710
6711                 if(ret>=0) hl_decode_mb(h);
6712                 s->mb_y--;
6713             }
6714
6715             if(ret<0){
6716                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6717                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6718
6719                 return -1;
6720             }
6721
6722             if(++s->mb_x >= s->mb_width){
6723                 s->mb_x=0;
6724                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6725                 ++s->mb_y;
6726                 if(FIELD_OR_MBAFF_PICTURE) {
6727                     ++s->mb_y;
6728                 }
6729                 if(s->mb_y >= s->mb_height){
6730                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6731
6732                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6733                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6734
6735                         return 0;
6736                     }else{
6737                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6738
6739                         return -1;
6740                     }
6741                 }
6742             }
6743
6744             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6745                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6746                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6747                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6748
6749                     return 0;
6750                 }else{
6751                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6752
6753                     return -1;
6754                 }
6755             }
6756         }
6757     }
6758
6759 #if 0
6760     for(;s->mb_y < s->mb_height; s->mb_y++){
6761         for(;s->mb_x < s->mb_width; s->mb_x++){
6762             int ret= decode_mb(h);
6763
6764             hl_decode_mb(h);
6765
6766             if(ret<0){
6767                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6768                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6769
6770                 return -1;
6771             }
6772
6773             if(++s->mb_x >= s->mb_width){
6774                 s->mb_x=0;
6775                 if(++s->mb_y >= s->mb_height){
6776                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6777                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6778
6779                         return 0;
6780                     }else{
6781                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6782
6783                         return -1;
6784                     }
6785                 }
6786             }
6787
6788             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6789                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6790                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6791
6792                     return 0;
6793                 }else{
6794                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6795
6796                     return -1;
6797                 }
6798             }
6799         }
6800         s->mb_x=0;
6801         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6802     }
6803 #endif
6804     return -1; //not reached
6805 }
6806
6807 static int decode_picture_timing(H264Context *h){
6808     MpegEncContext * const s = &h->s;
6809     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6810         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6811         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6812     }
6813     if(h->sps.pic_struct_present_flag){
6814         unsigned int i, num_clock_ts;
6815         h->sei_pic_struct = get_bits(&s->gb, 4);
6816
6817         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6818             return -1;
6819
6820         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6821
6822         for (i = 0 ; i < num_clock_ts ; i++){
6823             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6824                 unsigned int full_timestamp_flag;
6825                 skip_bits(&s->gb, 2);                 /* ct_type */
6826                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6827                 skip_bits(&s->gb, 5);                 /* counting_type */
6828                 full_timestamp_flag = get_bits(&s->gb, 1);
6829                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6830                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6831                 skip_bits(&s->gb, 8);                 /* n_frames */
6832                 if(full_timestamp_flag){
6833                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6834                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6835                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6836                 }else{
6837                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6838                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6839                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6840                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6841                             if(get_bits(&s->gb, 1))   /* hours_flag */
6842                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6843                         }
6844                     }
6845                 }
6846                 if(h->sps.time_offset_length > 0)
6847                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6848             }
6849         }
6850     }
6851     return 0;
6852 }
6853
6854 static int decode_unregistered_user_data(H264Context *h, int size){
6855     MpegEncContext * const s = &h->s;
6856     uint8_t user_data[16+256];
6857     int e, build, i;
6858
6859     if(size<16)
6860         return -1;
6861
6862     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6863         user_data[i]= get_bits(&s->gb, 8);
6864     }
6865
6866     user_data[i]= 0;
6867     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6868     if(e==1 && build>=0)
6869         h->x264_build= build;
6870
6871     if(s->avctx->debug & FF_DEBUG_BUGS)
6872         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6873
6874     for(; i<size; i++)
6875         skip_bits(&s->gb, 8);
6876
6877     return 0;
6878 }
6879
6880 static int decode_sei(H264Context *h){
6881     MpegEncContext * const s = &h->s;
6882
6883     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6884         int size, type;
6885
6886         type=0;
6887         do{
6888             type+= show_bits(&s->gb, 8);
6889         }while(get_bits(&s->gb, 8) == 255);
6890
6891         size=0;
6892         do{
6893             size+= show_bits(&s->gb, 8);
6894         }while(get_bits(&s->gb, 8) == 255);
6895
6896         switch(type){
6897         case 1: // Picture timing SEI
6898             if(decode_picture_timing(h) < 0)
6899                 return -1;
6900             break;
6901         case 5:
6902             if(decode_unregistered_user_data(h, size) < 0)
6903                 return -1;
6904             break;
6905         default:
6906             skip_bits(&s->gb, 8*size);
6907         }
6908
6909         //FIXME check bits here
6910         align_get_bits(&s->gb);
6911     }
6912
6913     return 0;
6914 }
6915
6916 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6917     MpegEncContext * const s = &h->s;
6918     int cpb_count, i;
6919     cpb_count = get_ue_golomb(&s->gb) + 1;
6920     get_bits(&s->gb, 4); /* bit_rate_scale */
6921     get_bits(&s->gb, 4); /* cpb_size_scale */
6922     for(i=0; i<cpb_count; i++){
6923         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6924         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6925         get_bits1(&s->gb);     /* cbr_flag */
6926     }
6927     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6928     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6929     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6930     sps->time_offset_length = get_bits(&s->gb, 5);
6931 }
6932
6933 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6934     MpegEncContext * const s = &h->s;
6935     int aspect_ratio_info_present_flag;
6936     unsigned int aspect_ratio_idc;
6937
6938     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6939
6940     if( aspect_ratio_info_present_flag ) {
6941         aspect_ratio_idc= get_bits(&s->gb, 8);
6942         if( aspect_ratio_idc == EXTENDED_SAR ) {
6943             sps->sar.num= get_bits(&s->gb, 16);
6944             sps->sar.den= get_bits(&s->gb, 16);
6945         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6946             sps->sar=  pixel_aspect[aspect_ratio_idc];
6947         }else{
6948             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6949             return -1;
6950         }
6951     }else{
6952         sps->sar.num=
6953         sps->sar.den= 0;
6954     }
6955 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6956
6957     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6958         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6959     }
6960
6961     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6962         get_bits(&s->gb, 3);    /* video_format */
6963         get_bits1(&s->gb);      /* video_full_range_flag */
6964         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6965             get_bits(&s->gb, 8); /* colour_primaries */
6966             get_bits(&s->gb, 8); /* transfer_characteristics */
6967             get_bits(&s->gb, 8); /* matrix_coefficients */
6968         }
6969     }
6970
6971     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6972         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6973         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6974     }
6975
6976     sps->timing_info_present_flag = get_bits1(&s->gb);
6977     if(sps->timing_info_present_flag){
6978         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6979         sps->time_scale = get_bits_long(&s->gb, 32);
6980         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6981     }
6982
6983     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6984     if(sps->nal_hrd_parameters_present_flag)
6985         decode_hrd_parameters(h, sps);
6986     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6987     if(sps->vcl_hrd_parameters_present_flag)
6988         decode_hrd_parameters(h, sps);
6989     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6990         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6991     sps->pic_struct_present_flag = get_bits1(&s->gb);
6992
6993     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6994     if(sps->bitstream_restriction_flag){
6995         unsigned int num_reorder_frames;
6996         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6997         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6998         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6999         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7000         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7001         num_reorder_frames= get_ue_golomb(&s->gb);
7002         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7003
7004         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7005             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7006             return -1;
7007         }
7008
7009         sps->num_reorder_frames= num_reorder_frames;
7010     }
7011
7012     return 0;
7013 }
7014
7015 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7016                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7017     MpegEncContext * const s = &h->s;
7018     int i, last = 8, next = 8;
7019     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7020     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7021         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7022     else
7023     for(i=0;i<size;i++){
7024         if(next)
7025             next = (last + get_se_golomb(&s->gb)) & 0xff;
7026         if(!i && !next){ /* matrix not written, we use the preset one */
7027             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7028             break;
7029         }
7030         last = factors[scan[i]] = next ? next : last;
7031     }
7032 }
7033
7034 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7035                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7036     MpegEncContext * const s = &h->s;
7037     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7038     const uint8_t *fallback[4] = {
7039         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7040         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7041         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7042         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7043     };
7044     if(get_bits1(&s->gb)){
7045         sps->scaling_matrix_present |= is_sps;
7046         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7047         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7048         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7049         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7050         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7051         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7052         if(is_sps || pps->transform_8x8_mode){
7053             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7054             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7055         }
7056     }
7057 }
7058
7059 /**
7060  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7061  */
7062 static void *
7063 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7064                     const size_t size, const char *name)
7065 {
7066     if(id>=max) {
7067         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7068         return NULL;
7069     }
7070
7071     if(!vec[id]) {
7072         vec[id] = av_mallocz(size);
7073         if(vec[id] == NULL)
7074             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7075     }
7076     return vec[id];
7077 }
7078
7079 static inline int decode_seq_parameter_set(H264Context *h){
7080     MpegEncContext * const s = &h->s;
7081     int profile_idc, level_idc;
7082     unsigned int sps_id, tmp, mb_width, mb_height;
7083     int i;
7084     SPS *sps;
7085
7086     profile_idc= get_bits(&s->gb, 8);
7087     get_bits1(&s->gb);   //constraint_set0_flag
7088     get_bits1(&s->gb);   //constraint_set1_flag
7089     get_bits1(&s->gb);   //constraint_set2_flag
7090     get_bits1(&s->gb);   //constraint_set3_flag
7091     get_bits(&s->gb, 4); // reserved
7092     level_idc= get_bits(&s->gb, 8);
7093     sps_id= get_ue_golomb(&s->gb);
7094
7095     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7096     if(sps == NULL)
7097         return -1;
7098
7099     sps->profile_idc= profile_idc;
7100     sps->level_idc= level_idc;
7101
7102     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7103     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7104     sps->scaling_matrix_present = 0;
7105
7106     if(sps->profile_idc >= 100){ //high profile
7107         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7108         if(sps->chroma_format_idc == 3)
7109             get_bits1(&s->gb);  //residual_color_transform_flag
7110         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7111         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7112         sps->transform_bypass = get_bits1(&s->gb);
7113         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7114     }else{
7115         sps->chroma_format_idc= 1;
7116     }
7117
7118     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7119     sps->poc_type= get_ue_golomb(&s->gb);
7120
7121     if(sps->poc_type == 0){ //FIXME #define
7122         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7123     } else if(sps->poc_type == 1){//FIXME #define
7124         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7125         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7126         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7127         tmp= get_ue_golomb(&s->gb);
7128
7129         if(tmp >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7130             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7131             return -1;
7132         }
7133         sps->poc_cycle_length= tmp;
7134
7135         for(i=0; i<sps->poc_cycle_length; i++)
7136             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7137     }else if(sps->poc_type != 2){
7138         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7139         return -1;
7140     }
7141
7142     tmp= get_ue_golomb(&s->gb);
7143     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7144         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7145         return -1;
7146     }
7147     sps->ref_frame_count= tmp;
7148     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7149     mb_width= get_ue_golomb(&s->gb) + 1;
7150     mb_height= get_ue_golomb(&s->gb) + 1;
7151     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7152        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7153         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7154         return -1;
7155     }
7156     sps->mb_width = mb_width;
7157     sps->mb_height= mb_height;
7158
7159     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7160     if(!sps->frame_mbs_only_flag)
7161         sps->mb_aff= get_bits1(&s->gb);
7162     else
7163         sps->mb_aff= 0;
7164
7165     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7166
7167 #ifndef ALLOW_INTERLACE
7168     if(sps->mb_aff)
7169         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7170 #endif
7171     sps->crop= get_bits1(&s->gb);
7172     if(sps->crop){
7173         sps->crop_left  = get_ue_golomb(&s->gb);
7174         sps->crop_right = get_ue_golomb(&s->gb);
7175         sps->crop_top   = get_ue_golomb(&s->gb);
7176         sps->crop_bottom= get_ue_golomb(&s->gb);
7177         if(sps->crop_left || sps->crop_top){
7178             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7179         }
7180         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7181             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7182         }
7183     }else{
7184         sps->crop_left  =
7185         sps->crop_right =
7186         sps->crop_top   =
7187         sps->crop_bottom= 0;
7188     }
7189
7190     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7191     if( sps->vui_parameters_present_flag )
7192         decode_vui_parameters(h, sps);
7193
7194     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7195         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7196                sps_id, sps->profile_idc, sps->level_idc,
7197                sps->poc_type,
7198                sps->ref_frame_count,
7199                sps->mb_width, sps->mb_height,
7200                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7201                sps->direct_8x8_inference_flag ? "8B8" : "",
7202                sps->crop_left, sps->crop_right,
7203                sps->crop_top, sps->crop_bottom,
7204                sps->vui_parameters_present_flag ? "VUI" : "",
7205                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7206                );
7207     }
7208     return 0;
7209 }
7210
7211 static void
7212 build_qp_table(PPS *pps, int t, int index)
7213 {
7214     int i;
7215     for(i = 0; i < 52; i++)
7216         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7217 }
7218
7219 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7220     MpegEncContext * const s = &h->s;
7221     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7222     PPS *pps;
7223
7224     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7225     if(pps == NULL)
7226         return -1;
7227
7228     tmp= get_ue_golomb(&s->gb);
7229     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7230         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7231         return -1;
7232     }
7233     pps->sps_id= tmp;
7234
7235     pps->cabac= get_bits1(&s->gb);
7236     pps->pic_order_present= get_bits1(&s->gb);
7237     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7238     if(pps->slice_group_count > 1 ){
7239         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7240         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7241         switch(pps->mb_slice_group_map_type){
7242         case 0:
7243 #if 0
7244 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7245 |    run_length[ i ]                                |1  |ue(v)   |
7246 #endif
7247             break;
7248         case 2:
7249 #if 0
7250 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7251 |{                                                  |   |        |
7252 |    top_left_mb[ i ]                               |1  |ue(v)   |
7253 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7254 |   }                                               |   |        |
7255 #endif
7256             break;
7257         case 3:
7258         case 4:
7259         case 5:
7260 #if 0
7261 |   slice_group_change_direction_flag               |1  |u(1)    |
7262 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7263 #endif
7264             break;
7265         case 6:
7266 #if 0
7267 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7268 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7269 |)                                                  |   |        |
7270 |    slice_group_id[ i ]                            |1  |u(v)    |
7271 #endif
7272             break;
7273         }
7274     }
7275     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7276     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7277     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7278         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7279         pps->ref_count[0]= pps->ref_count[1]= 1;
7280         return -1;
7281     }
7282
7283     pps->weighted_pred= get_bits1(&s->gb);
7284     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7285     pps->init_qp= get_se_golomb(&s->gb) + 26;
7286     pps->init_qs= get_se_golomb(&s->gb) + 26;
7287     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7288     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7289     pps->constrained_intra_pred= get_bits1(&s->gb);
7290     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7291
7292     pps->transform_8x8_mode= 0;
7293     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7294     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7295     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7296
7297     if(get_bits_count(&s->gb) < bit_length){
7298         pps->transform_8x8_mode= get_bits1(&s->gb);
7299         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7300         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7301     } else {
7302         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7303     }
7304
7305     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7306     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7307     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7308         h->pps.chroma_qp_diff= 1;
7309
7310     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7311         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7312                pps_id, pps->sps_id,
7313                pps->cabac ? "CABAC" : "CAVLC",
7314                pps->slice_group_count,
7315                pps->ref_count[0], pps->ref_count[1],
7316                pps->weighted_pred ? "weighted" : "",
7317                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7318                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7319                pps->constrained_intra_pred ? "CONSTR" : "",
7320                pps->redundant_pic_cnt_present ? "REDU" : "",
7321                pps->transform_8x8_mode ? "8x8DCT" : ""
7322                );
7323     }
7324
7325     return 0;
7326 }
7327
7328 /**
7329  * Call decode_slice() for each context.
7330  *
7331  * @param h h264 master context
7332  * @param context_count number of contexts to execute
7333  */
7334 static void execute_decode_slices(H264Context *h, int context_count){
7335     MpegEncContext * const s = &h->s;
7336     AVCodecContext * const avctx= s->avctx;
7337     H264Context *hx;
7338     int i;
7339
7340     if(context_count == 1) {
7341         decode_slice(avctx, &h);
7342     } else {
7343         for(i = 1; i < context_count; i++) {
7344             hx = h->thread_context[i];
7345             hx->s.error_recognition = avctx->error_recognition;
7346             hx->s.error_count = 0;
7347         }
7348
7349         avctx->execute(avctx, (void *)decode_slice,
7350                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7351
7352         /* pull back stuff from slices to master context */
7353         hx = h->thread_context[context_count - 1];
7354         s->mb_x = hx->s.mb_x;
7355         s->mb_y = hx->s.mb_y;
7356         s->dropable = hx->s.dropable;
7357         s->picture_structure = hx->s.picture_structure;
7358         for(i = 1; i < context_count; i++)
7359             h->s.error_count += h->thread_context[i]->s.error_count;
7360     }
7361 }
7362
7363
7364 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7365     MpegEncContext * const s = &h->s;
7366     AVCodecContext * const avctx= s->avctx;
7367     int buf_index=0;
7368     H264Context *hx; ///< thread context
7369     int context_count = 0;
7370
7371     h->max_contexts = avctx->thread_count;
7372 #if 0
7373     int i;
7374     for(i=0; i<50; i++){
7375         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7376     }
7377 #endif
7378     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7379         h->current_slice = 0;
7380         if (!s->first_field)
7381             s->current_picture_ptr= NULL;
7382     }
7383
7384     for(;;){
7385         int consumed;
7386         int dst_length;
7387         int bit_length;
7388         const uint8_t *ptr;
7389         int i, nalsize = 0;
7390         int err;
7391
7392         if(h->is_avc) {
7393             if(buf_index >= buf_size) break;
7394             nalsize = 0;
7395             for(i = 0; i < h->nal_length_size; i++)
7396                 nalsize = (nalsize << 8) | buf[buf_index++];
7397             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7398                 if(nalsize == 1){
7399                     buf_index++;
7400                     continue;
7401                 }else{
7402                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7403                     break;
7404                 }
7405             }
7406         } else {
7407             // start code prefix search
7408             for(; buf_index + 3 < buf_size; buf_index++){
7409                 // This should always succeed in the first iteration.
7410                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7411                     break;
7412             }
7413
7414             if(buf_index+3 >= buf_size) break;
7415
7416             buf_index+=3;
7417         }
7418
7419         hx = h->thread_context[context_count];
7420
7421         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7422         if (ptr==NULL || dst_length < 0){
7423             return -1;
7424         }
7425         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7426             dst_length--;
7427         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7428
7429         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7430             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7431         }
7432
7433         if (h->is_avc && (nalsize != consumed)){
7434             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7435             consumed= nalsize;
7436         }
7437
7438         buf_index += consumed;
7439
7440         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7441            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7442             continue;
7443
7444       again:
7445         err = 0;
7446         switch(hx->nal_unit_type){
7447         case NAL_IDR_SLICE:
7448             if (h->nal_unit_type != NAL_IDR_SLICE) {
7449                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7450                 return -1;
7451             }
7452             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7453         case NAL_SLICE:
7454             init_get_bits(&hx->s.gb, ptr, bit_length);
7455             hx->intra_gb_ptr=
7456             hx->inter_gb_ptr= &hx->s.gb;
7457             hx->s.data_partitioning = 0;
7458
7459             if((err = decode_slice_header(hx, h)))
7460                break;
7461
7462             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7463             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7464                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7465                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7466                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7467                && avctx->skip_frame < AVDISCARD_ALL)
7468                 context_count++;
7469             break;
7470         case NAL_DPA:
7471             init_get_bits(&hx->s.gb, ptr, bit_length);
7472             hx->intra_gb_ptr=
7473             hx->inter_gb_ptr= NULL;
7474             hx->s.data_partitioning = 1;
7475
7476             err = decode_slice_header(hx, h);
7477             break;
7478         case NAL_DPB:
7479             init_get_bits(&hx->intra_gb, ptr, bit_length);
7480             hx->intra_gb_ptr= &hx->intra_gb;
7481             break;
7482         case NAL_DPC:
7483             init_get_bits(&hx->inter_gb, ptr, bit_length);
7484             hx->inter_gb_ptr= &hx->inter_gb;
7485
7486             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7487                && s->context_initialized
7488                && s->hurry_up < 5
7489                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7490                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7491                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7492                && avctx->skip_frame < AVDISCARD_ALL)
7493                 context_count++;
7494             break;
7495         case NAL_SEI:
7496             init_get_bits(&s->gb, ptr, bit_length);
7497             decode_sei(h);
7498             break;
7499         case NAL_SPS:
7500             init_get_bits(&s->gb, ptr, bit_length);
7501             decode_seq_parameter_set(h);
7502
7503             if(s->flags& CODEC_FLAG_LOW_DELAY)
7504                 s->low_delay=1;
7505
7506             if(avctx->has_b_frames < 2)
7507                 avctx->has_b_frames= !s->low_delay;
7508             break;
7509         case NAL_PPS:
7510             init_get_bits(&s->gb, ptr, bit_length);
7511
7512             decode_picture_parameter_set(h, bit_length);
7513
7514             break;
7515         case NAL_AUD:
7516         case NAL_END_SEQUENCE:
7517         case NAL_END_STREAM:
7518         case NAL_FILLER_DATA:
7519         case NAL_SPS_EXT:
7520         case NAL_AUXILIARY_SLICE:
7521             break;
7522         default:
7523             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7524         }
7525
7526         if(context_count == h->max_contexts) {
7527             execute_decode_slices(h, context_count);
7528             context_count = 0;
7529         }
7530
7531         if (err < 0)
7532             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7533         else if(err == 1) {
7534             /* Slice could not be decoded in parallel mode, copy down
7535              * NAL unit stuff to context 0 and restart. Note that
7536              * rbsp_buffer is not transferred, but since we no longer
7537              * run in parallel mode this should not be an issue. */
7538             h->nal_unit_type = hx->nal_unit_type;
7539             h->nal_ref_idc   = hx->nal_ref_idc;
7540             hx = h;
7541             goto again;
7542         }
7543     }
7544     if(context_count)
7545         execute_decode_slices(h, context_count);
7546     return buf_index;
7547 }
7548
7549 /**
7550  * returns the number of bytes consumed for building the current frame
7551  */
7552 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7553         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7554         if(pos+10>buf_size) pos=buf_size; // oops ;)
7555
7556         return pos;
7557 }
7558
7559 static int decode_frame(AVCodecContext *avctx,
7560                              void *data, int *data_size,
7561                              const uint8_t *buf, int buf_size)
7562 {
7563     H264Context *h = avctx->priv_data;
7564     MpegEncContext *s = &h->s;
7565     AVFrame *pict = data;
7566     int buf_index;
7567
7568     s->flags= avctx->flags;
7569     s->flags2= avctx->flags2;
7570
7571    /* end of stream, output what is still in the buffers */
7572     if (buf_size == 0) {
7573         Picture *out;
7574         int i, out_idx;
7575
7576 //FIXME factorize this with the output code below
7577         out = h->delayed_pic[0];
7578         out_idx = 0;
7579         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7580             if(h->delayed_pic[i]->poc < out->poc){
7581                 out = h->delayed_pic[i];
7582                 out_idx = i;
7583             }
7584
7585         for(i=out_idx; h->delayed_pic[i]; i++)
7586             h->delayed_pic[i] = h->delayed_pic[i+1];
7587
7588         if(out){
7589             *data_size = sizeof(AVFrame);
7590             *pict= *(AVFrame*)out;
7591         }
7592
7593         return 0;
7594     }
7595
7596     if(h->is_avc && !h->got_avcC) {
7597         int i, cnt, nalsize;
7598         unsigned char *p = avctx->extradata;
7599         if(avctx->extradata_size < 7) {
7600             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7601             return -1;
7602         }
7603         if(*p != 1) {
7604             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7605             return -1;
7606         }
7607         /* sps and pps in the avcC always have length coded with 2 bytes,
7608            so put a fake nal_length_size = 2 while parsing them */
7609         h->nal_length_size = 2;
7610         // Decode sps from avcC
7611         cnt = *(p+5) & 0x1f; // Number of sps
7612         p += 6;
7613         for (i = 0; i < cnt; i++) {
7614             nalsize = AV_RB16(p) + 2;
7615             if(decode_nal_units(h, p, nalsize) < 0) {
7616                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7617                 return -1;
7618             }
7619             p += nalsize;
7620         }
7621         // Decode pps from avcC
7622         cnt = *(p++); // Number of pps
7623         for (i = 0; i < cnt; i++) {
7624             nalsize = AV_RB16(p) + 2;
7625             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7626                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7627                 return -1;
7628             }
7629             p += nalsize;
7630         }
7631         // Now store right nal length size, that will be use to parse all other nals
7632         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7633         // Do not reparse avcC
7634         h->got_avcC = 1;
7635     }
7636
7637     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7638         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7639             return -1;
7640         h->got_avcC = 1;
7641     }
7642
7643     buf_index=decode_nal_units(h, buf, buf_size);
7644     if(buf_index < 0)
7645         return -1;
7646
7647     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7648         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7649         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7650         return -1;
7651     }
7652
7653     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7654         Picture *out = s->current_picture_ptr;
7655         Picture *cur = s->current_picture_ptr;
7656         int i, pics, cross_idr, out_of_order, out_idx;
7657
7658         s->mb_y= 0;
7659
7660         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7661         s->current_picture_ptr->pict_type= s->pict_type;
7662
7663         if(!s->dropable) {
7664             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7665             h->prev_poc_msb= h->poc_msb;
7666             h->prev_poc_lsb= h->poc_lsb;
7667         }
7668         h->prev_frame_num_offset= h->frame_num_offset;
7669         h->prev_frame_num= h->frame_num;
7670
7671         /*
7672          * FIXME: Error handling code does not seem to support interlaced
7673          * when slices span multiple rows
7674          * The ff_er_add_slice calls don't work right for bottom
7675          * fields; they cause massive erroneous error concealing
7676          * Error marking covers both fields (top and bottom).
7677          * This causes a mismatched s->error_count
7678          * and a bad error table. Further, the error count goes to
7679          * INT_MAX when called for bottom field, because mb_y is
7680          * past end by one (callers fault) and resync_mb_y != 0
7681          * causes problems for the first MB line, too.
7682          */
7683         if (!FIELD_PICTURE)
7684             ff_er_frame_end(s);
7685
7686         MPV_frame_end(s);
7687
7688         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7689             /* Wait for second field. */
7690             *data_size = 0;
7691
7692         } else {
7693             cur->repeat_pict = 0;
7694
7695             /* Signal interlacing information externally. */
7696             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7697             if(h->sps.pic_struct_present_flag){
7698                 switch (h->sei_pic_struct)
7699                 {
7700                 case SEI_PIC_STRUCT_FRAME:
7701                     cur->interlaced_frame = 0;
7702                     break;
7703                 case SEI_PIC_STRUCT_TOP_FIELD:
7704                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7705                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7706                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7707                     cur->interlaced_frame = 1;
7708                     break;
7709                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7710                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7711                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7712                     // From these hints, let the applications decide if they apply deinterlacing.
7713                     cur->repeat_pict = 1;
7714                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7715                     break;
7716                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7717                     // Force progressive here, as doubling interlaced frame is a bad idea.
7718                     cur->interlaced_frame = 0;
7719                     cur->repeat_pict = 2;
7720                     break;
7721                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7722                     cur->interlaced_frame = 0;
7723                     cur->repeat_pict = 4;
7724                     break;
7725                 }
7726             }else{
7727                 /* Derive interlacing flag from used decoding process. */
7728                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7729             }
7730
7731             if (cur->field_poc[0] != cur->field_poc[1]){
7732                 /* Derive top_field_first from field pocs. */
7733                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7734             }else{
7735                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7736                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7737                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7738                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7739                         cur->top_field_first = 1;
7740                     else
7741                         cur->top_field_first = 0;
7742                 }else{
7743                     /* Most likely progressive */
7744                     cur->top_field_first = 0;
7745                 }
7746             }
7747
7748         //FIXME do something with unavailable reference frames
7749
7750             /* Sort B-frames into display order */
7751
7752             if(h->sps.bitstream_restriction_flag
7753                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7754                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7755                 s->low_delay = 0;
7756             }
7757
7758             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7759                && !h->sps.bitstream_restriction_flag){
7760                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7761                 s->low_delay= 0;
7762             }
7763
7764             pics = 0;
7765             while(h->delayed_pic[pics]) pics++;
7766
7767             assert(pics <= MAX_DELAYED_PIC_COUNT);
7768
7769             h->delayed_pic[pics++] = cur;
7770             if(cur->reference == 0)
7771                 cur->reference = DELAYED_PIC_REF;
7772
7773             out = h->delayed_pic[0];
7774             out_idx = 0;
7775             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7776                 if(h->delayed_pic[i]->poc < out->poc){
7777                     out = h->delayed_pic[i];
7778                     out_idx = i;
7779                 }
7780             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7781
7782             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7783
7784             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7785                 { }
7786             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7787                || (s->low_delay &&
7788                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7789                  || cur->pict_type == FF_B_TYPE)))
7790             {
7791                 s->low_delay = 0;
7792                 s->avctx->has_b_frames++;
7793             }
7794
7795             if(out_of_order || pics > s->avctx->has_b_frames){
7796                 out->reference &= ~DELAYED_PIC_REF;
7797                 for(i=out_idx; h->delayed_pic[i]; i++)
7798                     h->delayed_pic[i] = h->delayed_pic[i+1];
7799             }
7800             if(!out_of_order && pics > s->avctx->has_b_frames){
7801                 *data_size = sizeof(AVFrame);
7802
7803                 h->outputed_poc = out->poc;
7804                 *pict= *(AVFrame*)out;
7805             }else{
7806                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7807             }
7808         }
7809     }
7810
7811     assert(pict->data[0] || !*data_size);
7812     ff_print_debug_info(s, pict);
7813 //printf("out %d\n", (int)pict->data[0]);
7814 #if 0 //?
7815
7816     /* Return the Picture timestamp as the frame number */
7817     /* we subtract 1 because it is added on utils.c     */
7818     avctx->frame_number = s->picture_number - 1;
7819 #endif
7820     return get_consumed_bytes(s, buf_index, buf_size);
7821 }
7822 #if 0
7823 static inline void fill_mb_avail(H264Context *h){
7824     MpegEncContext * const s = &h->s;
7825     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7826
7827     if(s->mb_y){
7828         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7829         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7830         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7831     }else{
7832         h->mb_avail[0]=
7833         h->mb_avail[1]=
7834         h->mb_avail[2]= 0;
7835     }
7836     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7837     h->mb_avail[4]= 1; //FIXME move out
7838     h->mb_avail[5]= 0; //FIXME move out
7839 }
7840 #endif
7841
7842 #ifdef TEST
7843 #undef printf
7844 #undef random
7845 #define COUNT 8000
7846 #define SIZE (COUNT*40)
7847 int main(void){
7848     int i;
7849     uint8_t temp[SIZE];
7850     PutBitContext pb;
7851     GetBitContext gb;
7852 //    int int_temp[10000];
7853     DSPContext dsp;
7854     AVCodecContext avctx;
7855
7856     dsputil_init(&dsp, &avctx);
7857
7858     init_put_bits(&pb, temp, SIZE);
7859     printf("testing unsigned exp golomb\n");
7860     for(i=0; i<COUNT; i++){
7861         START_TIMER
7862         set_ue_golomb(&pb, i);
7863         STOP_TIMER("set_ue_golomb");
7864     }
7865     flush_put_bits(&pb);
7866
7867     init_get_bits(&gb, temp, 8*SIZE);
7868     for(i=0; i<COUNT; i++){
7869         int j, s;
7870
7871         s= show_bits(&gb, 24);
7872
7873         START_TIMER
7874         j= get_ue_golomb(&gb);
7875         if(j != i){
7876             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7877 //            return -1;
7878         }
7879         STOP_TIMER("get_ue_golomb");
7880     }
7881
7882
7883     init_put_bits(&pb, temp, SIZE);
7884     printf("testing signed exp golomb\n");
7885     for(i=0; i<COUNT; i++){
7886         START_TIMER
7887         set_se_golomb(&pb, i - COUNT/2);
7888         STOP_TIMER("set_se_golomb");
7889     }
7890     flush_put_bits(&pb);
7891
7892     init_get_bits(&gb, temp, 8*SIZE);
7893     for(i=0; i<COUNT; i++){
7894         int j, s;
7895
7896         s= show_bits(&gb, 24);
7897
7898         START_TIMER
7899         j= get_se_golomb(&gb);
7900         if(j != i - COUNT/2){
7901             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7902 //            return -1;
7903         }
7904         STOP_TIMER("get_se_golomb");
7905     }
7906
7907 #if 0
7908     printf("testing 4x4 (I)DCT\n");
7909
7910     DCTELEM block[16];
7911     uint8_t src[16], ref[16];
7912     uint64_t error= 0, max_error=0;
7913
7914     for(i=0; i<COUNT; i++){
7915         int j;
7916 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7917         for(j=0; j<16; j++){
7918             ref[j]= random()%255;
7919             src[j]= random()%255;
7920         }
7921
7922         h264_diff_dct_c(block, src, ref, 4);
7923
7924         //normalize
7925         for(j=0; j<16; j++){
7926 //            printf("%d ", block[j]);
7927             block[j]= block[j]*4;
7928             if(j&1) block[j]= (block[j]*4 + 2)/5;
7929             if(j&4) block[j]= (block[j]*4 + 2)/5;
7930         }
7931 //        printf("\n");
7932
7933         s->dsp.h264_idct_add(ref, block, 4);
7934 /*        for(j=0; j<16; j++){
7935             printf("%d ", ref[j]);
7936         }
7937         printf("\n");*/
7938
7939         for(j=0; j<16; j++){
7940             int diff= FFABS(src[j] - ref[j]);
7941
7942             error+= diff*diff;
7943             max_error= FFMAX(max_error, diff);
7944         }
7945     }
7946     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7947     printf("testing quantizer\n");
7948     for(qp=0; qp<52; qp++){
7949         for(i=0; i<16; i++)
7950             src1_block[i]= src2_block[i]= random()%255;
7951
7952     }
7953     printf("Testing NAL layer\n");
7954
7955     uint8_t bitstream[COUNT];
7956     uint8_t nal[COUNT*2];
7957     H264Context h;
7958     memset(&h, 0, sizeof(H264Context));
7959
7960     for(i=0; i<COUNT; i++){
7961         int zeros= i;
7962         int nal_length;
7963         int consumed;
7964         int out_length;
7965         uint8_t *out;
7966         int j;
7967
7968         for(j=0; j<COUNT; j++){
7969             bitstream[j]= (random() % 255) + 1;
7970         }
7971
7972         for(j=0; j<zeros; j++){
7973             int pos= random() % COUNT;
7974             while(bitstream[pos] == 0){
7975                 pos++;
7976                 pos %= COUNT;
7977             }
7978             bitstream[pos]=0;
7979         }
7980
7981         START_TIMER
7982
7983         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7984         if(nal_length<0){
7985             printf("encoding failed\n");
7986             return -1;
7987         }
7988
7989         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7990
7991         STOP_TIMER("NAL")
7992
7993         if(out_length != COUNT){
7994             printf("incorrect length %d %d\n", out_length, COUNT);
7995             return -1;
7996         }
7997
7998         if(consumed != nal_length){
7999             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8000             return -1;
8001         }
8002
8003         if(memcmp(bitstream, out, COUNT)){
8004             printf("mismatch\n");
8005             return -1;
8006         }
8007     }
8008 #endif
8009
8010     printf("Testing RBSP\n");
8011
8012
8013     return 0;
8014 }
8015 #endif /* TEST */
8016
8017
8018 static av_cold int decode_end(AVCodecContext *avctx)
8019 {
8020     H264Context *h = avctx->priv_data;
8021     MpegEncContext *s = &h->s;
8022     int i;
8023
8024     av_freep(&h->rbsp_buffer[0]);
8025     av_freep(&h->rbsp_buffer[1]);
8026     free_tables(h); //FIXME cleanup init stuff perhaps
8027
8028     for(i = 0; i < MAX_SPS_COUNT; i++)
8029         av_freep(h->sps_buffers + i);
8030
8031     for(i = 0; i < MAX_PPS_COUNT; i++)
8032         av_freep(h->pps_buffers + i);
8033
8034     MPV_common_end(s);
8035
8036 //    memset(h, 0, sizeof(H264Context));
8037
8038     return 0;
8039 }
8040
8041
8042 AVCodec h264_decoder = {
8043     "h264",
8044     CODEC_TYPE_VIDEO,
8045     CODEC_ID_H264,
8046     sizeof(H264Context),
8047     decode_init,
8048     NULL,
8049     decode_end,
8050     decode_frame,
8051     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8052     .flush= flush_dpb,
8053     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8054 };
8055
8056 #include "svq3.c"