git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 const uint8_t ff_rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 const uint8_t ff_div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 188             for(i=0; i<16; i++)
 189                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 190             for(list=0; list<h->list_count; list++){
 191                 if(USES_LIST(mb_type,list)){
 192                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 193                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 194                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 195                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 196                         dst[0] = src[0];
 197                         dst[1] = src[1];
 198                         dst[2] = src[2];
 199                         dst[3] = src[3];
 200                     }
 201                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 202                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 203                     ref += h->b8_stride;
 204                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 205                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 206                 }else{
 207                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 208                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 209                 }
 210             }
 211         }
 212     }else{
 213         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 214         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 215         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 216         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 217         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 218     }
 219
 220     if(IS_INTRA(mb_type)){
 221         h->topleft_samples_available=
 222         h->top_samples_available=
 223         h->left_samples_available= 0xFFFF;
 224         h->topright_samples_available= 0xEEEA;
 225
 226         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 227             h->topleft_samples_available= 0xB3FF;
 228             h->top_samples_available= 0x33FF;
 229             h->topright_samples_available= 0x26EA;
 230         }
 231         for(i=0; i<2; i++){
 232             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 233                 h->topleft_samples_available&= 0xDF5F;
 234                 h->left_samples_available&= 0x5F5F;
 235             }
 236         }
 237
 238         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 239             h->topleft_samples_available&= 0x7FFF;
 240
 241         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 242             h->topright_samples_available&= 0xFBFF;
 243
 244         if(IS_INTRA4x4(mb_type)){
 245             if(IS_INTRA4x4(top_type)){
 246                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 247                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 248                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 249                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 250             }else{
 251                 int pred;
 252                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 253                     pred= -1;
 254                 else{
 255                     pred= 2;
 256                 }
 257                 h->intra4x4_pred_mode_cache[4+8*0]=
 258                 h->intra4x4_pred_mode_cache[5+8*0]=
 259                 h->intra4x4_pred_mode_cache[6+8*0]=
 260                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 261             }
 262             for(i=0; i<2; i++){
 263                 if(IS_INTRA4x4(left_type[i])){
 264                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 265                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 266                 }else{
 267                     int pred;
 268                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 269                         pred= -1;
 270                     else{
 271                         pred= 2;
 272                     }
 273                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 274                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 275                 }
 276             }
 277         }
 278     }
 279
 280
 281 /*
 282 0 . T T. T T T T
 283 1 L . .L . . . .
 284 2 L . .L . . . .
 285 3 . T TL . . . .
 286 4 L . .L . . . .
 287 5 L . .. . . . .
 288 */
 289 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 290     if(top_type){
 291         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 292         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 293         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 294         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 295
 296         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 297         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 298
 299         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 300         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 301
 302     }else{
 303         h->non_zero_count_cache[4+8*0]=
 304         h->non_zero_count_cache[5+8*0]=
 305         h->non_zero_count_cache[6+8*0]=
 306         h->non_zero_count_cache[7+8*0]=
 307
 308         h->non_zero_count_cache[1+8*0]=
 309         h->non_zero_count_cache[2+8*0]=
 310
 311         h->non_zero_count_cache[1+8*3]=
 312         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 313
 314     }
 315
 316     for (i=0; i<2; i++) {
 317         if(left_type[i]){
 318             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 319             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 320             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 321             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 322         }else{
 323             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 324             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 325             h->non_zero_count_cache[0+8*1 +   8*i]=
 326             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 327         }
 328     }
 329
 330     if( h->pps.cabac ) {
 331         // top_cbp
 332         if(top_type) {
 333             h->top_cbp = h->cbp_table[top_xy];
 334         } else if(IS_INTRA(mb_type)) {
 335             h->top_cbp = 0x1C0;
 336         } else {
 337             h->top_cbp = 0;
 338         }
 339         // left_cbp
 340         if (left_type[0]) {
 341             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 342         } else if(IS_INTRA(mb_type)) {
 343             h->left_cbp = 0x1C0;
 344         } else {
 345             h->left_cbp = 0;
 346         }
 347         if (left_type[0]) {
 348             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 349         }
 350         if (left_type[1]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 352         }
 353     }
 354
 355 #if 1
 356     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 357         int list;
 358         for(list=0; list<h->list_count; list++){
 359             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 360                 /*if(!h->mv_cache_clean[list]){
 361                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 362                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 363                     h->mv_cache_clean[list]= 1;
 364                 }*/
 365                 continue;
 366             }
 367             h->mv_cache_clean[list]= 0;
 368
 369             if(USES_LIST(top_type, list)){
 370                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 371                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 372                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 373                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 376                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 377                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 378                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 379                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 380             }else{
 381                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 382                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 385                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 386             }
 387
 388             for(i=0; i<2; i++){
 389                 int cache_idx = scan8[0] - 1 + i*2*8;
 390                 if(USES_LIST(left_type[i], list)){
 391                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 392                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 393                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 394                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 395                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 396                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 397                 }else{
 398                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 399                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 400                     h->ref_cache[list][cache_idx  ]=
 401                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 402                 }
 403             }
 404
 405             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 406                 continue;
 407
 408             if(USES_LIST(topleft_type, list)){
 409                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 410                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 411                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 412                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 413             }else{
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 416             }
 417
 418             if(USES_LIST(topright_type, list)){
 419                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 420                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 421                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 422                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 423             }else{
 424                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 426             }
 427
 428             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 429                 continue;
 430
 431             h->ref_cache[list][scan8[5 ]+1] =
 432             h->ref_cache[list][scan8[7 ]+1] =
 433             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 434             h->ref_cache[list][scan8[4 ]] =
 435             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 436             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 437             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 438             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 439             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 440             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 441
 442             if( h->pps.cabac ) {
 443                 /* XXX beurk, Load mvd */
 444                 if(USES_LIST(top_type, list)){
 445                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 446                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 447                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 450                 }else{
 451                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 452                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 455                 }
 456                 if(USES_LIST(left_type[0], list)){
 457                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 459                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 460                 }else{
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 463                 }
 464                 if(USES_LIST(left_type[1], list)){
 465                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 466                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 468                 }else{
 469                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 471                 }
 472                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 473                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 474                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 475                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 477
 478                 if(h->slice_type_nos == FF_B_TYPE){
 479                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 480
 481                     if(IS_DIRECT(top_type)){
 482                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 483                     }else if(IS_8X8(top_type)){
 484                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 485                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 486                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 487                     }else{
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 489                     }
 490
 491                     if(IS_DIRECT(left_type[0]))
 492                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 493                     else if(IS_8X8(left_type[0]))
 494                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 495                     else
 496                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 497
 498                     if(IS_DIRECT(left_type[1]))
 499                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 500                     else if(IS_8X8(left_type[1]))
 501                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 502                     else
 503                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 504                 }
 505             }
 506
 507             if(FRAME_MBAFF){
 508 #define MAP_MVS\
 509                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 510                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 511                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 512                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 515                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 516                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 517                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 518                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 519                 if(MB_FIELD){
 520 #define MAP_F2F(idx, mb_type)\
 521                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 522                         h->ref_cache[list][idx] <<= 1;\
 523                         h->mv_cache[list][idx][1] /= 2;\
 524                         h->mvd_cache[list][idx][1] /= 2;\
 525                     }
 526                     MAP_MVS
 527 #undef MAP_F2F
 528                 }else{
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] >>= 1;\
 532                         h->mv_cache[list][idx][1] <<= 1;\
 533                         h->mvd_cache[list][idx][1] <<= 1;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }
 538             }
 539         }
 540     }
 541 #endif
 542
 543     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 544 }
 545
 546 static inline void write_back_intra_pred_mode(H264Context *h){
 547     const int mb_xy= h->mb_xy;
 548
 549     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 550     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 551     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 552     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 553     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 554     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 555     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 556 }
 557
 558 /**
 559  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 560  */
 561 static inline int check_intra4x4_pred_mode(H264Context *h){
 562     MpegEncContext * const s = &h->s;
 563     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 564     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 565     int i;
 566
 567     if(!(h->top_samples_available&0x8000)){
 568         for(i=0; i<4; i++){
 569             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 570             if(status<0){
 571                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 572                 return -1;
 573             } else if(status){
 574                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 575             }
 576         }
 577     }
 578
 579     if(!(h->left_samples_available&0x8000)){
 580         for(i=0; i<4; i++){
 581             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 582             if(status<0){
 583                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 584                 return -1;
 585             } else if(status){
 586                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 587             }
 588         }
 589     }
 590
 591     return 0;
 592 } //FIXME cleanup like next
 593
 594 /**
 595  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 596  */
 597 static inline int check_intra_pred_mode(H264Context *h, int mode){
 598     MpegEncContext * const s = &h->s;
 599     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 600     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 601
 602     if(mode > 6U) {
 603         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 604         return -1;
 605     }
 606
 607     if(!(h->top_samples_available&0x8000)){
 608         mode= top[ mode ];
 609         if(mode<0){
 610             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 611             return -1;
 612         }
 613     }
 614
 615     if(!(h->left_samples_available&0x8000)){
 616         mode= left[ mode ];
 617         if(mode<0){
 618             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 619             return -1;
 620         }
 621     }
 622
 623     return mode;
 624 }
 625
 626 /**
 627  * gets the predicted intra4x4 prediction mode.
 628  */
 629 static inline int pred_intra_mode(H264Context *h, int n){
 630     const int index8= scan8[n];
 631     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 632     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 633     const int min= FFMIN(left, top);
 634
 635     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 636
 637     if(min<0) return DC_PRED;
 638     else      return min;
 639 }
 640
 641 static inline void write_back_non_zero_count(H264Context *h){
 642     const int mb_xy= h->mb_xy;
 643
 644     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 645     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 646     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 647     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 648     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 649     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 650     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 651
 652     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 653     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 654     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 655
 656     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 657     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 658     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 659
 660     if(FRAME_MBAFF){
 661         // store all luma nnzs, for deblocking
 662         int v = 0, i;
 663         for(i=0; i<16; i++)
 664             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 665         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 666     }
 667 }
 668
 669 /**
 670  * gets the predicted number of non-zero coefficients.
 671  * @param n block index
 672  */
 673 static inline int pred_non_zero_count(H264Context *h, int n){
 674     const int index8= scan8[n];
 675     const int left= h->non_zero_count_cache[index8 - 1];
 676     const int top = h->non_zero_count_cache[index8 - 8];
 677     int i= left + top;
 678
 679     if(i<64) i= (i+1)>>1;
 680
 681     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 682
 683     return i&31;
 684 }
 685
     /**
      * Fetches the motion vector and reference index of the diagonal neighbour
      * of a partition.
      * Outside of the MBAFF special cases the neighbour is the cache entry at
      * i - 8 + part_width (named "topright" here); if its reference index is
      * PART_NOT_AVAILABLE the entry at i - 8 - 1 is used instead.
      * In MBAFF frames, when the neighbouring macroblock and the current one
      * differ in field/frame coding, the vector is read from the current
      * picture, its vertical component and the reference index are rescaled,
      * and the result is placed in the cache entry scan8[0]-2.
      * @param C receives a pointer to the selected motion vector inside h->mv_cache
      * @param i cache position of the partition (a scan8 value)
      * @param list reference list to read from
      * @param part_width width of the partition in cache entries
      * @return the reference index belonging to *C; this may also be
      *         LIST_NOT_USED or PART_NOT_AVAILABLE
      */
 686 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 687     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 688     MpegEncContext *s = &h->s;
 689
 690     /* there is no consistent mapping of mvs to neighboring locations that will
 691      * make mbaff happy, so we can't move all this logic to fill_caches */
 692     if(FRAME_MBAFF){
 693         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 694         const int16_t *mv;
     /* cache entry scan8[0]-2 serves as scratch space: it is zeroed here and
      * *C points at it until one of the cases below or the generic code at
      * the end of the function selects something else */
 695         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 696         *C = h->mv_cache[list][scan8[0]-2];
 697
     /* case 1: frame macroblock with odd mb_y, partition in the first row of
      * the cache (i < scan8[0]+8), topright available and located in an
      * interlaced macroblock */
 698         if(!MB_FIELD
 699            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 700             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 701             if(IS_INTERLACED(mb_types[topright_xy])){
     /* SET_DIAG_MV always returns from fetch_diagonal_mv():
      * LIST_NOT_USED if the macroblock containing the 4x4 block (X4,Y4) does
      * not use the list, otherwise the reference index with REF_OP applied,
      * after storing the vector (vertical component with MV_OP applied) in
      * the scratch entry scan8[0]-2 */
 702 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 703                 const int x4 = X4, y4 = Y4;\
 704                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 705                 if(!USES_LIST(mb_type,list))\
 706                     return LIST_NOT_USED;\
 707                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 708                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 709                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 710                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 711
 712                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 713             }
 714         }
     /* case 2: topright unavailable, partition in cache column 4 ((i&7)==4),
      * the left neighbour is available and differs from the current
      * macroblock in field/frame coding; the vector is then taken from the
      * macroblock to the left (x4 = s->mb_x*4-1) */
 715         if(topright_ref == PART_NOT_AVAILABLE
 716            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 717            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 718             if(!MB_FIELD
 719                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 720                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 721             }
 722             if(MB_FIELD
 723                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 724                && i >= scan8[0]+8){
 725                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 726                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 727             }
 728         }
 729 #undef SET_DIAG_MV
 730     }
 731
     /* generic case: use topright if it is available, else the entry at i - 8 - 1 */
 732     if(topright_ref != PART_NOT_AVAILABLE){
 733         *C= h->mv_cache[list][ i - 8 + part_width ];
 734         return topright_ref;
 735     }else{
 736         tprintf(s->avctx, "topright MV not available\n");
 737
 738         *C= h->mv_cache[list][ i - 8 - 1 ];
 739         return h->ref_cache[list][ i - 8 - 1 ];
 740     }
 741 }
 742
 743 /**
 744  * gets the predicted MV.
 745  * @param n the block index
 746  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 747  * @param mx the x component of the predicted motion vector
 748  * @param my the y component of the predicted motion vector
 749  */
 750 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 751     const int index8= scan8[n];
 752     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 753     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 754     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 755     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 756     const int16_t * C;
 757     int diagonal_ref, match_count;
 758
 759     assert(part_width==1 || part_width==2 || part_width==4);
 760
 761 /* mv_cache
 762   B . . A T T T T
 763   U . . L . . , .
 764   U . . L . . . .
 765   U . . L . . , .
 766   . . . L . . . .
 767 */
 768
 769     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 770     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 771     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 772     if(match_count > 1){ //most common
 773         *mx= mid_pred(A[0], B[0], C[0]);
 774         *my= mid_pred(A[1], B[1], C[1]);
 775     }else if(match_count==1){
 776         if(left_ref==ref){
 777             *mx= A[0];
 778             *my= A[1];
 779         }else if(top_ref==ref){
 780             *mx= B[0];
 781             *my= B[1];
 782         }else{
 783             *mx= C[0];
 784             *my= C[1];
 785         }
 786     }else{
 787         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 788             *mx= A[0];
 789             *my= A[1];
 790         }else{
 791             *mx= mid_pred(A[0], B[0], C[0]);
 792             *my= mid_pred(A[1], B[1], C[1]);
 793         }
 794     }
 795
 796     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 797 }
 798
 799 /**
 800  * gets the directionally predicted 16x8 MV.
 801  * @param n the block index
 802  * @param mx the x component of the predicted motion vector
 803  * @param my the y component of the predicted motion vector
 804  */
 805 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 806     if(n==0){
 807         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 808         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 809
 810         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 811
 812         if(top_ref == ref){
 813             *mx= B[0];
 814             *my= B[1];
 815             return;
 816         }
 817     }else{
 818         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 819         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 820
 821         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 822
 823         if(left_ref == ref){
 824             *mx= A[0];
 825             *my= A[1];
 826             return;
 827         }
 828     }
 829
 830     //RARE
 831     pred_motion(h, n, 4, list, ref, mx, my);
 832 }
 833
 834 /**
 835  * gets the directionally predicted 8x16 MV.
 836  * @param n the block index
 837  * @param mx the x component of the predicted motion vector
 838  * @param my the y component of the predicted motion vector
 839  */
 840 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 841     if(n==0){
 842         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 843         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 844
 845         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 846
 847         if(left_ref == ref){
 848             *mx= A[0];
 849             *my= A[1];
 850             return;
 851         }
 852     }else{
 853         const int16_t * C;
 854         int diagonal_ref;
 855
 856         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 857
 858         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 859
 860         if(diagonal_ref == ref){
 861             *mx= C[0];
 862             *my= C[1];
 863             return;
 864         }
 865     }
 866
 867     //RARE
 868     pred_motion(h, n, 2, list, ref, mx, my);
 869 }
 870
 871 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 872     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 873     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 874
 875     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 876
 877     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 878        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 879        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 880
 881         *mx = *my = 0;
 882         return;
 883     }
 884
 885     pred_motion(h, 0, 4, 0, 0, mx, my);
 886
 887     return;
 888 }
 889
 890 static inline void direct_dist_scale_factor(H264Context * const h){
 891     MpegEncContext * const s = &h->s;
 892     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 893     const int poc1 = h->ref_list[1][0].poc;
 894     int i;
 895     for(i=0; i<h->ref_count[0]; i++){
 896         int poc0 = h->ref_list[0][i].poc;
 897         int td = av_clip(poc1 - poc0, -128, 127);
 898         if(td == 0 || h->ref_list[0][i].long_ref){
 899             h->dist_scale_factor[i] = 256;
 900         }else{
 901             int tb = av_clip(poc - poc0, -128, 127);
 902             int tx = (16384 + (FFABS(td) >> 1)) / td;
 903             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 904         }
 905     }
 906     if(FRAME_MBAFF){
 907         for(i=0; i<h->ref_count[0]; i++){
 908             h->dist_scale_factor_field[2*i] =
 909             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 910         }
 911     }
 912 }
 913 static inline void direct_ref_list_init(H264Context * const h){
 914     MpegEncContext * const s = &h->s;
 915     Picture * const ref1 = &h->ref_list[1][0];
 916     Picture * const cur = s->current_picture_ptr;
 917     int list, i, j;
 918     int sidx= s->picture_structure&1;
 919     int ref1sidx= ref1->reference&1;
 920     for(list=0; list<2; list++){
 921         cur->ref_count[sidx][list] = h->ref_count[list];
 922         for(j=0; j<h->ref_count[list]; j++)
 923             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 924     }
 925     if(s->picture_structure == PICT_FRAME){
 926         memcpy(cur->ref_count[0], cur->ref_count[1], sizeof(cur->ref_count[0]));
 927         memcpy(cur->ref_poc  [0], cur->ref_poc  [1], sizeof(cur->ref_poc  [0]));
 928     }
 929     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 930         return;
 931     for(list=0; list<2; list++){
 932         for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 933             int poc = ref1->ref_poc[ref1sidx][list][i];
 934             if(((poc&3) == 3) != (s->picture_structure == PICT_FRAME))
 935                 poc= (poc&~3) + s->picture_structure;
 936             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 937             for(j=0; j<h->ref_count[list]; j++)
 938                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 939                     h->map_col_to_list0[list][i] = j;
 940                     break;
 941                 }
 942         }
 943     }
 944     if(FRAME_MBAFF){
 945         for(list=0; list<2; list++){
 946             for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 947                 j = h->map_col_to_list0[list][i];
 948                 h->map_col_to_list0_field[list][2*i] = 2*j;
 949                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 950             }
 951         }
 952     }
 953 }
 954
     /**
      * Derives the motion vectors and reference indices of a direct predicted
      * macroblock, or of the direct predicted 8x8 blocks of a macroblock that
      * already has the 8x8 type, and stores them in h->mv_cache and
      * h->ref_cache.
      * The co-located macroblock is taken from the first list 1 reference
      * (h->ref_list[1][0]). If h->direct_spatial_mv_pred is set, the reference
      * indices and vectors are predicted from the neighbours of the current
      * macroblock and the co-located data only decides where the vectors are
      * forced to zero; otherwise the co-located vectors are scaled with
      * h->dist_scale_factor (temporal direct prediction).
      * h->sub_mb_type[] is set for every 8x8 block that is processed.
      * @param mb_type type of the current macroblock; partition and list flags
      *                are ORed into it (and list flags may be cleared again in
      *                the spatial case)
      */
 955 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 956     MpegEncContext * const s = &h->s;
 957     int b8_stride = h->b8_stride;
 958     int b4_stride = h->b_stride;
 959     int mb_xy = h->mb_xy;
 960     int mb_type_col[2];
 961     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 962     const int8_t *l1ref0, *l1ref1;
 963     const int is_b8x8 = IS_8X8(*mb_type);
 964     unsigned int sub_mb_type;
 965     int i8, i4;
 966
 967 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 968
     /* Step 1: locate the co-located macroblock(s) (mb_xy, mb_type_col) and
      * choose the partitioning of the direct macroblock. The arrow comments
      * read "current -> co-located"; presumably FL/FR stand for field/frame
      * pictures and AFL/AFR for field/frame macroblocks of MBAFF frames. */
 969     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 970         if(h->ref_list[1][0].reference == PICT_FRAME){   // AFL/AFR/FR/FL -> AFL
 971             if(!IS_INTERLACED(*mb_type)){                //     AFR/FR    -> AFL
     /* pick the macroblock of the co-located pair whose field POC is
      * closer to the POC of the current picture */
 972                 int cur_poc = s->current_picture_ptr->poc;
 973                 int *col_poc = h->ref_list[1]->field_poc;
 974                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
 975                 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
     /* a zero b8_stride marks this case for the code below */
 976                 b8_stride = 0;
 977             }
 978         }else if(!(s->picture_structure & h->ref_list[1][0].reference)){// FL -> FL & differ parity
 979             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
 980             mb_xy += s->mb_stride*fieldoff;
 981         }
 982         goto single_col;
 983     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
 984         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
     /* a field macroblock is co-located with both frame macroblocks
      * of the pair, so both types are fetched and the strides are
      * enlarged to step between them */
 985             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
 986             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
 987             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
 988             b8_stride *= 3;
 989             b4_stride *= 6;
 990             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
 991             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
 992                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
 993                 && !is_b8x8){
 994                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 995                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
 996             }else{
 997                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 998                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
 999             }
1000         }else{                                           //     AFR/FR    -> AFR/FR
     /* also entered through the goto above: a single co-located macroblock */
1001 single_col:
1002             mb_type_col[0] =
1003             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1004             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1005                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1006                 * so we know exactly what block size to use */
1007                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1008                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1009             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1010                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1011                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1012             }else{
1013                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1014                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1015             }
1016         }
1017     }
1018
     /* Step 2: pointers to the motion vectors and reference indices of the
      * co-located macroblock, for both of its lists */
1019         l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1020         l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1021         l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1022         l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
     /* b8_stride was zeroed above for a frame macroblock whose co-located
      * macroblock is a field macroblock; for odd mb_y the pointers are advanced
      * by one 8x8 row (two 4x4 rows) */
1023     if(!b8_stride){
1024         if(s->mb_y&1){
1025             l1ref0 += h->b8_stride;
1026             l1ref1 += h->b8_stride;
1027             l1mv0  +=  2*b4_stride;
1028             l1mv1  +=  2*b4_stride;
1029         }
1030     }
1031
     /* Step 3a: spatial direct prediction */
1032     if(h->direct_spatial_mv_pred){
1033         int ref[2];
1034         int mv[2][2];
1035         int list;
1036
1037         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1038
1039         /* ref = min(neighbors) */
1040         for(list=0; list<2; list++){
1041             int refa = h->ref_cache[list][scan8[0] - 1];
1042             int refb = h->ref_cache[list][scan8[0] - 8];
1043             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1044             if(refc == PART_NOT_AVAILABLE)
1045                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
     /* the unsigned casts make negative indices compare as large values,
      * so the minimum is the smallest non-negative index if there is one;
      * a result that is still negative is normalized to -1 */
1046             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1047             if(ref[list] < 0)
1048                 ref[list] = -1;
1049         }
1050
     /* no neighbour offers a reference in either list: use reference 0 with
      * zero vectors for both lists; otherwise predict a vector for every
      * list that has a reference */
1051         if(ref[0] < 0 && ref[1] < 0){
1052             ref[0] = ref[1] = 0;
1053             mv[0][0] = mv[0][1] =
1054             mv[1][0] = mv[1][1] = 0;
1055         }else{
1056             for(list=0; list<2; list++){
1057                 if(ref[list] >= 0)
1058                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1059                 else
1060                     mv[list][0] = mv[list][1] = 0;
1061             }
1062         }
1063
     /* clear the flag of a list without reference from the sub macroblock
      * type, and from the macroblock type unless it already was 8x8 */
1064         if(ref[1] < 0){
1065             if(!is_b8x8)
1066                 *mb_type &= ~MB_TYPE_L1;
1067             sub_mb_type &= ~MB_TYPE_L1;
1068         }else if(ref[0] < 0){
1069             if(!is_b8x8)
1070                 *mb_type &= ~MB_TYPE_L0;
1071             sub_mb_type &= ~MB_TYPE_L0;
1072         }
1073
     /* current and co-located macroblock differ in field/frame coding:
      * one co-located 4x4 block is examined per 8x8 block */
1074         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1075             for(i8=0; i8<4; i8++){
1076                 int x8 = i8&1;
1077                 int y8 = i8>>1;
1078                 int xy8 = x8+y8*b8_stride;
1079                 int xy4 = 3*x8+y8*b4_stride;
1080                 int a=0, b=0;
1081
     /* in a macroblock that already was 8x8 only the direct
      * sub macroblocks are handled here */
1082                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1083                     continue;
1084                 h->sub_mb_type[i8] = sub_mb_type;
1085
1086                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1087                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
     /* co-located block is inter coded, uses reference 0 and has a
      * vector within +-1: vectors of lists whose reference is 0
      * stay zero, only references > 0 receive the prediction */
1088                 if(!IS_INTRA(mb_type_col[y8])
1089                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1090                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1091                     if(ref[0] > 0)
1092                         a= pack16to32(mv[0][0],mv[0][1]);
1093                     if(ref[1] > 0)
1094                         b= pack16to32(mv[1][0],mv[1][1]);
1095                 }else{
1096                     a= pack16to32(mv[0][0],mv[0][1]);
1097                     b= pack16to32(mv[1][0],mv[1][1]);
1098                 }
1099                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1100                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1101             }
     /* same field/frame coding, whole macroblock handled as one 16x16 block */
1102         }else if(IS_16X16(*mb_type)){
1103             int a=0, b=0;
1104
1105             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1106             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
     /* the list 1 variant of the test is skipped for streams with
      * 0 < x264_build <= 33 */
1107             if(!IS_INTRA(mb_type_col[0])
1108                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1109                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1110                        && (h->x264_build>33 || !h->x264_build)))){
1111                 if(ref[0] > 0)
1112                     a= pack16to32(mv[0][0],mv[0][1]);
1113                 if(ref[1] > 0)
1114                     b= pack16to32(mv[1][0],mv[1][1]);
1115             }else{
1116                 a= pack16to32(mv[0][0],mv[0][1]);
1117                 b= pack16to32(mv[1][0],mv[1][1]);
1118             }
1119             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1120             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
     /* same field/frame coding, handled per 8x8 block: the prediction is
      * written first and then zeroed where the co-located block demands it */
1121         }else{
1122             for(i8=0; i8<4; i8++){
1123                 const int x8 = i8&1;
1124                 const int y8 = i8>>1;
1125
1126                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1127                     continue;
1128                 h->sub_mb_type[i8] = sub_mb_type;
1129
1130                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1131                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1132                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1133                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1134
1135                 /* col_zero_flag */
1136                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1137                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1138                                                   && (h->x264_build>33 || !h->x264_build)))){
1139                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
     /* 8x8 sub macroblock: one co-located 4x4 block (offsets
      * x8*3, y8*3) decides for the whole 8x8 block; otherwise
      * each 4x4 block is tested on its own */
1140                     if(IS_SUB_8X8(sub_mb_type)){
1141                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1142                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1143                             if(ref[0] == 0)
1144                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1145                             if(ref[1] == 0)
1146                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1147                         }
1148                     }else
1149                     for(i4=0; i4<4; i4++){
1150                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1151                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1152                             if(ref[0] == 0)
1153                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1154                             if(ref[1] == 0)
1155                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1156                         }
1157                     }
1158                 }
1159             }
1160         }
     /* Step 3b: temporal direct prediction. The list 1 reference index is
      * always 0; the list 0 index is the co-located index mapped through
      * map_col_to_list0, the list 0 vector is the co-located vector scaled by
      * dist_scale_factor/256 and the list 1 vector is the list 0 vector minus
      * the co-located vector. */
1161     }else{ /* direct temporal mv pred */
1162         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1163         const int *dist_scale_factor = h->dist_scale_factor;
1164
     /* field macroblocks of MBAFF frames use the per-field tables */
1165         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1166             map_col_to_list0[0] = h->map_col_to_list0_field[0];
1167             map_col_to_list0[1] = h->map_col_to_list0_field[1];
1168             dist_scale_factor = h->dist_scale_factor_field;
1169         }
     /* current and co-located macroblock differ in field/frame coding */
1170         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1171             /* FIXME assumes direct_8x8_inference == 1 */
1172             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1173             int ref_shift= FRAME_MBAFF ? y_shift : 1;
1174
1175             for(i8=0; i8<4; i8++){
1176                 const int x8 = i8&1;
1177                 const int y8 = i8>>1;
1178                 int ref0, scale;
1179                 const int16_t (*l1mv)[2]= l1mv0;
1180
1181                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1182                     continue;
1183                 h->sub_mb_type[i8] = sub_mb_type;
1184
1185                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
     /* intra co-located macroblock: reference 0 and zero vectors */
1186                 if(IS_INTRA(mb_type_col[y8])){
1187                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1188                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1189                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1190                     continue;
1191                 }
1192
     /* use the co-located list 0 data if it has a reference,
      * else its list 1 data */
1193                 ref0 = l1ref0[x8 + y8*b8_stride];
1194                 if(ref0 >= 0)
1195                     ref0 = map_col_to_list0[0][ref0*2>>ref_shift];
1196                 else{
1197                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride]*2>>ref_shift];
1198                     l1mv= l1mv1;
1199                 }
1200                 scale = dist_scale_factor[ref0];
1201                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1202
1203                 {
1204                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
     /* vertical component doubled (y_shift 2) for a frame
      * macroblock, halved (y_shift 0) for a field macroblock */
1205                     int my_col = (mv_col[1]<<y_shift)/2;
1206                     int mx = (scale * mv_col[0] + 128) >> 8;
1207                     int my = (scale * my_col + 128) >> 8;
1208                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1209                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1210                 }
1211             }
1212             return;
1213         }
1214
1215         /* one-to-one mv scaling */
1216
1217         if(IS_16X16(*mb_type)){
1218             int ref, mv0, mv1;
1219
1220             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1221             if(IS_INTRA(mb_type_col[0])){
1222                 ref=mv0=mv1=0;
1223             }else{
1224                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1225                                                 : map_col_to_list0[1][l1ref1[0]];
1226                 const int scale = dist_scale_factor[ref0];
1227                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1228                 int mv_l0[2];
1229                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1230                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1231                 ref= ref0;
1232                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1233                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1234             }
1235             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1236             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1237             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1238         }else{
1239             for(i8=0; i8<4; i8++){
1240                 const int x8 = i8&1;
1241                 const int y8 = i8>>1;
1242                 int ref0, scale;
1243                 const int16_t (*l1mv)[2]= l1mv0;
1244
1245                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1246                     continue;
1247                 h->sub_mb_type[i8] = sub_mb_type;
1248                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1249                 if(IS_INTRA(mb_type_col[0])){
1250                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1251                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1252                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1253                     continue;
1254                 }
1255
1256                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1257                 if(ref0 >= 0)
1258                     ref0 = map_col_to_list0[0][ref0];
1259                 else{
1260                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1261                     l1mv= l1mv1;
1262                 }
1263                 scale = dist_scale_factor[ref0];
1264
1265                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
     /* 8x8 sub macroblock: one co-located vector for the whole 8x8
      * block; otherwise every 4x4 block scales its own vector */
1266                 if(IS_SUB_8X8(sub_mb_type)){
1267                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1268                     int mx = (scale * mv_col[0] + 128) >> 8;
1269                     int my = (scale * mv_col[1] + 128) >> 8;
1270                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1271                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1272                 }else
1273                 for(i4=0; i4<4; i4++){
1274                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1275                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1276                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1277                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1278                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1279                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1280                 }
1281             }
1282         }
1283     }
1284 }
1285
1286 static inline void write_back_motion(H264Context *h, int mb_type){
1287     MpegEncContext * const s = &h->s;
1288     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1289     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1290     int list;
1291
1292     if(!USES_LIST(mb_type, 0))
1293         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1294
1295     for(list=0; list<h->list_count; list++){
1296         int y;
1297         if(!USES_LIST(mb_type, list))
1298             continue;
1299
1300         for(y=0; y<4; y++){
1301             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1302             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1303         }
1304         if( h->pps.cabac ) {
1305             if(IS_SKIP(mb_type))
1306                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1307             else
1308             for(y=0; y<4; y++){
1309                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1310                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1311             }
1312         }
1313
1314         {
1315             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1316             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1317             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1318             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1319             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1320         }
1321     }
1322
1323     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1324         if(IS_8X8(mb_type)){
1325             uint8_t *direct_table = &h->direct_table[b8_xy];
1326             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1327             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1328             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1329         }
1330     }
1331 }
1332
1333 /**
1334  * Decodes a network abstraction layer unit.
1335  * @param consumed is the number of bytes used as input
1336  * @param length is the length of the array
1337  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1338  * @returns decoded bytes, might be src+1 if no escapes
1339  */
1340 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1341     int i, si, di;
1342     uint8_t *dst;
1343     int bufidx;
1344
1345 //    src[0]&0x80;                //forbidden bit
1346     h->nal_ref_idc= src[0]>>5;
1347     h->nal_unit_type= src[0]&0x1F;
1348
1349     src++; length--;
1350 #if 0
1351     for(i=0; i<length; i++)
1352         printf("%2X ", src[i]);
1353 #endif
1354     for(i=0; i+1<length; i+=2){
1355         if(src[i]) continue;
1356         if(i>0 && src[i-1]==0) i--;
1357         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1358             if(src[i+2]!=3){
1359                 /* startcode, so we must be past the end */
1360                 length=i;
1361             }
1362             break;
1363         }
1364     }
1365
1366     if(i>=length-1){ //no escaped 0
1367         *dst_length= length;
1368         *consumed= length+1; //+1 for the header
1369         return src;
1370     }
1371
1372     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1373     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1374     dst= h->rbsp_buffer[bufidx];
1375
1376     if (dst == NULL){
1377         return NULL;
1378     }
1379
1380 //printf("decoding esc\n");
1381     si=di=0;
1382     while(si<length){
1383         //remove escapes (very rare 1:2^22)
1384         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1385             if(src[si+2]==3){ //escape
1386                 dst[di++]= 0;
1387                 dst[di++]= 0;
1388                 si+=3;
1389                 continue;
1390             }else //next start code
1391                 break;
1392         }
1393
1394         dst[di++]= src[si++];
1395     }
1396
1397     *dst_length= di;
1398     *consumed= si + 1;//+1 for the header
1399 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1400     return dst;
1401 }
1402
1403 /**
1404  * identifies the exact end of the bitstream
1405  * @return the length of the trailing, or 0 if damaged
1406  */
1407 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1408     int v= *src;
1409     int r;
1410
1411     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1412
1413     for(r=1; r<9; r++){
1414         if(v&1) return r;
1415         v>>=1;
1416     }
1417     return 0;
1418 }
1419
1420 /**
1421  * IDCT transforms the 16 dc values and dequantizes them.
1422  * @param qp quantization parameter
1423  */
1424 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1425 #define stride 16
1426     int i;
1427     int temp[16]; //FIXME check if this is a good idea
1428     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1429     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1430
1431 //memset(block, 64, 2*256);
1432 //return;
1433     for(i=0; i<4; i++){
1434         const int offset= y_offset[i];
1435         const int z0= block[offset+stride*0] + block[offset+stride*4];
1436         const int z1= block[offset+stride*0] - block[offset+stride*4];
1437         const int z2= block[offset+stride*1] - block[offset+stride*5];
1438         const int z3= block[offset+stride*1] + block[offset+stride*5];
1439
1440         temp[4*i+0]= z0+z3;
1441         temp[4*i+1]= z1+z2;
1442         temp[4*i+2]= z1-z2;
1443         temp[4*i+3]= z0-z3;
1444     }
1445
1446     for(i=0; i<4; i++){
1447         const int offset= x_offset[i];
1448         const int z0= temp[4*0+i] + temp[4*2+i];
1449         const int z1= temp[4*0+i] - temp[4*2+i];
1450         const int z2= temp[4*1+i] - temp[4*3+i];
1451         const int z3= temp[4*1+i] + temp[4*3+i];
1452
1453         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1454         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1455         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1456         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1457     }
1458 }
1459
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; uses the same dc positions as the dequantizing inverse
 * transform and halves every result.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM * const in= block + y_offset[i];
        const int in0= in[stride*0];
        const int in1= in[stride*1];
        const int in4= in[stride*4];
        const int in5= in[stride*5];
        const int z0= in0 + in4;
        const int z1= in0 - in4;
        const int z2= in1 - in5;
        const int z3= in1 + in5;

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        DCTELEM * const out= block + x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (z0 + z3)>>1;
        out[stride*2 ]= (z1 + z2)>>1;
        out[stride*8 ]= (z1 - z2)>>1;
        out[stride*10]= (z0 - z3)>>1;
    }
}
#endif
1499
1500 #undef xStride
1501 #undef stride
1502
1503 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1504     const int stride= 16*2;
1505     const int xStride= 16;
1506     int a,b,c,d,e;
1507
1508     a= block[stride*0 + xStride*0];
1509     b= block[stride*0 + xStride*1];
1510     c= block[stride*1 + xStride*0];
1511     d= block[stride*1 + xStride*1];
1512
1513     e= a-b;
1514     a= a+b;
1515     b= c-d;
1516     c= c+d;
1517
1518     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1519     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1520     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1521     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1522 }
1523
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int xStride= 16;
    const int stride= 16*2;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + stride;
    const int sum0 = row0[0] + row0[xStride];
    const int diff0= row0[0] - row0[xStride];
    const int sum1 = row1[0] + row1[xStride];
    const int diff1= row1[0] - row1[xStride];

    row0[0      ]= (sum0  + sum1 );
    row0[xStride]= (diff0 + diff1);
    row1[0      ]= (sum0  - sum1 );
    row1[xStride]= (diff0 - diff1);
}
#endif
1546
1547 /**
1548  * gets the chroma qp.
1549  */
1550 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1551     return h->pps.chroma_qp_table[t][qscale];
1552 }
1553
1554 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1555 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1556 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1557     int i;
1558     const int * const quant_table= quant_coeff[qscale];
1559     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1560     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1561     const unsigned int threshold2= (threshold1<<1);
1562     int last_non_zero;
1563
1564     if(separate_dc){
1565         if(qscale<=18){
1566             //avoid overflows
1567             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1568             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1569             const unsigned int dc_threshold2= (dc_threshold1<<1);
1570
1571             int level= block[0]*quant_coeff[qscale+18][0];
1572             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1573                 if(level>0){
1574                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1575                     block[0]= level;
1576                 }else{
1577                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1578                     block[0]= -level;
1579                 }
1580 //                last_non_zero = i;
1581             }else{
1582                 block[0]=0;
1583             }
1584         }else{
1585             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1586             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1587             const unsigned int dc_threshold2= (dc_threshold1<<1);
1588
1589             int level= block[0]*quant_table[0];
1590             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1591                 if(level>0){
1592                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1593                     block[0]= level;
1594                 }else{
1595                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1596                     block[0]= -level;
1597                 }
1598 //                last_non_zero = i;
1599             }else{
1600                 block[0]=0;
1601             }
1602         }
1603         last_non_zero= 0;
1604         i=1;
1605     }else{
1606         last_non_zero= -1;
1607         i=0;
1608     }
1609
1610     for(; i<16; i++){
1611         const int j= scantable[i];
1612         int level= block[j]*quant_table[j];
1613
1614 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1615 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1616         if(((unsigned)(level+threshold1))>threshold2){
1617             if(level>0){
1618                 level= (bias + level)>>QUANT_SHIFT;
1619                 block[j]= level;
1620             }else{
1621                 level= (bias - level)>>QUANT_SHIFT;
1622                 block[j]= -level;
1623             }
1624             last_non_zero = i;
1625         }else{
1626             block[j]=0;
1627         }
1628     }
1629
1630     return last_non_zero;
1631 }
1632
1633 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1634                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1635                            int src_x_offset, int src_y_offset,
1636                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1637     MpegEncContext * const s = &h->s;
1638     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1639     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1640     const int luma_xy= (mx&3) + ((my&3)<<2);
1641     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1642     uint8_t * src_cb, * src_cr;
1643     int extra_width= h->emu_edge_width;
1644     int extra_height= h->emu_edge_height;
1645     int emu=0;
1646     const int full_mx= mx>>2;
1647     const int full_my= my>>2;
1648     const int pic_width  = 16*s->mb_width;
1649     const int pic_height = 16*s->mb_height >> MB_FIELD;
1650
1651     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1652         return;
1653
1654     if(mx&7) extra_width -= 3;
1655     if(my&7) extra_height -= 3;
1656
1657     if(   full_mx < 0-extra_width
1658        || full_my < 0-extra_height
1659        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1660        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1661         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1662             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1663         emu=1;
1664     }
1665
1666     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1667     if(!square){
1668         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1669     }
1670
1671     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1672
1673     if(MB_FIELD){
1674         // chroma offset when predicting from a field of opposite parity
1675         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1676         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1677     }
1678     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1679     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1680
1681     if(emu){
1682         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1683             src_cb= s->edge_emu_buffer;
1684     }
1685     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1686
1687     if(emu){
1688         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1689             src_cr= s->edge_emu_buffer;
1690     }
1691     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1692 }
1693
1694 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1699                            int list0, int list1){
1700     MpegEncContext * const s = &h->s;
1701     qpel_mc_func *qpix_op=  qpix_put;
1702     h264_chroma_mc_func chroma_op= chroma_put;
1703
1704     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1705     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1707     x_offset += 8*s->mb_x;
1708     y_offset += 8*(s->mb_y >> MB_FIELD);
1709
1710     if(list0){
1711         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1712         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1713                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1714                            qpix_op, chroma_op);
1715
1716         qpix_op=  qpix_avg;
1717         chroma_op= chroma_avg;
1718     }
1719
1720     if(list1){
1721         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1722         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1723                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1724                            qpix_op, chroma_op);
1725     }
1726 }
1727
1728 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1729                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1730                            int x_offset, int y_offset,
1731                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1732                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1733                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1734                            int list0, int list1){
1735     MpegEncContext * const s = &h->s;
1736
1737     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1738     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1739     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1740     x_offset += 8*s->mb_x;
1741     y_offset += 8*(s->mb_y >> MB_FIELD);
1742
1743     if(list0 && list1){
1744         /* don't optimize for luma-only case, since B-frames usually
1745          * use implicit weights => chroma too. */
1746         uint8_t *tmp_cb = s->obmc_scratchpad;
1747         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1748         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1749         int refn0 = h->ref_cache[0][ scan8[n] ];
1750         int refn1 = h->ref_cache[1][ scan8[n] ];
1751
1752         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1753                     dest_y, dest_cb, dest_cr,
1754                     x_offset, y_offset, qpix_put, chroma_put);
1755         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1756                     tmp_y, tmp_cb, tmp_cr,
1757                     x_offset, y_offset, qpix_put, chroma_put);
1758
1759         if(h->use_weight == 2){
1760             int weight0 = h->implicit_weight[refn0][refn1];
1761             int weight1 = 64 - weight0;
1762             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1763             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1764             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1765         }else{
1766             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1767                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1768                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1769             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1770                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1771                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1772             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1773                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1774                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1775         }
1776     }else{
1777         int list = list1 ? 1 : 0;
1778         int refn = h->ref_cache[list][ scan8[n] ];
1779         Picture *ref= &h->ref_list[list][refn];
1780         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1781                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1782                     qpix_put, chroma_put);
1783
1784         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1785                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1786         if(h->use_weight_chroma){
1787             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1788                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1789             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1790                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1791         }
1792     }
1793 }
1794
1795 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1796                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1797                            int x_offset, int y_offset,
1798                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1799                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1800                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1801                            int list0, int list1){
1802     if((h->use_weight==2 && list0 && list1
1803         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1804        || h->use_weight==1)
1805         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1806                          x_offset, y_offset, qpix_put, chroma_put,
1807                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1808     else
1809         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1810                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1811 }
1812
1813 static inline void prefetch_motion(H264Context *h, int list){
1814     /* fetch pixels for estimated mv 4 macroblocks ahead
1815      * optimized for 64byte cache lines */
1816     MpegEncContext * const s = &h->s;
1817     const int refn = h->ref_cache[list][scan8[0]];
1818     if(refn >= 0){
1819         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1820         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1821         uint8_t **src= h->ref_list[list][refn].data;
1822         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1823         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1824         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1825         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1826     }
1827 }
1828
1829 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1830                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1831                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1832                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1833     MpegEncContext * const s = &h->s;
1834     const int mb_xy= h->mb_xy;
1835     const int mb_type= s->current_picture.mb_type[mb_xy];
1836
1837     assert(IS_INTER(mb_type));
1838
1839     prefetch_motion(h, 0);
1840
1841     if(IS_16X16(mb_type)){
1842         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1843                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1844                 &weight_op[0], &weight_avg[0],
1845                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1846     }else if(IS_16X8(mb_type)){
1847         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1848                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1849                 &weight_op[1], &weight_avg[1],
1850                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1851         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1852                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1853                 &weight_op[1], &weight_avg[1],
1854                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1855     }else if(IS_8X16(mb_type)){
1856         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1857                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1858                 &weight_op[2], &weight_avg[2],
1859                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1860         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1861                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1862                 &weight_op[2], &weight_avg[2],
1863                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1864     }else{
1865         int i;
1866
1867         assert(IS_8X8(mb_type));
1868
1869         for(i=0; i<4; i++){
1870             const int sub_mb_type= h->sub_mb_type[i];
1871             const int n= 4*i;
1872             int x_offset= (i&1)<<2;
1873             int y_offset= (i&2)<<1;
1874
1875             if(IS_SUB_8X8(sub_mb_type)){
1876                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1877                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1878                     &weight_op[3], &weight_avg[3],
1879                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1880             }else if(IS_SUB_8X4(sub_mb_type)){
1881                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1882                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1883                     &weight_op[4], &weight_avg[4],
1884                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1885                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1886                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1887                     &weight_op[4], &weight_avg[4],
1888                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1889             }else if(IS_SUB_4X8(sub_mb_type)){
1890                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1891                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1892                     &weight_op[5], &weight_avg[5],
1893                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1894                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1895                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1896                     &weight_op[5], &weight_avg[5],
1897                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1898             }else{
1899                 int j;
1900                 assert(IS_SUB_4X4(sub_mb_type));
1901                 for(j=0; j<4; j++){
1902                     int sub_x_offset= x_offset + 2*(j&1);
1903                     int sub_y_offset= y_offset +   (j&2);
1904                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1905                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1906                         &weight_op[6], &weight_avg[6],
1907                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1908                 }
1909             }
1910         }
1911     }
1912
1913     prefetch_motion(h, 1);
1914 }
1915
1916 static av_cold void decode_init_vlc(void){
1917     static int done = 0;
1918
1919     if (!done) {
1920         int i;
1921         int offset;
1922         done = 1;
1923
1924         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1925         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1926         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1927                  &chroma_dc_coeff_token_len [0], 1, 1,
1928                  &chroma_dc_coeff_token_bits[0], 1, 1,
1929                  INIT_VLC_USE_NEW_STATIC);
1930
1931         offset = 0;
1932         for(i=0; i<4; i++){
1933             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1934             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1935             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1936                      &coeff_token_len [i][0], 1, 1,
1937                      &coeff_token_bits[i][0], 1, 1,
1938                      INIT_VLC_USE_NEW_STATIC);
1939             offset += coeff_token_vlc_tables_size[i];
1940         }
1941         /*
1942          * This is a one time safety check to make sure that
1943          * the packed static coeff_token_vlc table sizes
1944          * were initialized correctly.
1945          */
1946         assert(offset == sizeof(coeff_token_vlc_tables)/(sizeof(VLC_TYPE)*2));
1947
1948         for(i=0; i<3; i++){
1949             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1950             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1951             init_vlc(&chroma_dc_total_zeros_vlc[i],
1952                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1953                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1954                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957         for(i=0; i<15; i++){
1958             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1959             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1960             init_vlc(&total_zeros_vlc[i],
1961                      TOTAL_ZEROS_VLC_BITS, 16,
1962                      &total_zeros_len [i][0], 1, 1,
1963                      &total_zeros_bits[i][0], 1, 1,
1964                      INIT_VLC_USE_NEW_STATIC);
1965         }
1966
1967         for(i=0; i<6; i++){
1968             run_vlc[i].table = run_vlc_tables[i];
1969             run_vlc[i].table_allocated = run_vlc_tables_size;
1970             init_vlc(&run_vlc[i],
1971                      RUN_VLC_BITS, 7,
1972                      &run_len [i][0], 1, 1,
1973                      &run_bits[i][0], 1, 1,
1974                      INIT_VLC_USE_NEW_STATIC);
1975         }
1976         run7_vlc.table = run7_vlc_table,
1977         run7_vlc.table_allocated = run7_vlc_table_size;
1978         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1979                  &run_len [6][0], 1, 1,
1980                  &run_bits[6][0], 1, 1,
1981                  INIT_VLC_USE_NEW_STATIC);
1982     }
1983 }
1984
1985 static void free_tables(H264Context *h){
1986     int i;
1987     H264Context *hx;
1988     av_freep(&h->intra4x4_pred_mode);
1989     av_freep(&h->chroma_pred_mode_table);
1990     av_freep(&h->cbp_table);
1991     av_freep(&h->mvd_table[0]);
1992     av_freep(&h->mvd_table[1]);
1993     av_freep(&h->direct_table);
1994     av_freep(&h->non_zero_count);
1995     av_freep(&h->slice_table_base);
1996     h->slice_table= NULL;
1997
1998     av_freep(&h->mb2b_xy);
1999     av_freep(&h->mb2b8_xy);
2000
2001     for(i = 0; i < MAX_SPS_COUNT; i++)
2002         av_freep(h->sps_buffers + i);
2003
2004     for(i = 0; i < MAX_PPS_COUNT; i++)
2005         av_freep(h->pps_buffers + i);
2006
2007     for(i = 0; i < h->s.avctx->thread_count; i++) {
2008         hx = h->thread_context[i];
2009         if(!hx) continue;
2010         av_freep(&hx->top_borders[1]);
2011         av_freep(&hx->top_borders[0]);
2012         av_freep(&hx->s.obmc_scratchpad);
2013     }
2014 }
2015
2016 static void init_dequant8_coeff_table(H264Context *h){
2017     int i,q,x;
2018     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2019     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2020     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2021
2022     for(i=0; i<2; i++ ){
2023         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2024             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2025             break;
2026         }
2027
2028         for(q=0; q<52; q++){
2029             int shift = ff_div6[q];
2030             int idx = ff_rem6[q];
2031             for(x=0; x<64; x++)
2032                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2033                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2034                     h->pps.scaling_matrix8[i][x]) << shift;
2035         }
2036     }
2037 }
2038
2039 static void init_dequant4_coeff_table(H264Context *h){
2040     int i,j,q,x;
2041     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2042     for(i=0; i<6; i++ ){
2043         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2044         for(j=0; j<i; j++){
2045             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2046                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2047                 break;
2048             }
2049         }
2050         if(j<i)
2051             continue;
2052
2053         for(q=0; q<52; q++){
2054             int shift = ff_div6[q] + 2;
2055             int idx = ff_rem6[q];
2056             for(x=0; x<16; x++)
2057                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2058                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2059                     h->pps.scaling_matrix4[i][x]) << shift;
2060         }
2061     }
2062 }
2063
2064 static void init_dequant_tables(H264Context *h){
2065     int i,x;
2066     init_dequant4_coeff_table(h);
2067     if(h->pps.transform_8x8_mode)
2068         init_dequant8_coeff_table(h);
2069     if(h->sps.transform_bypass){
2070         for(i=0; i<6; i++)
2071             for(x=0; x<16; x++)
2072                 h->dequant4_coeff[i][0][x] = 1<<6;
2073         if(h->pps.transform_8x8_mode)
2074             for(i=0; i<2; i++)
2075                 for(x=0; x<64; x++)
2076                     h->dequant8_coeff[i][0][x] = 1<<6;
2077     }
2078 }
2079
2080
2081 /**
2082  * allocates tables.
2083  * needs width/height
2084  */
2085 static int alloc_tables(H264Context *h){
2086     MpegEncContext * const s = &h->s;
2087     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2088     int x,y;
2089
2090     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2091
2092     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2093     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2094     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2095
2096     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2097     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2098     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2099     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2100
2101     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2102     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2103
2104     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2105     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2106     for(y=0; y<s->mb_height; y++){
2107         for(x=0; x<s->mb_width; x++){
2108             const int mb_xy= x + y*s->mb_stride;
2109             const int b_xy = 4*x + 4*y*h->b_stride;
2110             const int b8_xy= 2*x + 2*y*h->b8_stride;
2111
2112             h->mb2b_xy [mb_xy]= b_xy;
2113             h->mb2b8_xy[mb_xy]= b8_xy;
2114         }
2115     }
2116
2117     s->obmc_scratchpad = NULL;
2118
2119     if(!h->dequant4_coeff[0])
2120         init_dequant_tables(h);
2121
2122     return 0;
2123 fail:
2124     free_tables(h);
2125     return -1;
2126 }
2127
2128 /**
2129  * Mimic alloc_tables(), but for every context thread.
2130  */
2131 static void clone_tables(H264Context *dst, H264Context *src){
2132     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2133     dst->non_zero_count           = src->non_zero_count;
2134     dst->slice_table              = src->slice_table;
2135     dst->cbp_table                = src->cbp_table;
2136     dst->mb2b_xy                  = src->mb2b_xy;
2137     dst->mb2b8_xy                 = src->mb2b8_xy;
2138     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2139     dst->mvd_table[0]             = src->mvd_table[0];
2140     dst->mvd_table[1]             = src->mvd_table[1];
2141     dst->direct_table             = src->direct_table;
2142
2143     dst->s.obmc_scratchpad = NULL;
2144     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2145 }
2146
2147 /**
2148  * Init context
2149  * Allocate buffers which are not shared amongst multiple threads.
2150  */
2151 static int context_init(H264Context *h){
2152     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2153     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2154
2155     return 0;
2156 fail:
2157     return -1; // free_tables will clean up for us
2158 }
2159
2160 static av_cold void common_init(H264Context *h){
2161     MpegEncContext * const s = &h->s;
2162
2163     s->width = s->avctx->width;
2164     s->height = s->avctx->height;
2165     s->codec_id= s->avctx->codec->id;
2166
2167     ff_h264_pred_init(&h->hpc, s->codec_id);
2168
2169     h->dequant_coeff_pps= -1;
2170     s->unrestricted_mv=1;
2171     s->decode=1; //FIXME
2172
2173     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2174     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2175 }
2176
2177 static av_cold int decode_init(AVCodecContext *avctx){
2178     H264Context *h= avctx->priv_data;
2179     MpegEncContext * const s = &h->s;
2180
2181     MPV_decode_defaults(s);
2182
2183     s->avctx = avctx;
2184     common_init(h);
2185
2186     s->out_format = FMT_H264;
2187     s->workaround_bugs= avctx->workaround_bugs;
2188
2189     // set defaults
2190 //    s->decode_mb= ff_h263_decode_mb;
2191     s->quarter_sample = 1;
2192     s->low_delay= 1;
2193
2194     if(avctx->codec_id == CODEC_ID_SVQ3)
2195         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2196     else
2197         avctx->pix_fmt= PIX_FMT_YUV420P;
2198
2199     decode_init_vlc();
2200
2201     if(avctx->extradata_size > 0 && avctx->extradata &&
2202        *(char *)avctx->extradata == 1){
2203         h->is_avc = 1;
2204         h->got_avcC = 0;
2205     } else {
2206         h->is_avc = 0;
2207     }
2208
2209     h->thread_context[0] = h;
2210     h->outputed_poc = INT_MIN;
2211     return 0;
2212 }
2213
2214 static int frame_start(H264Context *h){
2215     MpegEncContext * const s = &h->s;
2216     int i;
2217
2218     if(MPV_frame_start(s, s->avctx) < 0)
2219         return -1;
2220     ff_er_frame_start(s);
2221     /*
2222      * MPV_frame_start uses pict_type to derive key_frame.
2223      * This is incorrect for H.264; IDR markings must be used.
2224      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2225      * See decode_nal_units().
2226      */
2227     s->current_picture_ptr->key_frame= 0;
2228
2229     assert(s->linesize && s->uvlinesize);
2230
2231     for(i=0; i<16; i++){
2232         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2233         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2234     }
2235     for(i=0; i<4; i++){
2236         h->block_offset[16+i]=
2237         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+16+i]=
2239         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2240     }
2241
2242     /* can't be in alloc_tables because linesize isn't known there.
2243      * FIXME: redo bipred weight to not require extra buffer? */
2244     for(i = 0; i < s->avctx->thread_count; i++)
2245         if(!h->thread_context[i]->s.obmc_scratchpad)
2246             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2247
2248     /* some macroblocks will be accessed before they're available */
2249     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2250         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2251
2252 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2253
2254     // We mark the current picture as non-reference after allocating it, so
2255     // that if we break out due to an error it can be released automatically
2256     // in the next MPV_frame_start().
2257     // SVQ3 as well as most other codecs have only last/next/current and thus
2258     // get released even with set reference, besides SVQ3 and others do not
2259     // mark frames as reference later "naturally".
2260     if(s->codec_id != CODEC_ID_SVQ3)
2261         s->current_picture_ptr->reference= 0;
2262
2263     s->current_picture_ptr->field_poc[0]=
2264     s->current_picture_ptr->field_poc[1]= INT_MAX;
2265     assert(s->current_picture_ptr->long_ref==0);
2266
2267     return 0;
2268 }
2269
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273
2274     src_y  -=   linesize;
2275     src_cb -= uvlinesize;
2276     src_cr -= uvlinesize;
2277
2278     // There are two lines saved, the line above the the top macroblock of a pair,
2279     // and the line above the bottom macroblock
2280     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2281     for(i=1; i<17; i++){
2282         h->left_border[i]= src_y[15+i*  linesize];
2283     }
2284
2285     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2286     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2287
2288     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2289         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2290         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2291         for(i=1; i<9; i++){
2292             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2293             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2294         }
2295         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2296         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2297     }
2298 }
2299
2300 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2301     MpegEncContext * const s = &h->s;
2302     int temp8, i;
2303     uint64_t temp64;
2304     int deblock_left;
2305     int deblock_top;
2306     int mb_xy;
2307
2308     if(h->deblocking_filter == 2) {
2309         mb_xy = h->mb_xy;
2310         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2311         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2312     } else {
2313         deblock_left = (s->mb_x > 0);
2314         deblock_top =  (s->mb_y > 0);
2315     }
2316
2317     src_y  -=   linesize + 1;
2318     src_cb -= uvlinesize + 1;
2319     src_cr -= uvlinesize + 1;
2320
2321 #define XCHG(a,b,t,xchg)\
2322 t= a;\
2323 if(xchg)\
2324     a= b;\
2325 b= t;
2326
2327     if(deblock_left){
2328         for(i = !deblock_top; i<17; i++){
2329             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2330         }
2331     }
2332
2333     if(deblock_top){
2334         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2335         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2336         if(s->mb_x+1 < s->mb_width){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2338         }
2339     }
2340
2341     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2342         if(deblock_left){
2343             for(i = !deblock_top; i<9; i++){
2344                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2345                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2346             }
2347         }
2348         if(deblock_top){
2349             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2351         }
2352     }
2353 }
2354
2355 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2356     MpegEncContext * const s = &h->s;
2357     int i;
2358
2359     src_y  -= 2 *   linesize;
2360     src_cb -= 2 * uvlinesize;
2361     src_cr -= 2 * uvlinesize;
2362
2363     // There are two lines saved, the line above the the top macroblock of a pair,
2364     // and the line above the bottom macroblock
2365     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2366     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2367     for(i=2; i<34; i++){
2368         h->left_border[i]= src_y[15+i*  linesize];
2369     }
2370
2371     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2372     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2373     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2374     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2375
2376     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2377         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2378         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2379         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2380         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2381         for(i=2; i<18; i++){
2382             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2383             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2384         }
2385         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2386         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2387         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2388         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2389     }
2390 }
2391
2392 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2393     MpegEncContext * const s = &h->s;
2394     int temp8, i;
2395     uint64_t temp64;
2396     int deblock_left = (s->mb_x > 0);
2397     int deblock_top  = (s->mb_y > 1);
2398
2399     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2400
2401     src_y  -= 2 *   linesize + 1;
2402     src_cb -= 2 * uvlinesize + 1;
2403     src_cr -= 2 * uvlinesize + 1;
2404
2405 #define XCHG(a,b,t,xchg)\
2406 t= a;\
2407 if(xchg)\
2408     a= b;\
2409 b= t;
2410
2411     if(deblock_left){
2412         for(i = (!deblock_top)<<1; i<34; i++){
2413             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2414         }
2415     }
2416
2417     if(deblock_top){
2418         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2419         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2420         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2421         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2422         if(s->mb_x+1 < s->mb_width){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2425         }
2426     }
2427
2428     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2429         if(deblock_left){
2430             for(i = (!deblock_top) << 1; i<18; i++){
2431                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2432                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2433             }
2434         }
2435         if(deblock_top){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2439             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2440         }
2441     }
2442 }
2443
/**
 * Reconstructs the current macroblock into the current picture and, when
 * h->deblocking_filter is set, runs the loop filter on it.
 *
 * Steps, in order: compute the destination pointers, select frame or field
 * line strides, pick the residual-add functions, then either copy PCM
 * samples or do intra prediction / hl_motion() followed by adding the luma
 * and chroma residuals, and finally back up the borders and call
 * filter_mb() or filter_mb_fast().
 *
 * @param h      decoder context; s->mb_x, s->mb_y and h->mb_xy select the
 *               macroblock
 * @param simple nonzero skips the MB_FIELD/FRAME_MBAFF, PCM, gray and
 *               non-H.264 (svq3_*) paths; the callers in this file pass the
 *               constants 0 and 1
 */
2444 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2445     MpegEncContext * const s = &h->s;
2446     const int mb_x= s->mb_x;
2447     const int mb_y= s->mb_y;
2448     const int mb_xy= h->mb_xy;
2449     const int mb_type= s->current_picture.mb_type[mb_xy];
2450     uint8_t  *dest_y, *dest_cb, *dest_cr;
2451     int linesize, uvlinesize /*dct_offset*/;
2452     int i;
2453     int *block_offset = &h->block_offset[0];
2454     const unsigned int bottom = mb_y & 1;
2455     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2456     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2457     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2458
         // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
2459     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2460     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2461     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2462
2463     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2464     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2465
         // field macroblock: use doubled line strides and the second half of
         // block_offset[] (the entries built with the doubled strides)
2466     if (!simple && MB_FIELD) {
2467         linesize   = h->mb_linesize   = s->linesize * 2;
2468         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2469         block_offset = &h->block_offset[24];
             // odd macroblock row: step back 15 luma / 7 chroma lines
2470         if(mb_y&1){ //FIXME move out of this function?
2471             dest_y -= s->linesize*15;
2472             dest_cb-= s->uvlinesize*7;
2473             dest_cr-= s->uvlinesize*7;
2474         }
             // rewrite the cached reference indices as (16+ref)^(mb_y&1)
             // NOTE(review): presumably this selects field references in MBAFF - confirm
2475         if(FRAME_MBAFF) {
2476             int list;
2477             for(list=0; list<h->list_count; list++){
2478                 if(!USES_LIST(mb_type, list))
2479                     continue;
2480                 if(IS_16X16(mb_type)){
2481                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2482                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2483                 }else{
2484                     for(i=0; i<16; i+=4){
2485                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2486                         int ref = h->ref_cache[list][scan8[i]];
2487                         if(ref >= 0)
2488                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2489                     }
2490                 }
2491             }
2492         }
2493     } else {
2494         linesize   = h->mb_linesize   = s->linesize;
2495         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2496 //        dct_offset = s->linesize * 16;
2497     }
2498
         // select the luma residual-add functions: plain pixel add for
         // transform bypass, otherwise the 8x8 or 4x4 idct
2499     if(transform_bypass){
2500         idct_dc_add =
2501         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2502     }else if(IS_8x8DCT(mb_type)){
2503         idct_dc_add = s->dsp.h264_idct8_dc_add;
2504         idct_add = s->dsp.h264_idct8_add;
2505     }else{
2506         idct_dc_add = s->dsp.h264_idct_dc_add;
2507         idct_add = s->dsp.h264_idct_add;
2508     }
2509
         // MBAFF + deblocking + intra: exchange the pair borders (xchg=1)
         // before prediction; done for the top macroblock, or for the bottom
         // one when the macroblock above it is not intra
2510     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2511        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2512         int mbt_y = mb_y&~1;
2513         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2514         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2515         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2516         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2517     }
2518
         // PCM macroblock: samples are copied straight out of h->mb
2519     if (!simple && IS_INTRA_PCM(mb_type)) {
2520         for (i=0; i<16; i++) {
2521             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2522         }
2523         for (i=0; i<8; i++) {
2524             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2525             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2526         }
2527     } else {
2528         if(IS_INTRA(mb_type)){
                 // non-MBAFF: exchange the borders (xchg=1) around intra
                 // prediction; the matching call with xchg=0 follows below
2529             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2530                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2531
2532             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2533                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2534                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2535             }
2536
2537             if(IS_INTRA4x4(mb_type)){
2538                 if(simple || !s->encoding){
                         // intra NxN: predict each block, then immediately
                         // add its residual
2539                     if(IS_8x8DCT(mb_type)){
2540                         for(i=0; i<16; i+=4){
2541                             uint8_t * const ptr= dest_y + block_offset[i];
2542                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2543                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2544                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2545                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2546                             if(nnz){
2547                                 if(nnz == 1 && h->mb[i*16])
2548                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2549                                 else
2550                                     idct_add(ptr, h->mb + i*16, linesize);
2551                             }
2552                         }
2553                     }else
2554                     for(i=0; i<16; i++){
2555                         uint8_t * const ptr= dest_y + block_offset[i];
2556                         uint8_t *topright;
2557                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2558                         int nnz, tr;
2559
                             // these two modes read the top-right samples; if
                             // they are unavailable, the sample at ptr[3 - linesize]
                             // is replicated four times instead
2560                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2561                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2562                             assert(mb_y || linesize <= block_offset[i]);
2563                             if(!topright_avail){
2564                                 tr= ptr[3 - linesize]*0x01010101;
2565                                 topright= (uint8_t*) &tr;
2566                             }else
2567                                 topright= ptr + 4 - linesize;
2568                         }else
2569                             topright= NULL;
2570
2571                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2572                         nnz = h->non_zero_count_cache[ scan8[i] ];
2573                         if(nnz){
2574                             if(is_h264){
2575                                 if(nnz == 1 && h->mb[i*16])
2576                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2577                                 else
2578                                     idct_add(ptr, h->mb + i*16, linesize);
2579                             }else
2580                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2581                         }
2582                     }
2583                 }
2584             }else{
                     // intra 16x16: predict the whole block and process the
                     // luma DC coefficients; the residuals are added further below
2585                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2586                 if(is_h264){
2587                     if(!transform_bypass)
2588                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2589                 }else
2590                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2591             }
2592             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2593                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2594         }else if(is_h264){
                 // inter macroblock: motion compensation
2595             hl_motion(h, dest_y, dest_cb, dest_cr,
2596                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2597                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2598                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2599         }
2600
2601
             // luma residual for everything except intra NxN (already added above)
2602         if(!IS_INTRA4x4(mb_type)){
2603             if(is_h264){
2604                 if(IS_INTRA16x16(mb_type)){
2605                     for(i=0; i<16; i++){
2606                         if(h->non_zero_count_cache[ scan8[i] ])
2607                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2608                         else if(h->mb[i*16])
2609                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2610                     }
2611                 }else{
                         // step 4 visits one block index per 8x8 transform block
2612                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2613                     for(i=0; i<16; i+=di){
2614                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2615                         if(nnz){
2616                             if(nnz==1 && h->mb[i*16])
2617                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2618                             else
2619                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2620                         }
2621                     }
2622                 }
2623             }else{
2624                 for(i=0; i<16; i++){
2625                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2626                         uint8_t * const ptr= dest_y + block_offset[i];
2627                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2628                     }
2629                 }
2630             }
2631         }
2632
             // chroma residual: blocks 16..19 go to dest_cb, 20..23 to dest_cr
2633         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2634             uint8_t *dest[2] = {dest_cb, dest_cr};
2635             if(transform_bypass){
2636                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2637             }else{
2638                 idct_add = s->dsp.h264_idct_add;
2639                 idct_dc_add = s->dsp.h264_idct_dc_add;
2640                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2641                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2642             }
2643             if(is_h264){
2644                 for(i=16; i<16+8; i++){
2645                     if(h->non_zero_count_cache[ scan8[i] ])
2646                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2647                     else if(h->mb[i*16])
2648                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2649                 }
2650             }else{
2651                 for(i=16; i<16+8; i++){
2652                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2653                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2654                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2655                     }
2656                 }
2657             }
2658         }
2659     }
         // loop filter
2660     if(h->deblocking_filter) {
2661         if (!simple && FRAME_MBAFF) {
2662             //FIXME try deblocking one mb at a time?
2663             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 // note: mb_y and mb_xy are shadowed here and refer to the
                 // macroblock row above the current one
2664             const int mb_y = s->mb_y - 1;
2665             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2666             const int mb_xy= mb_x + mb_y*s->mb_stride;
2667             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2668             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                 // the pair is filtered only from the call for its bottom macroblock
2669             if (!bottom) return;
2670             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2671             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2672             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2673
2674             if(IS_INTRA(mb_type_top | mb_type_bottom))
2675                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2676
2677             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2678             // deblock a pair
2679             // top
                 // s->mb_y and h->mb_xy are temporarily moved to the top
                 // macroblock and restored before the bottom one is filtered
2680             s->mb_y--; h->mb_xy -= s->mb_stride;
2681             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2682             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2683             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2684             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2685             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2686             // bottom
2687             s->mb_y++; h->mb_xy += s->mb_stride;
2688             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2689             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2690             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2691             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2692             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2693         } else {
                 // save the unfiltered borders first, then filter this macroblock
2694             tprintf(h->s.avctx, "call filter_mb\n");
2695             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2696             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2697             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2698             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2699             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2700         }
2701     }
2702 }
2703
2704 /**
2705  * Process a macroblock; this case avoids checks for expensive uncommon cases.
      * Calls hl_decode_mb_internal() with simple=1; hl_decode_mb() selects this
      * variant only when none of its "complex" conditions hold.
2706  */
2707 static void hl_decode_mb_simple(H264Context *h){
2708     hl_decode_mb_internal(h, 1);
2709 }
2710
2711 /**
2712  * Process a macroblock; this handles edge cases, such as interlacing.
      * Calls hl_decode_mb_internal() with simple=0, which keeps every check
      * enabled. Marked av_noinline.
2713  */
2714 static void av_noinline hl_decode_mb_complex(H264Context *h){
2715     hl_decode_mb_internal(h, 0);
2716 }
2717
2718 static void hl_decode_mb(H264Context *h){
2719     MpegEncContext * const s = &h->s;
2720     const int mb_xy= h->mb_xy;
2721     const int mb_type= s->current_picture.mb_type[mb_xy];
2722     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2723                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2724
2725     if(ENABLE_H264_ENCODER && !s->decode)
2726         return;
2727
2728     if (is_complex)
2729         hl_decode_mb_complex(h);
2730     else hl_decode_mb_simple(h);
2731 }
2732
2733 static void pic_as_field(Picture *pic, const int parity){
2734     int i;
2735     for (i = 0; i < 4; ++i) {
2736         if (parity == PICT_BOTTOM_FIELD)
2737             pic->data[i] += pic->linesize[i];
2738         pic->reference = parity;
2739         pic->linesize[i] *= 2;
2740     }
2741     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2742 }
2743
2744 static int split_field_copy(Picture *dest, Picture *src,
2745                             int parity, int id_add){
2746     int match = !!(src->reference & parity);
2747
2748     if (match) {
2749         *dest = *src;
2750         if(parity != PICT_FRAME){
2751             pic_as_field(dest, parity);
2752             dest->pic_id *= 2;
2753             dest->pic_id += id_add;
2754         }
2755     }
2756
2757     return match;
2758 }
2759
2760 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2761     int i[2]={0};
2762     int index=0;
2763
2764     while(i[0]<len || i[1]<len){
2765         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2766             i[0]++;
2767         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2768             i[1]++;
2769         if(i[0] < len){
2770             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2771             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2772         }
2773         if(i[1] < len){
2774             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2775             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2776         }
2777     }
2778
2779     return index;
2780 }
2781
2782 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2783     int i, best_poc;
2784     int out_i= 0;
2785
2786     for(;;){
2787         best_poc= dir ? INT_MIN : INT_MAX;
2788
2789         for(i=0; i<len; i++){
2790             const int poc= src[i]->poc;
2791             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2792                 best_poc= poc;
2793                 sorted[out_i]= src[i];
2794             }
2795         }
2796         if(best_poc == (dir ? INT_MIN : INT_MAX))
2797             break;
2798         limit= sorted[out_i++]->poc - dir;
2799     }
2800     return out_i;
2801 }
2802
2803 /**
2804  * fills the default_ref_list.
2805  */
2806 static int fill_default_ref_list(H264Context *h){
2807     MpegEncContext * const s = &h->s;
2808     int i, len;
2809
2810     if(h->slice_type_nos==FF_B_TYPE){
2811         Picture *sorted[32];
2812         int cur_poc, list;
2813         int lens[2];
2814
2815         if(FIELD_PICTURE)
2816             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2817         else
2818             cur_poc= s->current_picture_ptr->poc;
2819
2820         for(list= 0; list<2; list++){
2821             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2822             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2823             assert(len<=32);
2824             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2825             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2826             assert(len<=32);
2827
2828             if(len < h->ref_count[list])
2829                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2830             lens[list]= len;
2831         }
2832
2833         if(lens[0] == lens[1] && lens[1] > 1){
2834             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2835             if(i == lens[0])
2836                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2837         }
2838     }else{
2839         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2840         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2841         assert(len <= 32);
2842         if(len < h->ref_count[0])
2843             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2844     }
2845 #ifdef TRACE
2846     for (i=0; i<h->ref_count[0]; i++) {
2847         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2848     }
2849     if(h->slice_type_nos==FF_B_TYPE){
2850         for (i=0; i<h->ref_count[1]; i++) {
2851             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2852         }
2853     }
2854 #endif
2855     return 0;
2856 }
2857
2858 static void print_short_term(H264Context *h);
2859 static void print_long_term(H264Context *h);
2860
2861 /**
2862  * Extract structure information about the picture described by pic_num in
2863  * the current decoding context (frame or field). Note that pic_num is
2864  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2865  * @param pic_num picture number for which to extract structure information
2866  * @param structure one of PICT_XXX describing structure of picture
2867  *                      with pic_num
2868  * @return frame number (short term) or long term index of picture
2869  *         described by pic_num
2870  */
2871 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2872     MpegEncContext * const s = &h->s;
2873
2874     *structure = s->picture_structure;
2875     if(FIELD_PICTURE){
2876         if (!(pic_num & 1))
2877             /* opposite field */
2878             *structure ^= PICT_FRAME;
2879         pic_num >>= 1;
2880     }
2881
2882     return pic_num;
2883 }
2884
2885 static int decode_ref_pic_list_reordering(H264Context *h){
2886     MpegEncContext * const s = &h->s;
2887     int list, index, pic_structure;
2888
2889     print_short_term(h);
2890     print_long_term(h);
2891
2892     for(list=0; list<h->list_count; list++){
2893         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2894
2895         if(get_bits1(&s->gb)){
2896             int pred= h->curr_pic_num;
2897
2898             for(index=0; ; index++){
2899                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2900                 unsigned int pic_id;
2901                 int i;
2902                 Picture *ref = NULL;
2903
2904                 if(reordering_of_pic_nums_idc==3)
2905                     break;
2906
2907                 if(index >= h->ref_count[list]){
2908                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2909                     return -1;
2910                 }
2911
2912                 if(reordering_of_pic_nums_idc<3){
2913                     if(reordering_of_pic_nums_idc<2){
2914                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2915                         int frame_num;
2916
2917                         if(abs_diff_pic_num > h->max_pic_num){
2918                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2919                             return -1;
2920                         }
2921
2922                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2923                         else                                pred+= abs_diff_pic_num;
2924                         pred &= h->max_pic_num - 1;
2925
2926                         frame_num = pic_num_extract(h, pred, &pic_structure);
2927
2928                         for(i= h->short_ref_count-1; i>=0; i--){
2929                             ref = h->short_ref[i];
2930                             assert(ref->reference);
2931                             assert(!ref->long_ref);
2932                             if(
2933                                    ref->frame_num == frame_num &&
2934                                    (ref->reference & pic_structure)
2935                               )
2936                                 break;
2937                         }
2938                         if(i>=0)
2939                             ref->pic_id= pred;
2940                     }else{
2941                         int long_idx;
2942                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2943
2944                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2945
2946                         if(long_idx>31){
2947                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2948                             return -1;
2949                         }
2950                         ref = h->long_ref[long_idx];
2951                         assert(!(ref && !ref->reference));
2952                         if(ref && (ref->reference & pic_structure)){
2953                             ref->pic_id= pic_id;
2954                             assert(ref->long_ref);
2955                             i=0;
2956                         }else{
2957                             i=-1;
2958                         }
2959                     }
2960
2961                     if (i < 0) {
2962                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2963                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2964                     } else {
2965                         for(i=index; i+1<h->ref_count[list]; i++){
2966                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2967                                 break;
2968                         }
2969                         for(; i > index; i--){
2970                             h->ref_list[list][i]= h->ref_list[list][i-1];
2971                         }
2972                         h->ref_list[list][index]= *ref;
2973                         if (FIELD_PICTURE){
2974                             pic_as_field(&h->ref_list[list][index], pic_structure);
2975                         }
2976                     }
2977                 }else{
2978                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2979                     return -1;
2980                 }
2981             }
2982         }
2983     }
2984     for(list=0; list<h->list_count; list++){
2985         for(index= 0; index < h->ref_count[list]; index++){
2986             if(!h->ref_list[list][index].data[0]){
2987                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2988                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2989             }
2990         }
2991     }
2992
2993     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
2994         direct_dist_scale_factor(h);
2995     direct_ref_list_init(h);
2996     return 0;
2997 }
2998
2999 static void fill_mbaff_ref_list(H264Context *h){
3000     int list, i, j;
3001     for(list=0; list<2; list++){ //FIXME try list_count
3002         for(i=0; i<h->ref_count[list]; i++){
3003             Picture *frame = &h->ref_list[list][i];
3004             Picture *field = &h->ref_list[list][16+2*i];
3005             field[0] = *frame;
3006             for(j=0; j<3; j++)
3007                 field[0].linesize[j] <<= 1;
3008             field[0].reference = PICT_TOP_FIELD;
3009             field[1] = field[0];
3010             for(j=0; j<3; j++)
3011                 field[1].data[j] += frame->linesize[j];
3012             field[1].reference = PICT_BOTTOM_FIELD;
3013
3014             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3015             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3016             for(j=0; j<2; j++){
3017                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3018                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3019             }
3020         }
3021     }
3022     for(j=0; j<h->ref_count[1]; j++){
3023         for(i=0; i<h->ref_count[0]; i++)
3024             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3025         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3026         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3027     }
3028 }
3029
3030 static int pred_weight_table(H264Context *h){
3031     MpegEncContext * const s = &h->s;
3032     int list, i;
3033     int luma_def, chroma_def;
3034
3035     h->use_weight= 0;
3036     h->use_weight_chroma= 0;
3037     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3038     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3039     luma_def = 1<<h->luma_log2_weight_denom;
3040     chroma_def = 1<<h->chroma_log2_weight_denom;
3041
3042     for(list=0; list<2; list++){
3043         for(i=0; i<h->ref_count[list]; i++){
3044             int luma_weight_flag, chroma_weight_flag;
3045
3046             luma_weight_flag= get_bits1(&s->gb);
3047             if(luma_weight_flag){
3048                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3049                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3050                 if(   h->luma_weight[list][i] != luma_def
3051                    || h->luma_offset[list][i] != 0)
3052                     h->use_weight= 1;
3053             }else{
3054                 h->luma_weight[list][i]= luma_def;
3055                 h->luma_offset[list][i]= 0;
3056             }
3057
3058             if(CHROMA){
3059                 chroma_weight_flag= get_bits1(&s->gb);
3060                 if(chroma_weight_flag){
3061                     int j;
3062                     for(j=0; j<2; j++){
3063                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3064                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3065                         if(   h->chroma_weight[list][i][j] != chroma_def
3066                         || h->chroma_offset[list][i][j] != 0)
3067                             h->use_weight_chroma= 1;
3068                     }
3069                 }else{
3070                     int j;
3071                     for(j=0; j<2; j++){
3072                         h->chroma_weight[list][i][j]= chroma_def;
3073                         h->chroma_offset[list][i][j]= 0;
3074                     }
3075                 }
3076             }
3077         }
3078         if(h->slice_type_nos != FF_B_TYPE) break;
3079     }
3080     h->use_weight= h->use_weight || h->use_weight_chroma;
3081     return 0;
3082 }
3083
3084 static void implicit_weight_table(H264Context *h){
3085     MpegEncContext * const s = &h->s;
3086     int ref0, ref1;
3087     int cur_poc = s->current_picture_ptr->poc;
3088
3089     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3090        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3091         h->use_weight= 0;
3092         h->use_weight_chroma= 0;
3093         return;
3094     }
3095
3096     h->use_weight= 2;
3097     h->use_weight_chroma= 2;
3098     h->luma_log2_weight_denom= 5;
3099     h->chroma_log2_weight_denom= 5;
3100
3101     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3102         int poc0 = h->ref_list[0][ref0].poc;
3103         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3104             int poc1 = h->ref_list[1][ref1].poc;
3105             int td = av_clip(poc1 - poc0, -128, 127);
3106             if(td){
3107                 int tb = av_clip(cur_poc - poc0, -128, 127);
3108                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3109                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3110                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3111                     h->implicit_weight[ref0][ref1] = 32;
3112                 else
3113                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3114             }else
3115                 h->implicit_weight[ref0][ref1] = 32;
3116         }
3117     }
3118 }
3119
3120 /**
3121  * Mark a picture as no longer needed for reference. The refmask
3122  * argument allows unreferencing of individual fields or the whole frame.
3123  * If the picture becomes entirely unreferenced, but is being held for
3124  * display purposes, it is marked as such.
3125  * @param refmask mask of fields to unreference; the mask is bitwise
3126  *                anded with the reference marking of pic
3127  * @return non-zero if pic becomes entirely unreferenced (except possibly
3128  *         for display purposes) zero if one of the fields remains in
3129  *         reference
3130  */
3131 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3132     int i;
3133     if (pic->reference &= refmask) {
3134         return 0;
3135     } else {
3136         for(i = 0; h->delayed_pic[i]; i++)
3137             if(pic == h->delayed_pic[i]){
3138                 pic->reference=DELAYED_PIC_REF;
3139                 break;
3140             }
3141         return 1;
3142     }
3143 }
3144
3145 /**
3146  * instantaneous decoder refresh.
3147  */
3148 static void idr(H264Context *h){
3149     int i;
3150
3151     for(i=0; i<16; i++){
3152         remove_long(h, i, 0);
3153     }
3154     assert(h->long_ref_count==0);
3155
3156     for(i=0; i<h->short_ref_count; i++){
3157         unreference_pic(h, h->short_ref[i], 0);
3158         h->short_ref[i]= NULL;
3159     }
3160     h->short_ref_count=0;
3161     h->prev_frame_num= 0;
3162     h->prev_frame_num_offset= 0;
3163     h->prev_poc_msb=
3164     h->prev_poc_lsb= 0;
3165 }
3166
3167 /* forget old pics after a seek */
3168 static void flush_dpb(AVCodecContext *avctx){
3169     H264Context *h= avctx->priv_data;
3170     int i;
3171     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3172         if(h->delayed_pic[i])
3173             h->delayed_pic[i]->reference= 0;
3174         h->delayed_pic[i]= NULL;
3175     }
3176     h->outputed_poc= INT_MIN;
3177     idr(h);
3178     if(h->s.current_picture_ptr)
3179         h->s.current_picture_ptr->reference= 0;
3180     h->s.first_field= 0;
3181     ff_mpeg_flush(avctx);
3182 }
3183
3184 /**
3185  * Find a Picture in the short term reference list by frame number.
3186  * @param frame_num frame number to search for
3187  * @param idx the index into h->short_ref where returned picture is found
3188  *            undefined if no picture found.
3189  * @return pointer to the found picture, or NULL if no pic with the provided
3190  *                 frame number is found
3191  */
3192 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3193     MpegEncContext * const s = &h->s;
3194     int i;
3195
3196     for(i=0; i<h->short_ref_count; i++){
3197         Picture *pic= h->short_ref[i];
3198         if(s->avctx->debug&FF_DEBUG_MMCO)
3199             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3200         if(pic->frame_num == frame_num) {
3201             *idx = i;
3202             return pic;
3203         }
3204     }
3205     return NULL;
3206 }
3207
3208 /**
3209  * Remove a picture from the short term reference list by its index in
3210  * that list.  This does no checking on the provided index; it is assumed
3211  * to be valid. Other list entries are shifted down.
3212  * @param i index into h->short_ref of picture to remove.
3213  */
3214 static void remove_short_at_index(H264Context *h, int i){
3215     assert(i >= 0 && i < h->short_ref_count);
3216     h->short_ref[i]= NULL;
3217     if (--h->short_ref_count)
3218         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3219 }
3220
3221 /**
3222  *
3223  * @return the removed picture or NULL if an error occurs
3224  */
3225 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3226     MpegEncContext * const s = &h->s;
3227     Picture *pic;
3228     int i;
3229
3230     if(s->avctx->debug&FF_DEBUG_MMCO)
3231         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3232
3233     pic = find_short(h, frame_num, &i);
3234     if (pic){
3235         if(unreference_pic(h, pic, ref_mask))
3236         remove_short_at_index(h, i);
3237     }
3238
3239     return pic;
3240 }
3241
3242 /**
3243  * Remove a picture from the long term reference list by its index in
3244  * that list.
3245  * @return the removed picture or NULL if an error occurs
3246  */
3247 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3248     Picture *pic;
3249
3250     pic= h->long_ref[i];
3251     if (pic){
3252         if(unreference_pic(h, pic, ref_mask)){
3253             assert(h->long_ref[i]->long_ref == 1);
3254             h->long_ref[i]->long_ref= 0;
3255             h->long_ref[i]= NULL;
3256             h->long_ref_count--;
3257         }
3258     }
3259
3260     return pic;
3261 }
3262
3263 /**
3264  * print short term list
3265  */
3266 static void print_short_term(H264Context *h) {
3267     uint32_t i;
3268     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3269         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3270         for(i=0; i<h->short_ref_count; i++){
3271             Picture *pic= h->short_ref[i];
3272             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3273         }
3274     }
3275 }
3276
3277 /**
3278  * print long term list
3279  */
3280 static void print_long_term(H264Context *h) {
3281     uint32_t i;
3282     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3283         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3284         for(i = 0; i < 16; i++){
3285             Picture *pic= h->long_ref[i];
3286             if (pic) {
3287                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3288             }
3289         }
3290     }
3291 }
3292
3293 /**
3294  * Executes the reference picture marking (memory management control operations).
3295  */
3296 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3297     MpegEncContext * const s = &h->s;
3298     int i, j;
3299     int current_ref_assigned=0;
3300     Picture *pic;
3301
3302     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3303         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3304
3305     for(i=0; i<mmco_count; i++){
3306         int structure, frame_num;
3307         if(s->avctx->debug&FF_DEBUG_MMCO)
3308             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3309
3310         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3311            || mmco[i].opcode == MMCO_SHORT2LONG){
3312             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3313             pic = find_short(h, frame_num, &j);
3314             if(!pic){
3315                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3316                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3317                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3318                 continue;
3319             }
3320         }
3321
3322         switch(mmco[i].opcode){
3323         case MMCO_SHORT2UNUSED:
3324             if(s->avctx->debug&FF_DEBUG_MMCO)
3325                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3326             remove_short(h, frame_num, structure ^ PICT_FRAME);
3327             break;
3328         case MMCO_SHORT2LONG:
3329                 if (h->long_ref[mmco[i].long_arg] != pic)
3330                     remove_long(h, mmco[i].long_arg, 0);
3331
3332                 remove_short_at_index(h, j);
3333                 h->long_ref[ mmco[i].long_arg ]= pic;
3334                 if (h->long_ref[ mmco[i].long_arg ]){
3335                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3336                     h->long_ref_count++;
3337                 }
3338             break;
3339         case MMCO_LONG2UNUSED:
3340             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3341             pic = h->long_ref[j];
3342             if (pic) {
3343                 remove_long(h, j, structure ^ PICT_FRAME);
3344             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3345                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3346             break;
3347         case MMCO_LONG:
3348                     // Comment below left from previous code as it is an interresting note.
3349                     /* First field in pair is in short term list or
3350                      * at a different long term index.
3351                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3352                      * Report the problem and keep the pair where it is,
3353                      * and mark this field valid.
3354                      */
3355
3356             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3357                 remove_long(h, mmco[i].long_arg, 0);
3358
3359                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3360                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3361                 h->long_ref_count++;
3362             }
3363
3364             s->current_picture_ptr->reference |= s->picture_structure;
3365             current_ref_assigned=1;
3366             break;
3367         case MMCO_SET_MAX_LONG:
3368             assert(mmco[i].long_arg <= 16);
3369             // just remove the long term which index is greater than new max
3370             for(j = mmco[i].long_arg; j<16; j++){
3371                 remove_long(h, j, 0);
3372             }
3373             break;
3374         case MMCO_RESET:
3375             while(h->short_ref_count){
3376                 remove_short(h, h->short_ref[0]->frame_num, 0);
3377             }
3378             for(j = 0; j < 16; j++) {
3379                 remove_long(h, j, 0);
3380             }
3381             s->current_picture_ptr->poc=
3382             s->current_picture_ptr->field_poc[0]=
3383             s->current_picture_ptr->field_poc[1]=
3384             h->poc_lsb=
3385             h->poc_msb=
3386             h->frame_num=
3387             s->current_picture_ptr->frame_num= 0;
3388             break;
3389         default: assert(0);
3390         }
3391     }
3392
3393     if (!current_ref_assigned) {
3394         /* Second field of complementary field pair; the first field of
3395          * which is already referenced. If short referenced, it
3396          * should be first entry in short_ref. If not, it must exist
3397          * in long_ref; trying to put it on the short list here is an
3398          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3399          */
3400         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3401             /* Just mark the second field valid */
3402             s->current_picture_ptr->reference = PICT_FRAME;
3403         } else if (s->current_picture_ptr->long_ref) {
3404             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3405                                              "assignment for second field "
3406                                              "in complementary field pair "
3407                                              "(first field is long term)\n");
3408         } else {
3409             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3410             if(pic){
3411                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3412             }
3413
3414             if(h->short_ref_count)
3415                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3416
3417             h->short_ref[0]= s->current_picture_ptr;
3418             h->short_ref_count++;
3419             s->current_picture_ptr->reference |= s->picture_structure;
3420         }
3421     }
3422
3423     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3424
3425         /* We have too many reference frames, probably due to corrupted
3426          * stream. Need to discard one frame. Prevents overrun of the
3427          * short_ref and long_ref buffers.
3428          */
3429         av_log(h->s.avctx, AV_LOG_ERROR,
3430                "number of reference frames exceeds max (probably "
3431                "corrupt input), discarding one\n");
3432
3433         if (h->long_ref_count && !h->short_ref_count) {
3434             for (i = 0; i < 16; ++i)
3435                 if (h->long_ref[i])
3436                     break;
3437
3438             assert(i < 16);
3439             remove_long(h, i, 0);
3440         } else {
3441             pic = h->short_ref[h->short_ref_count - 1];
3442             remove_short(h, pic->frame_num, 0);
3443         }
3444     }
3445
3446     print_short_term(h);
3447     print_long_term(h);
3448     return 0;
3449 }
3450
3451 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3452     MpegEncContext * const s = &h->s;
3453     int i;
3454
3455     h->mmco_index= 0;
3456     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3457         s->broken_link= get_bits1(gb) -1;
3458         if(get_bits1(gb)){
3459             h->mmco[0].opcode= MMCO_LONG;
3460             h->mmco[0].long_arg= 0;
3461             h->mmco_index= 1;
3462         }
3463     }else{
3464         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3465             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3466                 MMCOOpcode opcode= get_ue_golomb(gb);
3467
3468                 h->mmco[i].opcode= opcode;
3469                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3470                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3471 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3472                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3473                         return -1;
3474                     }*/
3475                 }
3476                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3477                     unsigned int long_arg= get_ue_golomb(gb);
3478                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3479                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3480                         return -1;
3481                     }
3482                     h->mmco[i].long_arg= long_arg;
3483                 }
3484
3485                 if(opcode > (unsigned)MMCO_LONG){
3486                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3487                     return -1;
3488                 }
3489                 if(opcode == MMCO_END)
3490                     break;
3491             }
3492             h->mmco_index= i;
3493         }else{
3494             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3495
3496             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3497                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3498                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3499                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3500                 h->mmco_index= 1;
3501                 if (FIELD_PICTURE) {
3502                     h->mmco[0].short_pic_num *= 2;
3503                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3504                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3505                     h->mmco_index= 2;
3506                 }
3507             }
3508         }
3509     }
3510
3511     return 0;
3512 }
3513
3514 static int init_poc(H264Context *h){
3515     MpegEncContext * const s = &h->s;
3516     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3517     int field_poc[2];
3518     Picture *cur = s->current_picture_ptr;
3519
3520     h->frame_num_offset= h->prev_frame_num_offset;
3521     if(h->frame_num < h->prev_frame_num)
3522         h->frame_num_offset += max_frame_num;
3523
3524     if(h->sps.poc_type==0){
3525         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3526
3527         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3528             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3529         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3530             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3531         else
3532             h->poc_msb = h->prev_poc_msb;
3533 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3534         field_poc[0] =
3535         field_poc[1] = h->poc_msb + h->poc_lsb;
3536         if(s->picture_structure == PICT_FRAME)
3537             field_poc[1] += h->delta_poc_bottom;
3538     }else if(h->sps.poc_type==1){
3539         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3540         int i;
3541
3542         if(h->sps.poc_cycle_length != 0)
3543             abs_frame_num = h->frame_num_offset + h->frame_num;
3544         else
3545             abs_frame_num = 0;
3546
3547         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3548             abs_frame_num--;
3549
3550         expected_delta_per_poc_cycle = 0;
3551         for(i=0; i < h->sps.poc_cycle_length; i++)
3552             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3553
3554         if(abs_frame_num > 0){
3555             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3556             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3557
3558             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3559             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3560                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3561         } else
3562             expectedpoc = 0;
3563
3564         if(h->nal_ref_idc == 0)
3565             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3566
3567         field_poc[0] = expectedpoc + h->delta_poc[0];
3568         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3569
3570         if(s->picture_structure == PICT_FRAME)
3571             field_poc[1] += h->delta_poc[1];
3572     }else{
3573         int poc= 2*(h->frame_num_offset + h->frame_num);
3574
3575         if(!h->nal_ref_idc)
3576             poc--;
3577
3578         field_poc[0]= poc;
3579         field_poc[1]= poc;
3580     }
3581
3582     if(s->picture_structure != PICT_BOTTOM_FIELD)
3583         s->current_picture_ptr->field_poc[0]= field_poc[0];
3584     if(s->picture_structure != PICT_TOP_FIELD)
3585         s->current_picture_ptr->field_poc[1]= field_poc[1];
3586     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3587
3588     return 0;
3589 }
3590
3591
3592 /**
3593  * initialize scan tables
3594  */
3595 static void init_scan_tables(H264Context *h){
3596     MpegEncContext * const s = &h->s;
3597     int i;
3598     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3599         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3600         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3601     }else{
3602         for(i=0; i<16; i++){
3603 #define T(x) (x>>2) | ((x<<2) & 0xF)
3604             h->zigzag_scan[i] = T(zigzag_scan[i]);
3605             h-> field_scan[i] = T( field_scan[i]);
3606 #undef T
3607         }
3608     }
3609     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3610         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3611         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3612         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3613         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3614     }else{
3615         for(i=0; i<64; i++){
3616 #define T(x) (x>>3) | ((x&7)<<3)
3617             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3618             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3619             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3620             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3621 #undef T
3622         }
3623     }
3624     if(h->sps.transform_bypass){ //FIXME same ugly
3625         h->zigzag_scan_q0          = zigzag_scan;
3626         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3627         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3628         h->field_scan_q0           = field_scan;
3629         h->field_scan8x8_q0        = field_scan8x8;
3630         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3631     }else{
3632         h->zigzag_scan_q0          = h->zigzag_scan;
3633         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3634         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3635         h->field_scan_q0           = h->field_scan;
3636         h->field_scan8x8_q0        = h->field_scan8x8;
3637         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3638     }
3639 }
3640
3641 /**
3642  * Replicates H264 "master" context to thread contexts.
3643  */
3644 static void clone_slice(H264Context *dst, H264Context *src)
3645 {
3646     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3647     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3648     dst->s.current_picture      = src->s.current_picture;
3649     dst->s.linesize             = src->s.linesize;
3650     dst->s.uvlinesize           = src->s.uvlinesize;
3651     dst->s.first_field          = src->s.first_field;
3652
3653     dst->prev_poc_msb           = src->prev_poc_msb;
3654     dst->prev_poc_lsb           = src->prev_poc_lsb;
3655     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3656     dst->prev_frame_num         = src->prev_frame_num;
3657     dst->short_ref_count        = src->short_ref_count;
3658
3659     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3660     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3661     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3662     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3663
3664     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3665     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3666 }
3667
3668 /**
3669  * decodes a slice header.
3670  * This will also call MPV_common_init() and frame_start() as needed.
3671  *
3672  * @param h h264context
3673  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3674  *
3675  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3676  */
3677 static int decode_slice_header(H264Context *h, H264Context *h0){
3678     MpegEncContext * const s = &h->s;
3679     MpegEncContext * const s0 = &h0->s;
3680     unsigned int first_mb_in_slice;
3681     unsigned int pps_id;
3682     int num_ref_idx_active_override_flag;
3683     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3684     unsigned int slice_type, tmp, i, j;
3685     int default_ref_list_done = 0;
3686     int last_pic_structure;
3687
3688     s->dropable= h->nal_ref_idc == 0;
3689
3690     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3691         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3692         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3693     }else{
3694         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3695         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3696     }
3697
3698     first_mb_in_slice= get_ue_golomb(&s->gb);
3699
3700     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3701         h0->current_slice = 0;
3702         if (!s0->first_field)
3703             s->current_picture_ptr= NULL;
3704     }
3705
3706     slice_type= get_ue_golomb(&s->gb);
3707     if(slice_type > 9){
3708         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3709         return -1;
3710     }
3711     if(slice_type > 4){
3712         slice_type -= 5;
3713         h->slice_type_fixed=1;
3714     }else
3715         h->slice_type_fixed=0;
3716
3717     slice_type= slice_type_map[ slice_type ];
3718     if (slice_type == FF_I_TYPE
3719         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3720         default_ref_list_done = 1;
3721     }
3722     h->slice_type= slice_type;
3723     h->slice_type_nos= slice_type & 3;
3724
3725     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3726     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3727         av_log(h->s.avctx, AV_LOG_ERROR,
3728                "B picture before any references, skipping\n");
3729         return -1;
3730     }
3731
3732     pps_id= get_ue_golomb(&s->gb);
3733     if(pps_id>=MAX_PPS_COUNT){
3734         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3735         return -1;
3736     }
3737     if(!h0->pps_buffers[pps_id]) {
3738         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3739         return -1;
3740     }
3741     h->pps= *h0->pps_buffers[pps_id];
3742
3743     if(!h0->sps_buffers[h->pps.sps_id]) {
3744         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3745         return -1;
3746     }
3747     h->sps = *h0->sps_buffers[h->pps.sps_id];
3748
3749     if(h == h0 && h->dequant_coeff_pps != pps_id){
3750         h->dequant_coeff_pps = pps_id;
3751         init_dequant_tables(h);
3752     }
3753
3754     s->mb_width= h->sps.mb_width;
3755     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3756
3757     h->b_stride=  s->mb_width*4;
3758     h->b8_stride= s->mb_width*2;
3759
3760     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3761     if(h->sps.frame_mbs_only_flag)
3762         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3763     else
3764         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3765
3766     if (s->context_initialized
3767         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3768         if(h != h0)
3769             return -1;   // width / height changed during parallelized decoding
3770         free_tables(h);
3771         MPV_common_end(s);
3772     }
3773     if (!s->context_initialized) {
3774         if(h != h0)
3775             return -1;  // we cant (re-)initialize context during parallel decoding
3776         if (MPV_common_init(s) < 0)
3777             return -1;
3778         s->first_field = 0;
3779
3780         init_scan_tables(h);
3781         alloc_tables(h);
3782
3783         for(i = 1; i < s->avctx->thread_count; i++) {
3784             H264Context *c;
3785             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3786             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3787             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3788             c->sps = h->sps;
3789             c->pps = h->pps;
3790             init_scan_tables(c);
3791             clone_tables(c, h);
3792         }
3793
3794         for(i = 0; i < s->avctx->thread_count; i++)
3795             if(context_init(h->thread_context[i]) < 0)
3796                 return -1;
3797
3798         s->avctx->width = s->width;
3799         s->avctx->height = s->height;
3800         s->avctx->sample_aspect_ratio= h->sps.sar;
3801         if(!s->avctx->sample_aspect_ratio.den)
3802             s->avctx->sample_aspect_ratio.den = 1;
3803
3804         if(h->sps.timing_info_present_flag){
3805             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3806             if(h->x264_build > 0 && h->x264_build < 44)
3807                 s->avctx->time_base.den *= 2;
3808             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3809                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3810         }
3811     }
3812
3813     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3814
3815     h->mb_mbaff = 0;
3816     h->mb_aff_frame = 0;
3817     last_pic_structure = s0->picture_structure;
3818     if(h->sps.frame_mbs_only_flag){
3819         s->picture_structure= PICT_FRAME;
3820     }else{
3821         if(get_bits1(&s->gb)) { //field_pic_flag
3822             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3823         } else {
3824             s->picture_structure= PICT_FRAME;
3825             h->mb_aff_frame = h->sps.mb_aff;
3826         }
3827     }
3828     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3829
3830     if(h0->current_slice == 0){
3831         while(h->frame_num !=  h->prev_frame_num &&
3832               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3833             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3834             frame_start(h);
3835             h->prev_frame_num++;
3836             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3837             s->current_picture_ptr->frame_num= h->prev_frame_num;
3838             execute_ref_pic_marking(h, NULL, 0);
3839         }
3840
3841         /* See if we have a decoded first field looking for a pair... */
3842         if (s0->first_field) {
3843             assert(s0->current_picture_ptr);
3844             assert(s0->current_picture_ptr->data[0]);
3845             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3846
3847             /* figure out if we have a complementary field pair */
3848             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3849                 /*
3850                  * Previous field is unmatched. Don't display it, but let it
3851                  * remain for reference if marked as such.
3852                  */
3853                 s0->current_picture_ptr = NULL;
3854                 s0->first_field = FIELD_PICTURE;
3855
3856             } else {
3857                 if (h->nal_ref_idc &&
3858                         s0->current_picture_ptr->reference &&
3859                         s0->current_picture_ptr->frame_num != h->frame_num) {
3860                     /*
3861                      * This and previous field were reference, but had
3862                      * different frame_nums. Consider this field first in
3863                      * pair. Throw away previous field except for reference
3864                      * purposes.
3865                      */
3866                     s0->first_field = 1;
3867                     s0->current_picture_ptr = NULL;
3868
3869                 } else {
3870                     /* Second field in complementary pair */
3871                     s0->first_field = 0;
3872                 }
3873             }
3874
3875         } else {
3876             /* Frame or first field in a potentially complementary pair */
3877             assert(!s0->current_picture_ptr);
3878             s0->first_field = FIELD_PICTURE;
3879         }
3880
3881         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3882             s0->first_field = 0;
3883             return -1;
3884         }
3885     }
3886     if(h != h0)
3887         clone_slice(h, h0);
3888
3889     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3890
3891     assert(s->mb_num == s->mb_width * s->mb_height);
3892     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3893        first_mb_in_slice                    >= s->mb_num){
3894         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3895         return -1;
3896     }
3897     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3898     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3899     if (s->picture_structure == PICT_BOTTOM_FIELD)
3900         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3901     assert(s->mb_y < s->mb_height);
3902
3903     if(s->picture_structure==PICT_FRAME){
3904         h->curr_pic_num=   h->frame_num;
3905         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3906     }else{
3907         h->curr_pic_num= 2*h->frame_num + 1;
3908         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3909     }
3910
3911     if(h->nal_unit_type == NAL_IDR_SLICE){
3912         get_ue_golomb(&s->gb); /* idr_pic_id */
3913     }
3914
3915     if(h->sps.poc_type==0){
3916         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3917
3918         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3919             h->delta_poc_bottom= get_se_golomb(&s->gb);
3920         }
3921     }
3922
3923     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3924         h->delta_poc[0]= get_se_golomb(&s->gb);
3925
3926         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3927             h->delta_poc[1]= get_se_golomb(&s->gb);
3928     }
3929
3930     init_poc(h);
3931
3932     if(h->pps.redundant_pic_cnt_present){
3933         h->redundant_pic_count= get_ue_golomb(&s->gb);
3934     }
3935
3936     //set defaults, might be overridden a few lines later
3937     h->ref_count[0]= h->pps.ref_count[0];
3938     h->ref_count[1]= h->pps.ref_count[1];
3939
3940     if(h->slice_type_nos != FF_I_TYPE){
3941         if(h->slice_type_nos == FF_B_TYPE){
3942             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3943         }
3944         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3945
3946         if(num_ref_idx_active_override_flag){
3947             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3948             if(h->slice_type_nos==FF_B_TYPE)
3949                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3950
3951             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3952                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3953                 h->ref_count[0]= h->ref_count[1]= 1;
3954                 return -1;
3955             }
3956         }
3957         if(h->slice_type_nos == FF_B_TYPE)
3958             h->list_count= 2;
3959         else
3960             h->list_count= 1;
3961     }else
3962         h->list_count= 0;
3963
3964     if(!default_ref_list_done){
3965         fill_default_ref_list(h);
3966     }
3967
3968     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3969         return -1;
3970
3971     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3972        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3973         pred_weight_table(h);
3974     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3975         implicit_weight_table(h);
3976     else
3977         h->use_weight = 0;
3978
3979     if(h->nal_ref_idc)
3980         decode_ref_pic_marking(h0, &s->gb);
3981
3982     if(FRAME_MBAFF)
3983         fill_mbaff_ref_list(h);
3984
3985     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3986         tmp = get_ue_golomb(&s->gb);
3987         if(tmp > 2){
3988             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3989             return -1;
3990         }
3991         h->cabac_init_idc= tmp;
3992     }
3993
3994     h->last_qscale_diff = 0;
3995     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3996     if(tmp>51){
3997         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3998         return -1;
3999     }
4000     s->qscale= tmp;
4001     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4002     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4003     //FIXME qscale / qp ... stuff
4004     if(h->slice_type == FF_SP_TYPE){
4005         get_bits1(&s->gb); /* sp_for_switch_flag */
4006     }
4007     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4008         get_se_golomb(&s->gb); /* slice_qs_delta */
4009     }
4010
4011     h->deblocking_filter = 1;
4012     h->slice_alpha_c0_offset = 0;
4013     h->slice_beta_offset = 0;
4014     if( h->pps.deblocking_filter_parameters_present ) {
4015         tmp= get_ue_golomb(&s->gb);
4016         if(tmp > 2){
4017             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4018             return -1;
4019         }
4020         h->deblocking_filter= tmp;
4021         if(h->deblocking_filter < 2)
4022             h->deblocking_filter^= 1; // 1<->0
4023
4024         if( h->deblocking_filter ) {
4025             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4026             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4027         }
4028     }
4029
4030     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4031        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4032        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4033        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4034         h->deblocking_filter= 0;
4035
4036     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4037         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4038             /* Cheat slightly for speed:
4039                Do not bother to deblock across slices. */
4040             h->deblocking_filter = 2;
4041         } else {
4042             h0->max_contexts = 1;
4043             if(!h0->single_decode_warning) {
4044                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4045                 h0->single_decode_warning = 1;
4046             }
4047             if(h != h0)
4048                 return 1; // deblocking switched inside frame
4049         }
4050     }
4051
4052 #if 0 //FMO
4053     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4054         slice_group_change_cycle= get_bits(&s->gb, ?);
4055 #endif
4056
4057     h0->last_slice_type = slice_type;
4058     h->slice_num = ++h0->current_slice;
4059
4060     for(j=0; j<2; j++){
4061         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4062         ref2frm[0]=
4063         ref2frm[1]= -1;
4064         for(i=0; i<48; i++)
4065             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4066                           +(h->ref_list[j][i].reference&3);
4067     }
4068
4069     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4070     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4071
4072     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4073         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4074                h->slice_num,
4075                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4076                first_mb_in_slice,
4077                av_get_pict_type_char(h->slice_type),
4078                pps_id, h->frame_num,
4079                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4080                h->ref_count[0], h->ref_count[1],
4081                s->qscale,
4082                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4083                h->use_weight,
4084                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4085                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4086                );
4087     }
4088
4089     return 0;
4090 }
4091
4092 /**
4093  *
4094  */
4095 static inline int get_level_prefix(GetBitContext *gb){
4096     unsigned int buf;
4097     int log;
4098
4099     OPEN_READER(re, gb);
4100     UPDATE_CACHE(re, gb);
4101     buf=GET_CACHE(re, gb);
4102
4103     log= 32 - av_log2(buf);
4104 #ifdef TRACE
4105     print_bin(buf>>(32-log), log);
4106     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4107 #endif
4108
4109     LAST_SKIP_BITS(re, gb, log);
4110     CLOSE_READER(re, gb);
4111
4112     return log-1;
4113 }
4114
4115 static inline int get_dct8x8_allowed(H264Context *h){
4116     int i;
4117     for(i=0; i<4; i++){
4118         if(!IS_SUB_8X8(h->sub_mb_type[i])
4119            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4120             return 0;
4121     }
4122     return 1;
4123 }
4124
4125 /**
4126  * decodes a residual block.
4127  * @param n block index
4128  * @param scantable scantable
4129  * @param max_coeff number of coefficients in the block
4130  * @return <0 if an error occurred
4131  */
4132 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4133     MpegEncContext * const s = &h->s;
4134     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4135     int level[16];
4136     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4137
4138     //FIXME put trailing_onex into the context
4139
4140     if(n == CHROMA_DC_BLOCK_INDEX){
4141         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4142         total_coeff= coeff_token>>2;
4143     }else{
4144         if(n == LUMA_DC_BLOCK_INDEX){
4145             total_coeff= pred_non_zero_count(h, 0);
4146             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4147             total_coeff= coeff_token>>2;
4148         }else{
4149             total_coeff= pred_non_zero_count(h, n);
4150             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4151             total_coeff= coeff_token>>2;
4152             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4153         }
4154     }
4155
4156     //FIXME set last_non_zero?
4157
4158     if(total_coeff==0)
4159         return 0;
4160     if(total_coeff > (unsigned)max_coeff) {
4161         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4162         return -1;
4163     }
4164
4165     trailing_ones= coeff_token&3;
4166     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4167     assert(total_coeff<=16);
4168
4169     for(i=0; i<trailing_ones; i++){
4170         level[i]= 1 - 2*get_bits1(gb);
4171     }
4172
4173     if(i<total_coeff) {
4174         int level_code, mask;
4175         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4176         int prefix= get_level_prefix(gb);
4177
4178         //first coefficient has suffix_length equal to 0 or 1
4179         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4180             if(suffix_length)
4181                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4182             else
4183                 level_code= (prefix<<suffix_length); //part
4184         }else if(prefix==14){
4185             if(suffix_length)
4186                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4187             else
4188                 level_code= prefix + get_bits(gb, 4); //part
4189         }else{
4190             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4191             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4192             if(prefix>=16)
4193                 level_code += (1<<(prefix-3))-4096;
4194         }
4195
4196         if(trailing_ones < 3) level_code += 2;
4197
4198         suffix_length = 1;
4199         if(level_code > 5)
4200             suffix_length++;
4201         mask= -(level_code&1);
4202         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4203         i++;
4204
4205         //remaining coefficients have suffix_length > 0
4206         for(;i<total_coeff;i++) {
4207             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4208             prefix = get_level_prefix(gb);
4209             if(prefix<15){
4210                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4211             }else{
4212                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4213                 if(prefix>=16)
4214                     level_code += (1<<(prefix-3))-4096;
4215             }
4216             mask= -(level_code&1);
4217             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4218             if(level_code > suffix_limit[suffix_length])
4219                 suffix_length++;
4220         }
4221     }
4222
4223     if(total_coeff == max_coeff)
4224         zeros_left=0;
4225     else{
4226         if(n == CHROMA_DC_BLOCK_INDEX)
4227             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4228         else
4229             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4230     }
4231
4232     coeff_num = zeros_left + total_coeff - 1;
4233     j = scantable[coeff_num];
4234     if(n > 24){
4235         block[j] = level[0];
4236         for(i=1;i<total_coeff;i++) {
4237             if(zeros_left <= 0)
4238                 run_before = 0;
4239             else if(zeros_left < 7){
4240                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4241             }else{
4242                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4243             }
4244             zeros_left -= run_before;
4245             coeff_num -= 1 + run_before;
4246             j= scantable[ coeff_num ];
4247
4248             block[j]= level[i];
4249         }
4250     }else{
4251         block[j] = (level[0] * qmul[j] + 32)>>6;
4252         for(i=1;i<total_coeff;i++) {
4253             if(zeros_left <= 0)
4254                 run_before = 0;
4255             else if(zeros_left < 7){
4256                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4257             }else{
4258                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4259             }
4260             zeros_left -= run_before;
4261             coeff_num -= 1 + run_before;
4262             j= scantable[ coeff_num ];
4263
4264             block[j]= (level[i] * qmul[j] + 32)>>6;
4265         }
4266     }
4267
4268     if(zeros_left<0){
4269         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4270         return -1;
4271     }
4272
4273     return 0;
4274 }
4275
4276 static void predict_field_decoding_flag(H264Context *h){
4277     MpegEncContext * const s = &h->s;
4278     const int mb_xy= h->mb_xy;
4279     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4280                 ? s->current_picture.mb_type[mb_xy-1]
4281                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4282                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4283                 : 0;
4284     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4285 }
4286
4287 /**
4288  * decodes a P_SKIP or B_SKIP macroblock
4289  */
4290 static void decode_mb_skip(H264Context *h){
4291     MpegEncContext * const s = &h->s;
4292     const int mb_xy= h->mb_xy;
4293     int mb_type=0;
4294
4295     memset(h->non_zero_count[mb_xy], 0, 16);
4296     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4297
4298     if(MB_FIELD)
4299         mb_type|= MB_TYPE_INTERLACED;
4300
4301     if( h->slice_type_nos == FF_B_TYPE )
4302     {
4303         // just for fill_caches. pred_direct_motion will set the real mb_type
4304         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4305
4306         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4307         pred_direct_motion(h, &mb_type);
4308         mb_type|= MB_TYPE_SKIP;
4309     }
4310     else
4311     {
4312         int mx, my;
4313         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4314
4315         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4316         pred_pskip_motion(h, &mx, &my);
4317         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4318         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4319     }
4320
4321     write_back_motion(h, mb_type);
4322     s->current_picture.mb_type[mb_xy]= mb_type;
4323     s->current_picture.qscale_table[mb_xy]= s->qscale;
4324     h->slice_table[ mb_xy ]= h->slice_num;
4325     h->prev_mb_skipped= 1;
4326 }
4327
4328 /**
4329  * decodes a macroblock
4330  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4331  */
4332 static int decode_mb_cavlc(H264Context *h){
4333     MpegEncContext * const s = &h->s;
4334     int mb_xy;
4335     int partition_count;
4336     unsigned int mb_type, cbp;
4337     int dct8x8_allowed= h->pps.transform_8x8_mode;
4338
4339     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4340
4341     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4342
4343     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4344     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4345                 down the code */
4346     if(h->slice_type_nos != FF_I_TYPE){
4347         if(s->mb_skip_run==-1)
4348             s->mb_skip_run= get_ue_golomb(&s->gb);
4349
4350         if (s->mb_skip_run--) {
4351             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4352                 if(s->mb_skip_run==0)
4353                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4354                 else
4355                     predict_field_decoding_flag(h);
4356             }
4357             decode_mb_skip(h);
4358             return 0;
4359         }
4360     }
4361     if(FRAME_MBAFF){
4362         if( (s->mb_y&1) == 0 )
4363             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4364     }
4365
4366     h->prev_mb_skipped= 0;
4367
4368     mb_type= get_ue_golomb(&s->gb);
4369     if(h->slice_type_nos == FF_B_TYPE){
4370         if(mb_type < 23){
4371             partition_count= b_mb_type_info[mb_type].partition_count;
4372             mb_type=         b_mb_type_info[mb_type].type;
4373         }else{
4374             mb_type -= 23;
4375             goto decode_intra_mb;
4376         }
4377     }else if(h->slice_type_nos == FF_P_TYPE){
4378         if(mb_type < 5){
4379             partition_count= p_mb_type_info[mb_type].partition_count;
4380             mb_type=         p_mb_type_info[mb_type].type;
4381         }else{
4382             mb_type -= 5;
4383             goto decode_intra_mb;
4384         }
4385     }else{
4386        assert(h->slice_type_nos == FF_I_TYPE);
4387         if(h->slice_type == FF_SI_TYPE && mb_type)
4388             mb_type--;
4389 decode_intra_mb:
4390         if(mb_type > 25){
4391             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4392             return -1;
4393         }
4394         partition_count=0;
4395         cbp= i_mb_type_info[mb_type].cbp;
4396         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4397         mb_type= i_mb_type_info[mb_type].type;
4398     }
4399
4400     if(MB_FIELD)
4401         mb_type |= MB_TYPE_INTERLACED;
4402
4403     h->slice_table[ mb_xy ]= h->slice_num;
4404
4405     if(IS_INTRA_PCM(mb_type)){
4406         unsigned int x;
4407
4408         // We assume these blocks are very rare so we do not optimize it.
4409         align_get_bits(&s->gb);
4410
4411         // The pixels are stored in the same order as levels in h->mb array.
4412         for(x=0; x < (CHROMA ? 384 : 256); x++){
4413             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4414         }
4415
4416         // In deblocking, the quantizer is 0
4417         s->current_picture.qscale_table[mb_xy]= 0;
4418         // All coeffs are present
4419         memset(h->non_zero_count[mb_xy], 16, 16);
4420
4421         s->current_picture.mb_type[mb_xy]= mb_type;
4422         return 0;
4423     }
4424
4425     if(MB_MBAFF){
4426         h->ref_count[0] <<= 1;
4427         h->ref_count[1] <<= 1;
4428     }
4429
4430     fill_caches(h, mb_type, 0);
4431
4432     //mb_pred
4433     if(IS_INTRA(mb_type)){
4434         int pred_mode;
4435 //            init_top_left_availability(h);
4436         if(IS_INTRA4x4(mb_type)){
4437             int i;
4438             int di = 1;
4439             if(dct8x8_allowed && get_bits1(&s->gb)){
4440                 mb_type |= MB_TYPE_8x8DCT;
4441                 di = 4;
4442             }
4443
4444 //                fill_intra4x4_pred_table(h);
4445             for(i=0; i<16; i+=di){
4446                 int mode= pred_intra_mode(h, i);
4447
4448                 if(!get_bits1(&s->gb)){
4449                     const int rem_mode= get_bits(&s->gb, 3);
4450                     mode = rem_mode + (rem_mode >= mode);
4451                 }
4452
4453                 if(di==4)
4454                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4455                 else
4456                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4457             }
4458             write_back_intra_pred_mode(h);
4459             if( check_intra4x4_pred_mode(h) < 0)
4460                 return -1;
4461         }else{
4462             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4463             if(h->intra16x16_pred_mode < 0)
4464                 return -1;
4465         }
4466         if(CHROMA){
4467             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4468             if(pred_mode < 0)
4469                 return -1;
4470             h->chroma_pred_mode= pred_mode;
4471         }
4472     }else if(partition_count==4){
4473         int i, j, sub_partition_count[4], list, ref[2][4];
4474
4475         if(h->slice_type_nos == FF_B_TYPE){
4476             for(i=0; i<4; i++){
4477                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4478                 if(h->sub_mb_type[i] >=13){
4479                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4480                     return -1;
4481                 }
4482                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4483                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4484             }
4485             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4486                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4487                 pred_direct_motion(h, &mb_type);
4488                 h->ref_cache[0][scan8[4]] =
4489                 h->ref_cache[1][scan8[4]] =
4490                 h->ref_cache[0][scan8[12]] =
4491                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4492             }
4493         }else{
4494             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4495             for(i=0; i<4; i++){
4496                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4497                 if(h->sub_mb_type[i] >=4){
4498                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4499                     return -1;
4500                 }
4501                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4502                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4503             }
4504         }
4505
4506         for(list=0; list<h->list_count; list++){
4507             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4508             for(i=0; i<4; i++){
4509                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4510                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4511                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4512                     if(tmp>=ref_count){
4513                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4514                         return -1;
4515                     }
4516                     ref[list][i]= tmp;
4517                 }else{
4518                  //FIXME
4519                     ref[list][i] = -1;
4520                 }
4521             }
4522         }
4523
4524         if(dct8x8_allowed)
4525             dct8x8_allowed = get_dct8x8_allowed(h);
4526
4527         for(list=0; list<h->list_count; list++){
4528             for(i=0; i<4; i++){
4529                 if(IS_DIRECT(h->sub_mb_type[i])) {
4530                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4531                     continue;
4532                 }
4533                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4534                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4535
4536                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4537                     const int sub_mb_type= h->sub_mb_type[i];
4538                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4539                     for(j=0; j<sub_partition_count[i]; j++){
4540                         int mx, my;
4541                         const int index= 4*i + block_width*j;
4542                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4543                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4544                         mx += get_se_golomb(&s->gb);
4545                         my += get_se_golomb(&s->gb);
4546                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4547
4548                         if(IS_SUB_8X8(sub_mb_type)){
4549                             mv_cache[ 1 ][0]=
4550                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4551                             mv_cache[ 1 ][1]=
4552                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4553                         }else if(IS_SUB_8X4(sub_mb_type)){
4554                             mv_cache[ 1 ][0]= mx;
4555                             mv_cache[ 1 ][1]= my;
4556                         }else if(IS_SUB_4X8(sub_mb_type)){
4557                             mv_cache[ 8 ][0]= mx;
4558                             mv_cache[ 8 ][1]= my;
4559                         }
4560                         mv_cache[ 0 ][0]= mx;
4561                         mv_cache[ 0 ][1]= my;
4562                     }
4563                 }else{
4564                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4565                     p[0] = p[1]=
4566                     p[8] = p[9]= 0;
4567                 }
4568             }
4569         }
4570     }else if(IS_DIRECT(mb_type)){
4571         pred_direct_motion(h, &mb_type);
4572         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4573     }else{
4574         int list, mx, my, i;
4575          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4576         if(IS_16X16(mb_type)){
4577             for(list=0; list<h->list_count; list++){
4578                     unsigned int val;
4579                     if(IS_DIR(mb_type, 0, list)){
4580                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4581                         if(val >= h->ref_count[list]){
4582                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4583                             return -1;
4584                         }
4585                     }else
4586                         val= LIST_NOT_USED&0xFF;
4587                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4588             }
4589             for(list=0; list<h->list_count; list++){
4590                 unsigned int val;
4591                 if(IS_DIR(mb_type, 0, list)){
4592                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4593                     mx += get_se_golomb(&s->gb);
4594                     my += get_se_golomb(&s->gb);
4595                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4596
4597                     val= pack16to32(mx,my);
4598                 }else
4599                     val=0;
4600                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4601             }
4602         }
4603         else if(IS_16X8(mb_type)){
4604             for(list=0; list<h->list_count; list++){
4605                     for(i=0; i<2; i++){
4606                         unsigned int val;
4607                         if(IS_DIR(mb_type, i, list)){
4608                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4609                             if(val >= h->ref_count[list]){
4610                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4611                                 return -1;
4612                             }
4613                         }else
4614                             val= LIST_NOT_USED&0xFF;
4615                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4616                     }
4617             }
4618             for(list=0; list<h->list_count; list++){
4619                 for(i=0; i<2; i++){
4620                     unsigned int val;
4621                     if(IS_DIR(mb_type, i, list)){
4622                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4623                         mx += get_se_golomb(&s->gb);
4624                         my += get_se_golomb(&s->gb);
4625                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4626
4627                         val= pack16to32(mx,my);
4628                     }else
4629                         val=0;
4630                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4631                 }
4632             }
4633         }else{
4634             assert(IS_8X16(mb_type));
4635             for(list=0; list<h->list_count; list++){
4636                     for(i=0; i<2; i++){
4637                         unsigned int val;
4638                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4639                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4640                             if(val >= h->ref_count[list]){
4641                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4642                                 return -1;
4643                             }
4644                         }else
4645                             val= LIST_NOT_USED&0xFF;
4646                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4647                     }
4648             }
4649             for(list=0; list<h->list_count; list++){
4650                 for(i=0; i<2; i++){
4651                     unsigned int val;
4652                     if(IS_DIR(mb_type, i, list)){
4653                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4654                         mx += get_se_golomb(&s->gb);
4655                         my += get_se_golomb(&s->gb);
4656                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4657
4658                         val= pack16to32(mx,my);
4659                     }else
4660                         val=0;
4661                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4662                 }
4663             }
4664         }
4665     }
4666
4667     if(IS_INTER(mb_type))
4668         write_back_motion(h, mb_type);
4669
4670     if(!IS_INTRA16x16(mb_type)){
4671         cbp= get_ue_golomb(&s->gb);
4672         if(cbp > 47){
4673             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4674             return -1;
4675         }
4676
4677         if(CHROMA){
4678             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4679             else                     cbp= golomb_to_inter_cbp   [cbp];
4680         }else{
4681             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4682             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4683         }
4684     }
4685     h->cbp = cbp;
4686
4687     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4688         if(get_bits1(&s->gb)){
4689             mb_type |= MB_TYPE_8x8DCT;
4690             h->cbp_table[mb_xy]= cbp;
4691         }
4692     }
4693     s->current_picture.mb_type[mb_xy]= mb_type;
4694
4695     if(cbp || IS_INTRA16x16(mb_type)){
4696         int i8x8, i4x4, chroma_idx;
4697         int dquant;
4698         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4699         const uint8_t *scan, *scan8x8, *dc_scan;
4700
4701 //        fill_non_zero_count_cache(h);
4702
4703         if(IS_INTERLACED(mb_type)){
4704             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4705             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4706             dc_scan= luma_dc_field_scan;
4707         }else{
4708             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4709             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4710             dc_scan= luma_dc_zigzag_scan;
4711         }
4712
4713         dquant= get_se_golomb(&s->gb);
4714
4715         if( dquant > 25 || dquant < -26 ){
4716             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4717             return -1;
4718         }
4719
4720         s->qscale += dquant;
4721         if(((unsigned)s->qscale) > 51){
4722             if(s->qscale<0) s->qscale+= 52;
4723             else            s->qscale-= 52;
4724         }
4725
4726         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4727         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4728         if(IS_INTRA16x16(mb_type)){
4729             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4730                 return -1; //FIXME continue if partitioned and other return -1 too
4731             }
4732
4733             assert((cbp&15) == 0 || (cbp&15) == 15);
4734
4735             if(cbp&15){
4736                 for(i8x8=0; i8x8<4; i8x8++){
4737                     for(i4x4=0; i4x4<4; i4x4++){
4738                         const int index= i4x4 + 4*i8x8;
4739                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4740                             return -1;
4741                         }
4742                     }
4743                 }
4744             }else{
4745                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4746             }
4747         }else{
4748             for(i8x8=0; i8x8<4; i8x8++){
4749                 if(cbp & (1<<i8x8)){
4750                     if(IS_8x8DCT(mb_type)){
4751                         DCTELEM *buf = &h->mb[64*i8x8];
4752                         uint8_t *nnz;
4753                         for(i4x4=0; i4x4<4; i4x4++){
4754                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4755                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4756                                 return -1;
4757                         }
4758                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4759                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4760                     }else{
4761                         for(i4x4=0; i4x4<4; i4x4++){
4762                             const int index= i4x4 + 4*i8x8;
4763
4764                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4765                                 return -1;
4766                             }
4767                         }
4768                     }
4769                 }else{
4770                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4771                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4772                 }
4773             }
4774         }
4775
4776         if(cbp&0x30){
4777             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4778                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4779                     return -1;
4780                 }
4781         }
4782
4783         if(cbp&0x20){
4784             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4785                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4786                 for(i4x4=0; i4x4<4; i4x4++){
4787                     const int index= 16 + 4*chroma_idx + i4x4;
4788                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4789                         return -1;
4790                     }
4791                 }
4792             }
4793         }else{
4794             uint8_t * const nnz= &h->non_zero_count_cache[0];
4795             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4796             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4797         }
4798     }else{
4799         uint8_t * const nnz= &h->non_zero_count_cache[0];
4800         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4801         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4802         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4803     }
4804     s->current_picture.qscale_table[mb_xy]= s->qscale;
4805     write_back_non_zero_count(h);
4806
4807     if(MB_MBAFF){
4808         h->ref_count[0] >>= 1;
4809         h->ref_count[1] >>= 1;
4810     }
4811
4812     return 0;
4813 }
4814
4815 static int decode_cabac_field_decoding_flag(H264Context *h) {
4816     MpegEncContext * const s = &h->s;
4817     const int mb_x = s->mb_x;
4818     const int mb_y = s->mb_y & ~1;
4819     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4820     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4821
4822     unsigned int ctx = 0;
4823
4824     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4825         ctx += 1;
4826     }
4827     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4828         ctx += 1;
4829     }
4830
4831     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4832 }
4833
4834 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4835     uint8_t *state= &h->cabac_state[ctx_base];
4836     int mb_type;
4837
4838     if(intra_slice){
4839         MpegEncContext * const s = &h->s;
4840         const int mba_xy = h->left_mb_xy[0];
4841         const int mbb_xy = h->top_mb_xy;
4842         int ctx=0;
4843         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4844             ctx++;
4845         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4846             ctx++;
4847         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4848             return 0;   /* I4x4 */
4849         state += 2;
4850     }else{
4851         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4852             return 0;   /* I4x4 */
4853     }
4854
4855     if( get_cabac_terminate( &h->cabac ) )
4856         return 25;  /* PCM */
4857
4858     mb_type = 1; /* I16x16 */
4859     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4860     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4861         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4862     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4863     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4864     return mb_type;
4865 }
4866
4867 static int decode_cabac_mb_type( H264Context *h ) {
4868     MpegEncContext * const s = &h->s;
4869
4870     if( h->slice_type_nos == FF_I_TYPE ) {
4871         return decode_cabac_intra_mb_type(h, 3, 1);
4872     } else if( h->slice_type_nos == FF_P_TYPE ) {
4873         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4874             /* P-type */
4875             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4876                 /* P_L0_D16x16, P_8x8 */
4877                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4878             } else {
4879                 /* P_L0_D8x16, P_L0_D16x8 */
4880                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4881             }
4882         } else {
4883             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4884         }
4885     } else if( h->slice_type_nos == FF_B_TYPE ) {
4886         const int mba_xy = h->left_mb_xy[0];
4887         const int mbb_xy = h->top_mb_xy;
4888         int ctx = 0;
4889         int bits;
4890
4891         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4892             ctx++;
4893         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4894             ctx++;
4895
4896         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4897             return 0; /* B_Direct_16x16 */
4898
4899         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4900             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4901         }
4902
4903         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4904         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4905         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4906         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4907         if( bits < 8 )
4908             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4909         else if( bits == 13 ) {
4910             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4911         } else if( bits == 14 )
4912             return 11; /* B_L1_L0_8x16 */
4913         else if( bits == 15 )
4914             return 22; /* B_8x8 */
4915
4916         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4917         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4918     } else {
4919         /* TODO SI/SP frames? */
4920         return -1;
4921     }
4922 }
4923
4924 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4925     MpegEncContext * const s = &h->s;
4926     int mba_xy, mbb_xy;
4927     int ctx = 0;
4928
4929     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4930         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4931         mba_xy = mb_xy - 1;
4932         if( (mb_y&1)
4933             && h->slice_table[mba_xy] == h->slice_num
4934             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4935             mba_xy += s->mb_stride;
4936         if( MB_FIELD ){
4937             mbb_xy = mb_xy - s->mb_stride;
4938             if( !(mb_y&1)
4939                 && h->slice_table[mbb_xy] == h->slice_num
4940                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4941                 mbb_xy -= s->mb_stride;
4942         }else
4943             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4944     }else{
4945         int mb_xy = h->mb_xy;
4946         mba_xy = mb_xy - 1;
4947         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4948     }
4949
4950     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4951         ctx++;
4952     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4953         ctx++;
4954
4955     if( h->slice_type_nos == FF_B_TYPE )
4956         ctx += 13;
4957     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4958 }
4959
4960 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4961     int mode = 0;
4962
4963     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4964         return pred_mode;
4965
4966     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4967     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4968     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4969
4970     if( mode >= pred_mode )
4971         return mode + 1;
4972     else
4973         return mode;
4974 }
4975
4976 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4977     const int mba_xy = h->left_mb_xy[0];
4978     const int mbb_xy = h->top_mb_xy;
4979
4980     int ctx = 0;
4981
4982     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4983     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4984         ctx++;
4985
4986     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4987         ctx++;
4988
4989     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4990         return 0;
4991
4992     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4993         return 1;
4994     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4995         return 2;
4996     else
4997         return 3;
4998 }
4999
5000 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5001     int cbp_b, cbp_a, ctx, cbp = 0;
5002
5003     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5004     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5005
5006     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5007     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5008     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5009     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5010     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5011     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5012     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5013     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5014     return cbp;
5015 }
5016 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5017     int ctx;
5018     int cbp_a, cbp_b;
5019
5020     cbp_a = (h->left_cbp>>4)&0x03;
5021     cbp_b = (h-> top_cbp>>4)&0x03;
5022
5023     ctx = 0;
5024     if( cbp_a > 0 ) ctx++;
5025     if( cbp_b > 0 ) ctx += 2;
5026     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5027         return 0;
5028
5029     ctx = 4;
5030     if( cbp_a == 2 ) ctx++;
5031     if( cbp_b == 2 ) ctx += 2;
5032     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5033 }
5034 static int decode_cabac_mb_dqp( H264Context *h) {
5035     int   ctx = 0;
5036     int   val = 0;
5037
5038     if( h->last_qscale_diff != 0 )
5039         ctx++;
5040
5041     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5042         if( ctx < 2 )
5043             ctx = 2;
5044         else
5045             ctx = 3;
5046         val++;
5047         if(val > 102) //prevent infinite loop
5048             return INT_MIN;
5049     }
5050
5051     if( val&0x01 )
5052         return (val + 1)/2;
5053     else
5054         return -(val + 1)/2;
5055 }
5056 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5057     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5058         return 0;   /* 8x8 */
5059     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5060         return 1;   /* 8x4 */
5061     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5062         return 2;   /* 4x8 */
5063     return 3;       /* 4x4 */
5064 }
5065 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5066     int type;
5067     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5068         return 0;   /* B_Direct_8x8 */
5069     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5070         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5071     type = 3;
5072     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5073         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5074             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5075         type += 4;
5076     }
5077     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5078     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5079     return type;
5080 }
5081
5082 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5083     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5084 }
5085
5086 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5087     int refa = h->ref_cache[list][scan8[n] - 1];
5088     int refb = h->ref_cache[list][scan8[n] - 8];
5089     int ref  = 0;
5090     int ctx  = 0;
5091
5092     if( h->slice_type_nos == FF_B_TYPE) {
5093         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5094             ctx++;
5095         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5096             ctx += 2;
5097     } else {
5098         if( refa > 0 )
5099             ctx++;
5100         if( refb > 0 )
5101             ctx += 2;
5102     }
5103
5104     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5105         ref++;
5106         if( ctx < 4 )
5107             ctx = 4;
5108         else
5109             ctx = 5;
5110         if(ref >= 32 /*h->ref_list[list]*/){
5111             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5112             return 0; //FIXME we should return -1 and check the return everywhere
5113         }
5114     }
5115     return ref;
5116 }
5117
5118 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5119     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5120                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5121     int ctxbase = (l == 0) ? 40 : 47;
5122     int ctx, mvd;
5123
5124     if( amvd < 3 )
5125         ctx = 0;
5126     else if( amvd > 32 )
5127         ctx = 2;
5128     else
5129         ctx = 1;
5130
5131     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5132         return 0;
5133
5134     mvd= 1;
5135     ctx= 3;
5136     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5137         mvd++;
5138         if( ctx < 6 )
5139             ctx++;
5140     }
5141
5142     if( mvd >= 9 ) {
5143         int k = 3;
5144         while( get_cabac_bypass( &h->cabac ) ) {
5145             mvd += 1 << k;
5146             k++;
5147             if(k>24){
5148                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5149                 return INT_MIN;
5150             }
5151         }
5152         while( k-- ) {
5153             if( get_cabac_bypass( &h->cabac ) )
5154                 mvd += 1 << k;
5155         }
5156     }
5157     return get_cabac_bypass_sign( &h->cabac, -mvd );
5158 }
5159
5160 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5161     int nza, nzb;
5162     int ctx = 0;
5163
5164     if( is_dc ) {
5165         if( cat == 0 ) {
5166             nza = h->left_cbp&0x100;
5167             nzb = h-> top_cbp&0x100;
5168         } else {
5169             nza = (h->left_cbp>>(6+idx))&0x01;
5170             nzb = (h-> top_cbp>>(6+idx))&0x01;
5171         }
5172     } else {
5173         if( cat == 4 ) {
5174             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5175             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5176         } else {
5177             assert(cat == 1 || cat == 2);
5178             nza = h->non_zero_count_cache[scan8[idx] - 1];
5179             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5180         }
5181     }
5182
5183     if( nza > 0 )
5184         ctx++;
5185
5186     if( nzb > 0 )
5187         ctx += 2;
5188
5189     return ctx + 4 * cat;
5190 }
5191
5192 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5193     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5194     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5195     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5196     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5197 };
5198
5199 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5200     static const int significant_coeff_flag_offset[2][6] = {
5201       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5202       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5203     };
5204     static const int last_coeff_flag_offset[2][6] = {
5205       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5206       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5207     };
5208     static const int coeff_abs_level_m1_offset[6] = {
5209         227+0, 227+10, 227+20, 227+30, 227+39, 426
5210     };
5211     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5212       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5213         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5214         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5215        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5216       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5217         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5218         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5219         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5220     };
5221     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5222      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5223      * map node ctx => cabac ctx for level=1 */
5224     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5225     /* map node ctx => cabac ctx for level>1 */
5226     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5227     static const uint8_t coeff_abs_level_transition[2][8] = {
5228     /* update node ctx after decoding a level=1 */
5229         { 1, 2, 3, 3, 4, 5, 6, 7 },
5230     /* update node ctx after decoding a level>1 */
5231         { 4, 4, 4, 4, 5, 6, 7, 7 }
5232     };
5233
5234     int index[64];
5235
5236     int av_unused last;
5237     int coeff_count = 0;
5238     int node_ctx = 0;
5239
5240     uint8_t *significant_coeff_ctx_base;
5241     uint8_t *last_coeff_ctx_base;
5242     uint8_t *abs_level_m1_ctx_base;
5243
5244 #ifndef ARCH_X86
5245 #define CABAC_ON_STACK
5246 #endif
5247 #ifdef CABAC_ON_STACK
5248 #define CC &cc
5249     CABACContext cc;
5250     cc.range     = h->cabac.range;
5251     cc.low       = h->cabac.low;
5252     cc.bytestream= h->cabac.bytestream;
5253 #else
5254 #define CC &h->cabac
5255 #endif
5256
5257
5258     /* cat: 0-> DC 16x16  n = 0
5259      *      1-> AC 16x16  n = luma4x4idx
5260      *      2-> Luma4x4   n = luma4x4idx
5261      *      3-> DC Chroma n = iCbCr
5262      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5263      *      5-> Luma8x8   n = 4 * luma8x8idx
5264      */
5265
5266     /* read coded block flag */
5267     if( is_dc || cat != 5 ) {
5268         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5269             if( !is_dc ) {
5270                 if( cat == 4 )
5271                     h->non_zero_count_cache[scan8[16+n]] = 0;
5272                 else
5273                     h->non_zero_count_cache[scan8[n]] = 0;
5274             }
5275
5276 #ifdef CABAC_ON_STACK
5277             h->cabac.range     = cc.range     ;
5278             h->cabac.low       = cc.low       ;
5279             h->cabac.bytestream= cc.bytestream;
5280 #endif
5281             return;
5282         }
5283     }
5284
5285     significant_coeff_ctx_base = h->cabac_state
5286         + significant_coeff_flag_offset[MB_FIELD][cat];
5287     last_coeff_ctx_base = h->cabac_state
5288         + last_coeff_flag_offset[MB_FIELD][cat];
5289     abs_level_m1_ctx_base = h->cabac_state
5290         + coeff_abs_level_m1_offset[cat];
5291
5292     if( !is_dc && cat == 5 ) {
5293 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5294         for(last= 0; last < coefs; last++) { \
5295             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5296             if( get_cabac( CC, sig_ctx )) { \
5297                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5298                 index[coeff_count++] = last; \
5299                 if( get_cabac( CC, last_ctx ) ) { \
5300                     last= max_coeff; \
5301                     break; \
5302                 } \
5303             } \
5304         }\
5305         if( last == max_coeff -1 ) {\
5306             index[coeff_count++] = last;\
5307         }
5308         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5309 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5310         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5311     } else {
5312         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5313 #else
5314         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5315     } else {
5316         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5317 #endif
5318     }
5319     assert(coeff_count > 0);
5320
5321     if( is_dc ) {
5322         if( cat == 0 )
5323             h->cbp_table[h->mb_xy] |= 0x100;
5324         else
5325             h->cbp_table[h->mb_xy] |= 0x40 << n;
5326     } else {
5327         if( cat == 5 )
5328             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5329         else if( cat == 4 )
5330             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5331         else {
5332             assert( cat == 1 || cat == 2 );
5333             h->non_zero_count_cache[scan8[n]] = coeff_count;
5334         }
5335     }
5336
5337     do {
5338         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5339
5340         int j= scantable[index[--coeff_count]];
5341
5342         if( get_cabac( CC, ctx ) == 0 ) {
5343             node_ctx = coeff_abs_level_transition[0][node_ctx];
5344             if( is_dc ) {
5345                 block[j] = get_cabac_bypass_sign( CC, -1);
5346             }else{
5347                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5348             }
5349         } else {
5350             int coeff_abs = 2;
5351             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5352             node_ctx = coeff_abs_level_transition[1][node_ctx];
5353
5354             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5355                 coeff_abs++;
5356             }
5357
5358             if( coeff_abs >= 15 ) {
5359                 int j = 0;
5360                 while( get_cabac_bypass( CC ) ) {
5361                     j++;
5362                 }
5363
5364                 coeff_abs=1;
5365                 while( j-- ) {
5366                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5367                 }
5368                 coeff_abs+= 14;
5369             }
5370
5371             if( is_dc ) {
5372                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5373             }else{
5374                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5375             }
5376         }
5377     } while( coeff_count );
5378 #ifdef CABAC_ON_STACK
5379             h->cabac.range     = cc.range     ;
5380             h->cabac.low       = cc.low       ;
5381             h->cabac.bytestream= cc.bytestream;
5382 #endif
5383
5384 }
5385
5386 #ifndef CONFIG_SMALL
5387 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5388     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5389 }
5390
5391 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5392     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5393 }
5394 #endif
5395
5396 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5397 #ifdef CONFIG_SMALL
5398     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5399 #else
5400     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5401     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5402 #endif
5403 }
5404
5405 static inline void compute_mb_neighbors(H264Context *h)
5406 {
5407     MpegEncContext * const s = &h->s;
5408     const int mb_xy  = h->mb_xy;
5409     h->top_mb_xy     = mb_xy - s->mb_stride;
5410     h->left_mb_xy[0] = mb_xy - 1;
5411     if(FRAME_MBAFF){
5412         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5413         const int top_pair_xy      = pair_xy     - s->mb_stride;
5414         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5415         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5416         const int curr_mb_frame_flag = !MB_FIELD;
5417         const int bottom = (s->mb_y & 1);
5418         if (bottom
5419                 ? !curr_mb_frame_flag // bottom macroblock
5420                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5421                 ) {
5422             h->top_mb_xy -= s->mb_stride;
5423         }
5424         if (left_mb_frame_flag != curr_mb_frame_flag) {
5425             h->left_mb_xy[0] = pair_xy - 1;
5426         }
5427     } else if (FIELD_PICTURE) {
5428         h->top_mb_xy -= s->mb_stride;
5429     }
5430     return;
5431 }
5432
5433 /**
5434  * decodes a macroblock
5435  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5436  */
5437 static int decode_mb_cabac(H264Context *h) {
5438     MpegEncContext * const s = &h->s;
5439     int mb_xy;
5440     int mb_type, partition_count, cbp = 0;
5441     int dct8x8_allowed= h->pps.transform_8x8_mode;
5442
5443     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5444
5445     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5446
5447     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5448     if( h->slice_type_nos != FF_I_TYPE ) {
5449         int skip;
5450         /* a skipped mb needs the aff flag from the following mb */
5451         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5452             predict_field_decoding_flag(h);
5453         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5454             skip = h->next_mb_skipped;
5455         else
5456             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5457         /* read skip flags */
5458         if( skip ) {
5459             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5460                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5461                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5462                 if(h->next_mb_skipped)
5463                     predict_field_decoding_flag(h);
5464                 else
5465                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5466             }
5467
5468             decode_mb_skip(h);
5469
5470             h->cbp_table[mb_xy] = 0;
5471             h->chroma_pred_mode_table[mb_xy] = 0;
5472             h->last_qscale_diff = 0;
5473
5474             return 0;
5475
5476         }
5477     }
5478     if(FRAME_MBAFF){
5479         if( (s->mb_y&1) == 0 )
5480             h->mb_mbaff =
5481             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5482     }
5483
5484     h->prev_mb_skipped = 0;
5485
5486     compute_mb_neighbors(h);
5487     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5488         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5489         return -1;
5490     }
5491
5492     if( h->slice_type_nos == FF_B_TYPE ) {
5493         if( mb_type < 23 ){
5494             partition_count= b_mb_type_info[mb_type].partition_count;
5495             mb_type=         b_mb_type_info[mb_type].type;
5496         }else{
5497             mb_type -= 23;
5498             goto decode_intra_mb;
5499         }
5500     } else if( h->slice_type_nos == FF_P_TYPE ) {
5501         if( mb_type < 5) {
5502             partition_count= p_mb_type_info[mb_type].partition_count;
5503             mb_type=         p_mb_type_info[mb_type].type;
5504         } else {
5505             mb_type -= 5;
5506             goto decode_intra_mb;
5507         }
5508     } else {
5509         if(h->slice_type == FF_SI_TYPE && mb_type)
5510             mb_type--;
5511         assert(h->slice_type_nos == FF_I_TYPE);
5512 decode_intra_mb:
5513         partition_count = 0;
5514         cbp= i_mb_type_info[mb_type].cbp;
5515         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5516         mb_type= i_mb_type_info[mb_type].type;
5517     }
5518     if(MB_FIELD)
5519         mb_type |= MB_TYPE_INTERLACED;
5520
5521     h->slice_table[ mb_xy ]= h->slice_num;
5522
5523     if(IS_INTRA_PCM(mb_type)) {
5524         const uint8_t *ptr;
5525
5526         // We assume these blocks are very rare so we do not optimize it.
5527         // FIXME The two following lines get the bitstream position in the cabac
5528         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5529         ptr= h->cabac.bytestream;
5530         if(h->cabac.low&0x1) ptr--;
5531         if(CABAC_BITS==16){
5532             if(h->cabac.low&0x1FF) ptr--;
5533         }
5534
5535         // The pixels are stored in the same order as levels in h->mb array.
5536         memcpy(h->mb, ptr, 256); ptr+=256;
5537         if(CHROMA){
5538             memcpy(h->mb+128, ptr, 128); ptr+=128;
5539         }
5540
5541         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5542
5543         // All blocks are present
5544         h->cbp_table[mb_xy] = 0x1ef;
5545         h->chroma_pred_mode_table[mb_xy] = 0;
5546         // In deblocking, the quantizer is 0
5547         s->current_picture.qscale_table[mb_xy]= 0;
5548         // All coeffs are present
5549         memset(h->non_zero_count[mb_xy], 16, 16);
5550         s->current_picture.mb_type[mb_xy]= mb_type;
5551         h->last_qscale_diff = 0;
5552         return 0;
5553     }
5554
5555     if(MB_MBAFF){
5556         h->ref_count[0] <<= 1;
5557         h->ref_count[1] <<= 1;
5558     }
5559
5560     fill_caches(h, mb_type, 0);
5561
5562     if( IS_INTRA( mb_type ) ) {
5563         int i, pred_mode;
5564         if( IS_INTRA4x4( mb_type ) ) {
5565             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5566                 mb_type |= MB_TYPE_8x8DCT;
5567                 for( i = 0; i < 16; i+=4 ) {
5568                     int pred = pred_intra_mode( h, i );
5569                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5570                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5571                 }
5572             } else {
5573                 for( i = 0; i < 16; i++ ) {
5574                     int pred = pred_intra_mode( h, i );
5575                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5576
5577                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5578                 }
5579             }
5580             write_back_intra_pred_mode(h);
5581             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5582         } else {
5583             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5584             if( h->intra16x16_pred_mode < 0 ) return -1;
5585         }
5586         if(CHROMA){
5587             h->chroma_pred_mode_table[mb_xy] =
5588             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5589
5590             pred_mode= check_intra_pred_mode( h, pred_mode );
5591             if( pred_mode < 0 ) return -1;
5592             h->chroma_pred_mode= pred_mode;
5593         }
5594     } else if( partition_count == 4 ) {
5595         int i, j, sub_partition_count[4], list, ref[2][4];
5596
5597         if( h->slice_type_nos == FF_B_TYPE ) {
5598             for( i = 0; i < 4; i++ ) {
5599                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5600                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5601                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5602             }
5603             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5604                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5605                 pred_direct_motion(h, &mb_type);
5606                 h->ref_cache[0][scan8[4]] =
5607                 h->ref_cache[1][scan8[4]] =
5608                 h->ref_cache[0][scan8[12]] =
5609                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5610                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5611                     for( i = 0; i < 4; i++ )
5612                         if( IS_DIRECT(h->sub_mb_type[i]) )
5613                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5614                 }
5615             }
5616         } else {
5617             for( i = 0; i < 4; i++ ) {
5618                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5619                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5620                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5621             }
5622         }
5623
5624         for( list = 0; list < h->list_count; list++ ) {
5625                 for( i = 0; i < 4; i++ ) {
5626                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5627                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5628                         if( h->ref_count[list] > 1 )
5629                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5630                         else
5631                             ref[list][i] = 0;
5632                     } else {
5633                         ref[list][i] = -1;
5634                     }
5635                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5636                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5637                 }
5638         }
5639
5640         if(dct8x8_allowed)
5641             dct8x8_allowed = get_dct8x8_allowed(h);
5642
5643         for(list=0; list<h->list_count; list++){
5644             for(i=0; i<4; i++){
5645                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5646                 if(IS_DIRECT(h->sub_mb_type[i])){
5647                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5648                     continue;
5649                 }
5650
5651                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5652                     const int sub_mb_type= h->sub_mb_type[i];
5653                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5654                     for(j=0; j<sub_partition_count[i]; j++){
5655                         int mpx, mpy;
5656                         int mx, my;
5657                         const int index= 4*i + block_width*j;
5658                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5659                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5660                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5661
5662                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5663                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5664                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5665
5666                         if(IS_SUB_8X8(sub_mb_type)){
5667                             mv_cache[ 1 ][0]=
5668                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5669                             mv_cache[ 1 ][1]=
5670                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5671
5672                             mvd_cache[ 1 ][0]=
5673                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5674                             mvd_cache[ 1 ][1]=
5675                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5676                         }else if(IS_SUB_8X4(sub_mb_type)){
5677                             mv_cache[ 1 ][0]= mx;
5678                             mv_cache[ 1 ][1]= my;
5679
5680                             mvd_cache[ 1 ][0]= mx - mpx;
5681                             mvd_cache[ 1 ][1]= my - mpy;
5682                         }else if(IS_SUB_4X8(sub_mb_type)){
5683                             mv_cache[ 8 ][0]= mx;
5684                             mv_cache[ 8 ][1]= my;
5685
5686                             mvd_cache[ 8 ][0]= mx - mpx;
5687                             mvd_cache[ 8 ][1]= my - mpy;
5688                         }
5689                         mv_cache[ 0 ][0]= mx;
5690                         mv_cache[ 0 ][1]= my;
5691
5692                         mvd_cache[ 0 ][0]= mx - mpx;
5693                         mvd_cache[ 0 ][1]= my - mpy;
5694                     }
5695                 }else{
5696                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5697                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5698                     p[0] = p[1] = p[8] = p[9] = 0;
5699                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5700                 }
5701             }
5702         }
5703     } else if( IS_DIRECT(mb_type) ) {
5704         pred_direct_motion(h, &mb_type);
5705         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5706         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5707         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5708     } else {
5709         int list, mx, my, i, mpx, mpy;
5710         if(IS_16X16(mb_type)){
5711             for(list=0; list<h->list_count; list++){
5712                 if(IS_DIR(mb_type, 0, list)){
5713                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5714                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5715                 }else
5716                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5717             }
5718             for(list=0; list<h->list_count; list++){
5719                 if(IS_DIR(mb_type, 0, list)){
5720                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5721
5722                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5723                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5724                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5725
5726                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5727                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5728                 }else
5729                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5730             }
5731         }
5732         else if(IS_16X8(mb_type)){
5733             for(list=0; list<h->list_count; list++){
5734                     for(i=0; i<2; i++){
5735                         if(IS_DIR(mb_type, i, list)){
5736                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5737                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5738                         }else
5739                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5740                     }
5741             }
5742             for(list=0; list<h->list_count; list++){
5743                 for(i=0; i<2; i++){
5744                     if(IS_DIR(mb_type, i, list)){
5745                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5746                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5747                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5748                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5749
5750                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5751                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5752                     }else{
5753                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5754                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5755                     }
5756                 }
5757             }
5758         }else{
5759             assert(IS_8X16(mb_type));
5760             for(list=0; list<h->list_count; list++){
5761                     for(i=0; i<2; i++){
5762                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5763                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5764                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5765                         }else
5766                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5767                     }
5768             }
5769             for(list=0; list<h->list_count; list++){
5770                 for(i=0; i<2; i++){
5771                     if(IS_DIR(mb_type, i, list)){
5772                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5773                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5774                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5775
5776                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5777                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5778                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5779                     }else{
5780                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5781                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5782                     }
5783                 }
5784             }
5785         }
5786     }
5787
5788    if( IS_INTER( mb_type ) ) {
5789         h->chroma_pred_mode_table[mb_xy] = 0;
5790         write_back_motion( h, mb_type );
5791    }
5792
5793     if( !IS_INTRA16x16( mb_type ) ) {
5794         cbp  = decode_cabac_mb_cbp_luma( h );
5795         if(CHROMA)
5796             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5797     }
5798
5799     h->cbp_table[mb_xy] = h->cbp = cbp;
5800
5801     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5802         if( decode_cabac_mb_transform_size( h ) )
5803             mb_type |= MB_TYPE_8x8DCT;
5804     }
5805     s->current_picture.mb_type[mb_xy]= mb_type;
5806
5807     if( cbp || IS_INTRA16x16( mb_type ) ) {
5808         const uint8_t *scan, *scan8x8, *dc_scan;
5809         const uint32_t *qmul;
5810         int dqp;
5811
5812         if(IS_INTERLACED(mb_type)){
5813             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5814             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5815             dc_scan= luma_dc_field_scan;
5816         }else{
5817             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5818             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5819             dc_scan= luma_dc_zigzag_scan;
5820         }
5821
5822         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5823         if( dqp == INT_MIN ){
5824             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5825             return -1;
5826         }
5827         s->qscale += dqp;
5828         if(((unsigned)s->qscale) > 51){
5829             if(s->qscale<0) s->qscale+= 52;
5830             else            s->qscale-= 52;
5831         }
5832         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5833         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5834
5835         if( IS_INTRA16x16( mb_type ) ) {
5836             int i;
5837             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5838             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5839
5840             if( cbp&15 ) {
5841                 qmul = h->dequant4_coeff[0][s->qscale];
5842                 for( i = 0; i < 16; i++ ) {
5843                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5844                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5845                 }
5846             } else {
5847                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5848             }
5849         } else {
5850             int i8x8, i4x4;
5851             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5852                 if( cbp & (1<<i8x8) ) {
5853                     if( IS_8x8DCT(mb_type) ) {
5854                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5855                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5856                     } else {
5857                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5858                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5859                             const int index = 4*i8x8 + i4x4;
5860                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5861 //START_TIMER
5862                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5863 //STOP_TIMER("decode_residual")
5864                         }
5865                     }
5866                 } else {
5867                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5868                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5869                 }
5870             }
5871         }
5872
5873         if( cbp&0x30 ){
5874             int c;
5875             for( c = 0; c < 2; c++ ) {
5876                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5877                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5878             }
5879         }
5880
5881         if( cbp&0x20 ) {
5882             int c, i;
5883             for( c = 0; c < 2; c++ ) {
5884                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5885                 for( i = 0; i < 4; i++ ) {
5886                     const int index = 16 + 4 * c + i;
5887                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5888                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5889                 }
5890             }
5891         } else {
5892             uint8_t * const nnz= &h->non_zero_count_cache[0];
5893             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5894             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5895         }
5896     } else {
5897         uint8_t * const nnz= &h->non_zero_count_cache[0];
5898         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5899         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5900         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5901         h->last_qscale_diff = 0;
5902     }
5903
5904     s->current_picture.qscale_table[mb_xy]= s->qscale;
5905     write_back_non_zero_count(h);
5906
5907     if(MB_MBAFF){
5908         h->ref_count[0] >>= 1;
5909         h->ref_count[1] >>= 1;
5910     }
5911
5912     return 0;
5913 }
5914
5915
5916 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5917     int i, d;
5918     const int index_a = qp + h->slice_alpha_c0_offset;
5919     const int alpha = (alpha_table+52)[index_a];
5920     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5921
5922     if( bS[0] < 4 ) {
5923         int8_t tc[4];
5924         for(i=0; i<4; i++)
5925             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5926         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5927     } else {
5928         /* 16px edge length, because bS=4 is triggered by being at
5929          * the edge of an intra MB, so all 4 bS are the same */
5930             for( d = 0; d < 16; d++ ) {
5931                 const int p0 = pix[-1];
5932                 const int p1 = pix[-2];
5933                 const int p2 = pix[-3];
5934
5935                 const int q0 = pix[0];
5936                 const int q1 = pix[1];
5937                 const int q2 = pix[2];
5938
5939                 if( FFABS( p0 - q0 ) < alpha &&
5940                     FFABS( p1 - p0 ) < beta &&
5941                     FFABS( q1 - q0 ) < beta ) {
5942
5943                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5944                         if( FFABS( p2 - p0 ) < beta)
5945                         {
5946                             const int p3 = pix[-4];
5947                             /* p0', p1', p2' */
5948                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5949                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5950                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5951                         } else {
5952                             /* p0' */
5953                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5954                         }
5955                         if( FFABS( q2 - q0 ) < beta)
5956                         {
5957                             const int q3 = pix[3];
5958                             /* q0', q1', q2' */
5959                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5960                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5961                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5962                         } else {
5963                             /* q0' */
5964                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5965                         }
5966                     }else{
5967                         /* p0', q0' */
5968                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5969                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5970                     }
5971                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5972                 }
5973                 pix += stride;
5974             }
5975     }
5976 }
5977 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5978     int i;
5979     const int index_a = qp + h->slice_alpha_c0_offset;
5980     const int alpha = (alpha_table+52)[index_a];
5981     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5982
5983     if( bS[0] < 4 ) {
5984         int8_t tc[4];
5985         for(i=0; i<4; i++)
5986             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5987         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5988     } else {
5989         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5990     }
5991 }
5992
5993 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5994     int i;
5995     for( i = 0; i < 16; i++, pix += stride) {
5996         int index_a;
5997         int alpha;
5998         int beta;
5999
6000         int qp_index;
6001         int bS_index = (i >> 1);
6002         if (!MB_FIELD) {
6003             bS_index &= ~1;
6004             bS_index |= (i & 1);
6005         }
6006
6007         if( bS[bS_index] == 0 ) {
6008             continue;
6009         }
6010
6011         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6012         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6013         alpha = (alpha_table+52)[index_a];
6014         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6015
6016         if( bS[bS_index] < 4 ) {
6017             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6018             const int p0 = pix[-1];
6019             const int p1 = pix[-2];
6020             const int p2 = pix[-3];
6021             const int q0 = pix[0];
6022             const int q1 = pix[1];
6023             const int q2 = pix[2];
6024
6025             if( FFABS( p0 - q0 ) < alpha &&
6026                 FFABS( p1 - p0 ) < beta &&
6027                 FFABS( q1 - q0 ) < beta ) {
6028                 int tc = tc0;
6029                 int i_delta;
6030
6031                 if( FFABS( p2 - p0 ) < beta ) {
6032                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6033                     tc++;
6034                 }
6035                 if( FFABS( q2 - q0 ) < beta ) {
6036                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6037                     tc++;
6038                 }
6039
6040                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6041                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6042                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6043                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6044             }
6045         }else{
6046             const int p0 = pix[-1];
6047             const int p1 = pix[-2];
6048             const int p2 = pix[-3];
6049
6050             const int q0 = pix[0];
6051             const int q1 = pix[1];
6052             const int q2 = pix[2];
6053
6054             if( FFABS( p0 - q0 ) < alpha &&
6055                 FFABS( p1 - p0 ) < beta &&
6056                 FFABS( q1 - q0 ) < beta ) {
6057
6058                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6059                     if( FFABS( p2 - p0 ) < beta)
6060                     {
6061                         const int p3 = pix[-4];
6062                         /* p0', p1', p2' */
6063                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6064                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6065                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6066                     } else {
6067                         /* p0' */
6068                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6069                     }
6070                     if( FFABS( q2 - q0 ) < beta)
6071                     {
6072                         const int q3 = pix[3];
6073                         /* q0', q1', q2' */
6074                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6075                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6076                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6077                     } else {
6078                         /* q0' */
6079                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6080                     }
6081                 }else{
6082                     /* p0', q0' */
6083                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6084                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6085                 }
6086                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6087             }
6088         }
6089     }
6090 }
6091 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6092     int i;
6093     for( i = 0; i < 8; i++, pix += stride) {
6094         int index_a;
6095         int alpha;
6096         int beta;
6097
6098         int qp_index;
6099         int bS_index = i;
6100
6101         if( bS[bS_index] == 0 ) {
6102             continue;
6103         }
6104
6105         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6106         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6107         alpha = (alpha_table+52)[index_a];
6108         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6109
6110         if( bS[bS_index] < 4 ) {
6111             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6112             const int p0 = pix[-1];
6113             const int p1 = pix[-2];
6114             const int q0 = pix[0];
6115             const int q1 = pix[1];
6116
6117             if( FFABS( p0 - q0 ) < alpha &&
6118                 FFABS( p1 - p0 ) < beta &&
6119                 FFABS( q1 - q0 ) < beta ) {
6120                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6121
6122                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6123                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6124                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6125             }
6126         }else{
6127             const int p0 = pix[-1];
6128             const int p1 = pix[-2];
6129             const int q0 = pix[0];
6130             const int q1 = pix[1];
6131
6132             if( FFABS( p0 - q0 ) < alpha &&
6133                 FFABS( p1 - p0 ) < beta &&
6134                 FFABS( q1 - q0 ) < beta ) {
6135
6136                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6137                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6138                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6139             }
6140         }
6141     }
6142 }
6143
6144 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6145     int i, d;
6146     const int index_a = qp + h->slice_alpha_c0_offset;
6147     const int alpha = (alpha_table+52)[index_a];
6148     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6149     const int pix_next  = stride;
6150
6151     if( bS[0] < 4 ) {
6152         int8_t tc[4];
6153         for(i=0; i<4; i++)
6154             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6155         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6156     } else {
6157         /* 16px edge length, see filter_mb_edgev */
6158             for( d = 0; d < 16; d++ ) {
6159                 const int p0 = pix[-1*pix_next];
6160                 const int p1 = pix[-2*pix_next];
6161                 const int p2 = pix[-3*pix_next];
6162                 const int q0 = pix[0];
6163                 const int q1 = pix[1*pix_next];
6164                 const int q2 = pix[2*pix_next];
6165
6166                 if( FFABS( p0 - q0 ) < alpha &&
6167                     FFABS( p1 - p0 ) < beta &&
6168                     FFABS( q1 - q0 ) < beta ) {
6169
6170                     const int p3 = pix[-4*pix_next];
6171                     const int q3 = pix[ 3*pix_next];
6172
6173                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6174                         if( FFABS( p2 - p0 ) < beta) {
6175                             /* p0', p1', p2' */
6176                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6177                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6178                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6179                         } else {
6180                             /* p0' */
6181                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6182                         }
6183                         if( FFABS( q2 - q0 ) < beta) {
6184                             /* q0', q1', q2' */
6185                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6186                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6187                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6188                         } else {
6189                             /* q0' */
6190                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6191                         }
6192                     }else{
6193                         /* p0', q0' */
6194                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6195                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6196                     }
6197                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6198                 }
6199                 pix++;
6200             }
6201     }
6202 }
6203
6204 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6205     int i;
6206     const int index_a = qp + h->slice_alpha_c0_offset;
6207     const int alpha = (alpha_table+52)[index_a];
6208     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6209
6210     if( bS[0] < 4 ) {
6211         int8_t tc[4];
6212         for(i=0; i<4; i++)
6213             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6214         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6215     } else {
6216         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6217     }
6218 }
6219
/**
 * Deblocks one macroblock, with a shortcut path for the simple cases.
 *
 * NOTE: the guard condition below contains a literal "1 ||", so it is always
 * true and every call is forwarded to filter_mb(). Everything after the first
 * return is currently unreachable.
 *
 * Outline of the (disabled) shortcut path:
 *  - qp0/qp1 and qpc0/qpc1 are the rounded averages of this macroblock's
 *    luma/chroma qp with the left and top neighbours' qp;
 *  - nothing is filtered if all of those are <= 15 - slice_alpha_c0_offset;
 *  - intra macroblocks use fixed boundary strengths (4 on the left edge,
 *    3 inside, and 3 or 4 on the top edge depending on FIELD_PICTURE);
 *  - other macroblocks get their strengths from the DSP routine
 *    h264_loop_filter_strength, or a constant 2 for 8x8-transform
 *    macroblocks with (cbp & 7) == 7.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma pixels of this macroblock
 * @param img_cb     Cb pixels of this macroblock
 * @param img_cr     Cr pixels of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6220 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6221     MpegEncContext * const s = &h->s;
6222     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6223     int mb_xy, mb_type;
6224     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6225
6226     mb_xy = h->mb_xy;
6227
         /* Fall back to the general filter_mb(). The bare "1 ||" term makes
          * this unconditional; the remaining terms only document which cases
          * the shortcut path was not written for. */
6228     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6229 1 ||
6230        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6231                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6232         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6233         return;
6234     }
6235     assert(!FRAME_MBAFF);
6236
         /* qp: this MB; qp0: left neighbour; qp1: top neighbour.
          * The neighbour values are then replaced by their average with qp. */
6237     mb_type = s->current_picture.mb_type[mb_xy];
6238     qp = s->current_picture.qscale_table[mb_xy];
6239     qp0 = s->current_picture.qscale_table[mb_xy-1];
6240     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6241     qpc = get_chroma_qp( h, 0, qp );
6242     qpc0 = get_chroma_qp( h, 0, qp0 );
6243     qpc1 = get_chroma_qp( h, 0, qp1 );
6244     qp0 = (qp + qp0 + 1) >> 1;
6245     qp1 = (qp + qp1 + 1) >> 1;
6246     qpc0 = (qpc + qpc0 + 1) >> 1;
6247     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* skip the macroblock entirely when every qp involved is at or
          * below the threshold */
6248     qp_thresh = 15 - h->slice_alpha_c0_offset;
6249     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6250        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6251         return;
6252
6253     if( IS_INTRA(mb_type) ) {
             /* intra MB: constant strengths; bSH is used for the top MB edge */
6254         int16_t bS4[4] = {4,4,4,4};
6255         int16_t bS3[4] = {3,3,3,3};
6256         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6257         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 in each direction */
6258             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6259             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6260             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6261             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6262         } else {
6263             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6264             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6265             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6266             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6267             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6268             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6269             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6270             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6271         }
             /* chroma: edges 0 and 2 of both planes, vertical then horizontal */
6272         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6273         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6274         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6275         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6276         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6277         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6278         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6279         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6280         return;
6281     } else {
             /* bS[dir][edge][4]: dir 0 = vertical edges, dir 1 = horizontal.
              * bSv views the four int16_t strengths of one edge as a single
              * uint64_t so an edge can be set or tested for "all zero" at once. */
6282         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6283         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6284         int edges;
6285         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* strength 2 on edges 0 and 2 in both directions */
6286             edges = 4;
6287             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6288         } else {
6289             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6290                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6291             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6292                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6293                              ? 3 : 0;
6294             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* a 16x16 MB without coded luma only needs its outer edges */
6295             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6296             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6297                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6298         }
             /* intra neighbours override the computed strength on the MB edges */
6299         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6300             bSv[0][0] = 0x0004000400040004ULL;
6301         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6302             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6303
             /* FILTER(hv,dir,edge): if any strength of the edge is non-zero,
              * filter the luma edge and, for even edges, the matching edge of
              * both chroma planes. Edge 0 uses the neighbour-averaged qp
              * (qp##dir / qpc##dir expand to qp0/qp1 and qpc0/qpc1), inner
              * edges use qp / qpc. */
6304 #define FILTER(hv,dir,edge)\
6305         if(bSv[dir][edge]) {\
6306             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6307             if(!(edge&1)) {\
6308                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6309                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6310             }\
6311         }
6312         if( edges == 1 ) {
6313             FILTER(v,0,0);
6314             FILTER(h,1,0);
6315         } else if( IS_8x8DCT(mb_type) ) {
6316             FILTER(v,0,0);
6317             FILTER(v,0,2);
6318             FILTER(h,1,0);
6319             FILTER(h,1,2);
6320         } else {
6321             FILTER(v,0,0);
6322             FILTER(v,0,1);
6323             FILTER(v,0,2);
6324             FILTER(v,0,3);
6325             FILTER(h,1,0);
6326             FILTER(h,1,1);
6327             FILTER(h,1,2);
6328             FILTER(h,1,3);
6329         }
6330 #undef FILTER
6331     }
6332 }
6333
6334 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6335     MpegEncContext * const s = &h->s;
6336     const int mb_xy= mb_x + mb_y*s->mb_stride;
6337     const int mb_type = s->current_picture.mb_type[mb_xy];
6338     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6339     int first_vertical_edge_done = 0;
6340     int dir;
6341
6342     //for sufficiently low qp, filtering wouldn't do anything
6343     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6344     if(!FRAME_MBAFF){
6345         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6346         int qp = s->current_picture.qscale_table[mb_xy];
6347         if(qp <= qp_thresh
6348            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6349            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6350             return;
6351         }
6352     }
6353
6354     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6355     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6356         int top_type, left_type[2];
6357         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6358         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6359         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6360
6361         if(IS_8x8DCT(top_type)){
6362             h->non_zero_count_cache[4+8*0]=
6363             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6364             h->non_zero_count_cache[6+8*0]=
6365             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6366         }
6367         if(IS_8x8DCT(left_type[0])){
6368             h->non_zero_count_cache[3+8*1]=
6369             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6370         }
6371         if(IS_8x8DCT(left_type[1])){
6372             h->non_zero_count_cache[3+8*3]=
6373             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6374         }
6375
6376         if(IS_8x8DCT(mb_type)){
6377             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6378             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6379
6380             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6381             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6382
6383             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6384             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6385
6386             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6387             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6388         }
6389     }
6390
6391     if (FRAME_MBAFF
6392             // left mb is in picture
6393             && h->slice_table[mb_xy-1] != 255
6394             // and current and left pair do not have the same interlaced type
6395             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6396             // and left mb is in the same slice if deblocking_filter == 2
6397             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6398         /* First vertical edge is different in MBAFF frames
6399          * There are 8 different bS to compute and 2 different Qp
6400          */
6401         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6402         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6403         int16_t bS[8];
6404         int qp[2];
6405         int bqp[2];
6406         int rqp[2];
6407         int mb_qp, mbn0_qp, mbn1_qp;
6408         int i;
6409         first_vertical_edge_done = 1;
6410
6411         if( IS_INTRA(mb_type) )
6412             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6413         else {
6414             for( i = 0; i < 8; i++ ) {
6415                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6416
6417                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6418                     bS[i] = 4;
6419                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6420                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6421                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6422                     bS[i] = 2;
6423                 else
6424                     bS[i] = 1;
6425             }
6426         }
6427
6428         mb_qp = s->current_picture.qscale_table[mb_xy];
6429         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6430         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6431         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6432         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6433                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6434         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6435                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6436         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6437         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6438                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6439         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6440                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6441
6442         /* Filter edge */
6443         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6444         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6445         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6446         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6447         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6448     }
6449     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6450     for( dir = 0; dir < 2; dir++ )
6451     {
6452         int edge;
6453         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6454         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6455         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6456         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6457         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6458
6459         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6460                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6461         // how often to recheck mv-based bS when iterating between edges
6462         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6463                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6464         // how often to recheck mv-based bS when iterating along each edge
6465         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6466
6467         if (first_vertical_edge_done) {
6468             start = 1;
6469             first_vertical_edge_done = 0;
6470         }
6471
6472         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6473             start = 1;
6474
6475         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6476             && !IS_INTERLACED(mb_type)
6477             && IS_INTERLACED(mbm_type)
6478             ) {
6479             // This is a special case in the norm where the filtering must
6480             // be done twice (one each of the field) even if we are in a
6481             // frame macroblock.
6482             //
6483             static const int nnz_idx[4] = {4,5,6,3};
6484             unsigned int tmp_linesize   = 2 *   linesize;
6485             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6486             int mbn_xy = mb_xy - 2 * s->mb_stride;
6487             int qp;
6488             int i, j;
6489             int16_t bS[4];
6490
6491             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6492                 if( IS_INTRA(mb_type) ||
6493                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6494                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6495                 } else {
6496                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6497                     for( i = 0; i < 4; i++ ) {
6498                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6499                             mbn_nnz[nnz_idx[i]] != 0 )
6500                             bS[i] = 2;
6501                         else
6502                             bS[i] = 1;
6503                     }
6504                 }
6505                 // Do not use s->qscale as luma quantizer because it has not the same
6506                 // value in IPCM macroblocks.
6507                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6508                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6509                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6510                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6511                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6512                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6513                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6514                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6515             }
6516
6517             start = 1;
6518         }
6519
6520         /* Calculate bS */
6521         for( edge = start; edge < edges; edge++ ) {
6522             /* mbn_xy: neighbor macroblock */
6523             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6524             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6525             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6526             int16_t bS[4];
6527             int qp;
6528
6529             if( (edge&1) && IS_8x8DCT(mb_type) )
6530                 continue;
6531
6532             if( IS_INTRA(mb_type) ||
6533                 IS_INTRA(mbn_type) ) {
6534                 int value;
6535                 if (edge == 0) {
6536                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6537                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6538                     ) {
6539                         value = 4;
6540                     } else {
6541                         value = 3;
6542                     }
6543                 } else {
6544                     value = 3;
6545                 }
6546                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6547             } else {
6548                 int i, l;
6549                 int mv_done;
6550
6551                 if( edge & mask_edge ) {
6552                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6553                     mv_done = 1;
6554                 }
6555                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6556                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6557                     mv_done = 1;
6558                 }
6559                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6560                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6561                     int bn_idx= b_idx - (dir ? 8:1);
6562                     int v = 0;
6563
6564                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6565                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6566                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6567                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6568                     }
6569
6570                     if(h->slice_type_nos == FF_B_TYPE && v){
6571                         v=0;
6572                         for( l = 0; !v && l < 2; l++ ) {
6573                             int ln= 1-l;
6574                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6575                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6576                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6577                         }
6578                     }
6579
6580                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6581                     mv_done = 1;
6582                 }
6583                 else
6584                     mv_done = 0;
6585
6586                 for( i = 0; i < 4; i++ ) {
6587                     int x = dir == 0 ? edge : i;
6588                     int y = dir == 0 ? i    : edge;
6589                     int b_idx= 8 + 4 + x + 8*y;
6590                     int bn_idx= b_idx - (dir ? 8:1);
6591
6592                     if( h->non_zero_count_cache[b_idx] != 0 ||
6593                         h->non_zero_count_cache[bn_idx] != 0 ) {
6594                         bS[i] = 2;
6595                     }
6596                     else if(!mv_done)
6597                     {
6598                         bS[i] = 0;
6599                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6600                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6601                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6602                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6603                                 bS[i] = 1;
6604                                 break;
6605                             }
6606                         }
6607
6608                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6609                             bS[i] = 0;
6610                             for( l = 0; l < 2; l++ ) {
6611                                 int ln= 1-l;
6612                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6613                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6614                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6615                                     bS[i] = 1;
6616                                     break;
6617                                 }
6618                             }
6619                         }
6620                     }
6621                 }
6622
6623                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6624                     continue;
6625             }
6626
6627             /* Filter edge */
6628             // Do not use s->qscale as luma quantizer because it has not the same
6629             // value in IPCM macroblocks.
6630             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6631             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6632             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6633             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6634             if( dir == 0 ) {
6635                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6636                 if( (edge&1) == 0 ) {
6637                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6638                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6639                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6640                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6641                 }
6642             } else {
6643                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6644                 if( (edge&1) == 0 ) {
6645                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6646                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6647                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6648                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6649                 }
6650             }
6651         }
6652     }
6653 }
6654
6655 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6656     MpegEncContext * const s = &h->s;
6657     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6658
6659     s->mb_skip_run= -1;
6660
6661     if( h->pps.cabac ) {
6662         int i;
6663
6664         /* realign */
6665         align_get_bits( &s->gb );
6666
6667         /* init cabac */
6668         ff_init_cabac_states( &h->cabac);
6669         ff_init_cabac_decoder( &h->cabac,
6670                                s->gb.buffer + get_bits_count(&s->gb)/8,
6671                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6672         /* calculate pre-state */
6673         for( i= 0; i < 460; i++ ) {
6674             int pre;
6675             if( h->slice_type_nos == FF_I_TYPE )
6676                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6677             else
6678                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6679
6680             if( pre <= 63 )
6681                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6682             else
6683                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6684         }
6685
6686         for(;;){
6687 //START_TIMER
6688             int ret = decode_mb_cabac(h);
6689             int eos;
6690 //STOP_TIMER("decode_mb_cabac")
6691
6692             if(ret>=0) hl_decode_mb(h);
6693
6694             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6695                 s->mb_y++;
6696
6697                 if(ret>=0) ret = decode_mb_cabac(h);
6698
6699                 if(ret>=0) hl_decode_mb(h);
6700                 s->mb_y--;
6701             }
6702             eos = get_cabac_terminate( &h->cabac );
6703
6704             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6705                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6706                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6707                 return -1;
6708             }
6709
6710             if( ++s->mb_x >= s->mb_width ) {
6711                 s->mb_x = 0;
6712                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6713                 ++s->mb_y;
6714                 if(FIELD_OR_MBAFF_PICTURE) {
6715                     ++s->mb_y;
6716                 }
6717             }
6718
6719             if( eos || s->mb_y >= s->mb_height ) {
6720                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6721                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6722                 return 0;
6723             }
6724         }
6725
6726     } else {
6727         for(;;){
6728             int ret = decode_mb_cavlc(h);
6729
6730             if(ret>=0) hl_decode_mb(h);
6731
6732             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6733                 s->mb_y++;
6734                 ret = decode_mb_cavlc(h);
6735
6736                 if(ret>=0) hl_decode_mb(h);
6737                 s->mb_y--;
6738             }
6739
6740             if(ret<0){
6741                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6742                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6743
6744                 return -1;
6745             }
6746
6747             if(++s->mb_x >= s->mb_width){
6748                 s->mb_x=0;
6749                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6750                 ++s->mb_y;
6751                 if(FIELD_OR_MBAFF_PICTURE) {
6752                     ++s->mb_y;
6753                 }
6754                 if(s->mb_y >= s->mb_height){
6755                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6756
6757                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6758                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6759
6760                         return 0;
6761                     }else{
6762                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6763
6764                         return -1;
6765                     }
6766                 }
6767             }
6768
6769             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6770                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6771                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6772                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6773
6774                     return 0;
6775                 }else{
6776                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6777
6778                     return -1;
6779                 }
6780             }
6781         }
6782     }
6783
6784 #if 0
6785     for(;s->mb_y < s->mb_height; s->mb_y++){
6786         for(;s->mb_x < s->mb_width; s->mb_x++){
6787             int ret= decode_mb(h);
6788
6789             hl_decode_mb(h);
6790
6791             if(ret<0){
6792                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6793                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6794
6795                 return -1;
6796             }
6797
6798             if(++s->mb_x >= s->mb_width){
6799                 s->mb_x=0;
6800                 if(++s->mb_y >= s->mb_height){
6801                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6802                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6803
6804                         return 0;
6805                     }else{
6806                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6807
6808                         return -1;
6809                     }
6810                 }
6811             }
6812
6813             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6814                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6815                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6816
6817                     return 0;
6818                 }else{
6819                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6820
6821                     return -1;
6822                 }
6823             }
6824         }
6825         s->mb_x=0;
6826         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6827     }
6828 #endif
6829     return -1; //not reached
6830 }
6831
6832 static int decode_unregistered_user_data(H264Context *h, int size){
6833     MpegEncContext * const s = &h->s;
6834     uint8_t user_data[16+256];
6835     int e, build, i;
6836
6837     if(size<16)
6838         return -1;
6839
6840     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6841         user_data[i]= get_bits(&s->gb, 8);
6842     }
6843
6844     user_data[i]= 0;
6845     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6846     if(e==1 && build>=0)
6847         h->x264_build= build;
6848
6849     if(s->avctx->debug & FF_DEBUG_BUGS)
6850         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6851
6852     for(; i<size; i++)
6853         skip_bits(&s->gb, 8);
6854
6855     return 0;
6856 }
6857
6858 static int decode_sei(H264Context *h){
6859     MpegEncContext * const s = &h->s;
6860
6861     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6862         int size, type;
6863
6864         type=0;
6865         do{
6866             type+= show_bits(&s->gb, 8);
6867         }while(get_bits(&s->gb, 8) == 255);
6868
6869         size=0;
6870         do{
6871             size+= show_bits(&s->gb, 8);
6872         }while(get_bits(&s->gb, 8) == 255);
6873
6874         switch(type){
6875         case 5:
6876             if(decode_unregistered_user_data(h, size) < 0)
6877                 return -1;
6878             break;
6879         default:
6880             skip_bits(&s->gb, 8*size);
6881         }
6882
6883         //FIXME check bits here
6884         align_get_bits(&s->gb);
6885     }
6886
6887     return 0;
6888 }
6889
6890 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6891     MpegEncContext * const s = &h->s;
6892     int cpb_count, i;
6893     cpb_count = get_ue_golomb(&s->gb) + 1;
6894     get_bits(&s->gb, 4); /* bit_rate_scale */
6895     get_bits(&s->gb, 4); /* cpb_size_scale */
6896     for(i=0; i<cpb_count; i++){
6897         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6898         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6899         get_bits1(&s->gb);     /* cbr_flag */
6900     }
6901     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6902     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6903     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6904     get_bits(&s->gb, 5); /* time_offset_length */
6905 }
6906
6907 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6908     MpegEncContext * const s = &h->s;
6909     int aspect_ratio_info_present_flag;
6910     unsigned int aspect_ratio_idc;
6911     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6912
6913     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6914
6915     if( aspect_ratio_info_present_flag ) {
6916         aspect_ratio_idc= get_bits(&s->gb, 8);
6917         if( aspect_ratio_idc == EXTENDED_SAR ) {
6918             sps->sar.num= get_bits(&s->gb, 16);
6919             sps->sar.den= get_bits(&s->gb, 16);
6920         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6921             sps->sar=  pixel_aspect[aspect_ratio_idc];
6922         }else{
6923             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6924             return -1;
6925         }
6926     }else{
6927         sps->sar.num=
6928         sps->sar.den= 0;
6929     }
6930 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6931
6932     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6933         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6934     }
6935
6936     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6937         get_bits(&s->gb, 3);    /* video_format */
6938         get_bits1(&s->gb);      /* video_full_range_flag */
6939         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6940             get_bits(&s->gb, 8); /* colour_primaries */
6941             get_bits(&s->gb, 8); /* transfer_characteristics */
6942             get_bits(&s->gb, 8); /* matrix_coefficients */
6943         }
6944     }
6945
6946     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6947         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6948         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6949     }
6950
6951     sps->timing_info_present_flag = get_bits1(&s->gb);
6952     if(sps->timing_info_present_flag){
6953         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6954         sps->time_scale = get_bits_long(&s->gb, 32);
6955         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6956     }
6957
6958     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6959     if(nal_hrd_parameters_present_flag)
6960         decode_hrd_parameters(h, sps);
6961     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6962     if(vcl_hrd_parameters_present_flag)
6963         decode_hrd_parameters(h, sps);
6964     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6965         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6966     get_bits1(&s->gb);         /* pic_struct_present_flag */
6967
6968     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6969     if(sps->bitstream_restriction_flag){
6970         unsigned int num_reorder_frames;
6971         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6972         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6973         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6974         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6975         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6976         num_reorder_frames= get_ue_golomb(&s->gb);
6977         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6978
6979         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6980             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6981             return -1;
6982         }
6983
6984         sps->num_reorder_frames= num_reorder_frames;
6985     }
6986
6987     return 0;
6988 }
6989
6990 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6991                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6992     MpegEncContext * const s = &h->s;
6993     int i, last = 8, next = 8;
6994     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6995     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6996         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6997     else
6998     for(i=0;i<size;i++){
6999         if(next)
7000             next = (last + get_se_golomb(&s->gb)) & 0xff;
7001         if(!i && !next){ /* matrix not written, we use the preset one */
7002             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7003             break;
7004         }
7005         last = factors[scan[i]] = next ? next : last;
7006     }
7007 }
7008
7009 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7010                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7011     MpegEncContext * const s = &h->s;
7012     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7013     const uint8_t *fallback[4] = {
7014         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7015         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7016         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7017         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7018     };
7019     if(get_bits1(&s->gb)){
7020         sps->scaling_matrix_present |= is_sps;
7021         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7022         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7023         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7024         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7025         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7026         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7027         if(is_sps || pps->transform_8x8_mode){
7028             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7029             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7030         }
7031     } else if(fallback_sps) {
7032         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7033         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7034     }
7035 }
7036
7037 /**
7038  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7039  */
7040 static void *
7041 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7042                     const size_t size, const char *name)
7043 {
7044     if(id>=max) {
7045         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7046         return NULL;
7047     }
7048
7049     if(!vec[id]) {
7050         vec[id] = av_mallocz(size);
7051         if(vec[id] == NULL)
7052             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7053     }
7054     return vec[id];
7055 }
7056
7057 static inline int decode_seq_parameter_set(H264Context *h){
7058     MpegEncContext * const s = &h->s;
7059     int profile_idc, level_idc;
7060     unsigned int sps_id, tmp, mb_width, mb_height;
7061     int i;
7062     SPS *sps;
7063
7064     profile_idc= get_bits(&s->gb, 8);
7065     get_bits1(&s->gb);   //constraint_set0_flag
7066     get_bits1(&s->gb);   //constraint_set1_flag
7067     get_bits1(&s->gb);   //constraint_set2_flag
7068     get_bits1(&s->gb);   //constraint_set3_flag
7069     get_bits(&s->gb, 4); // reserved
7070     level_idc= get_bits(&s->gb, 8);
7071     sps_id= get_ue_golomb(&s->gb);
7072
7073     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7074     if(sps == NULL)
7075         return -1;
7076
7077     sps->profile_idc= profile_idc;
7078     sps->level_idc= level_idc;
7079
7080     if(sps->profile_idc >= 100){ //high profile
7081         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7082         if(sps->chroma_format_idc == 3)
7083             get_bits1(&s->gb);  //residual_color_transform_flag
7084         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7085         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7086         sps->transform_bypass = get_bits1(&s->gb);
7087         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7088     }else{
7089         sps->scaling_matrix_present = 0;
7090         sps->chroma_format_idc= 1;
7091     }
7092
7093     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7094     sps->poc_type= get_ue_golomb(&s->gb);
7095
7096     if(sps->poc_type == 0){ //FIXME #define
7097         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7098     } else if(sps->poc_type == 1){//FIXME #define
7099         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7100         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7101         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7102         tmp= get_ue_golomb(&s->gb);
7103
7104         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7105             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7106             return -1;
7107         }
7108         sps->poc_cycle_length= tmp;
7109
7110         for(i=0; i<sps->poc_cycle_length; i++)
7111             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7112     }else if(sps->poc_type != 2){
7113         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7114         return -1;
7115     }
7116
7117     tmp= get_ue_golomb(&s->gb);
7118     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7119         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7120         return -1;
7121     }
7122     sps->ref_frame_count= tmp;
7123     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7124     mb_width= get_ue_golomb(&s->gb) + 1;
7125     mb_height= get_ue_golomb(&s->gb) + 1;
7126     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7127        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7128         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7129         return -1;
7130     }
7131     sps->mb_width = mb_width;
7132     sps->mb_height= mb_height;
7133
7134     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7135     if(!sps->frame_mbs_only_flag)
7136         sps->mb_aff= get_bits1(&s->gb);
7137     else
7138         sps->mb_aff= 0;
7139
7140     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7141
7142 #ifndef ALLOW_INTERLACE
7143     if(sps->mb_aff)
7144         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7145 #endif
7146     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7147         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7148
7149     sps->crop= get_bits1(&s->gb);
7150     if(sps->crop){
7151         sps->crop_left  = get_ue_golomb(&s->gb);
7152         sps->crop_right = get_ue_golomb(&s->gb);
7153         sps->crop_top   = get_ue_golomb(&s->gb);
7154         sps->crop_bottom= get_ue_golomb(&s->gb);
7155         if(sps->crop_left || sps->crop_top){
7156             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7157         }
7158         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7159             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7160         }
7161     }else{
7162         sps->crop_left  =
7163         sps->crop_right =
7164         sps->crop_top   =
7165         sps->crop_bottom= 0;
7166     }
7167
7168     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7169     if( sps->vui_parameters_present_flag )
7170         decode_vui_parameters(h, sps);
7171
7172     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7173         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7174                sps_id, sps->profile_idc, sps->level_idc,
7175                sps->poc_type,
7176                sps->ref_frame_count,
7177                sps->mb_width, sps->mb_height,
7178                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7179                sps->direct_8x8_inference_flag ? "8B8" : "",
7180                sps->crop_left, sps->crop_right,
7181                sps->crop_top, sps->crop_bottom,
7182                sps->vui_parameters_present_flag ? "VUI" : "",
7183                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7184                );
7185     }
7186     return 0;
7187 }
7188
7189 static void
7190 build_qp_table(PPS *pps, int t, int index)
7191 {
7192     int i;
7193     for(i = 0; i < 52; i++)
7194         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7195 }
7196
7197 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7198     MpegEncContext * const s = &h->s;
7199     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7200     PPS *pps;
7201
7202     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7203     if(pps == NULL)
7204         return -1;
7205
7206     tmp= get_ue_golomb(&s->gb);
7207     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7208         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7209         return -1;
7210     }
7211     pps->sps_id= tmp;
7212
7213     pps->cabac= get_bits1(&s->gb);
7214     pps->pic_order_present= get_bits1(&s->gb);
7215     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7216     if(pps->slice_group_count > 1 ){
7217         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7218         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7219         switch(pps->mb_slice_group_map_type){
7220         case 0:
7221 #if 0
7222 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7223 |    run_length[ i ]                                |1  |ue(v)   |
7224 #endif
7225             break;
7226         case 2:
7227 #if 0
7228 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7229 |{                                                  |   |        |
7230 |    top_left_mb[ i ]                               |1  |ue(v)   |
7231 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7232 |   }                                               |   |        |
7233 #endif
7234             break;
7235         case 3:
7236         case 4:
7237         case 5:
7238 #if 0
7239 |   slice_group_change_direction_flag               |1  |u(1)    |
7240 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7241 #endif
7242             break;
7243         case 6:
7244 #if 0
7245 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7246 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7247 |)                                                  |   |        |
7248 |    slice_group_id[ i ]                            |1  |u(v)    |
7249 #endif
7250             break;
7251         }
7252     }
7253     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7254     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7255     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7256         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7257         pps->ref_count[0]= pps->ref_count[1]= 1;
7258         return -1;
7259     }
7260
7261     pps->weighted_pred= get_bits1(&s->gb);
7262     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7263     pps->init_qp= get_se_golomb(&s->gb) + 26;
7264     pps->init_qs= get_se_golomb(&s->gb) + 26;
7265     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7266     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7267     pps->constrained_intra_pred= get_bits1(&s->gb);
7268     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7269
7270     pps->transform_8x8_mode= 0;
7271     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7272     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7273     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7274
7275     if(get_bits_count(&s->gb) < bit_length){
7276         pps->transform_8x8_mode= get_bits1(&s->gb);
7277         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7278         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7279     } else {
7280         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7281     }
7282
7283     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7284     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7285     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7286         h->pps.chroma_qp_diff= 1;
7287
7288     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7289         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7290                pps_id, pps->sps_id,
7291                pps->cabac ? "CABAC" : "CAVLC",
7292                pps->slice_group_count,
7293                pps->ref_count[0], pps->ref_count[1],
7294                pps->weighted_pred ? "weighted" : "",
7295                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7296                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7297                pps->constrained_intra_pred ? "CONSTR" : "",
7298                pps->redundant_pic_cnt_present ? "REDU" : "",
7299                pps->transform_8x8_mode ? "8x8DCT" : ""
7300                );
7301     }
7302
7303     return 0;
7304 }
7305
7306 /**
7307  * Call decode_slice() for each context.
7308  *
7309  * @param h h264 master context
7310  * @param context_count number of contexts to execute
7311  */
7312 static void execute_decode_slices(H264Context *h, int context_count){
7313     MpegEncContext * const s = &h->s;
7314     AVCodecContext * const avctx= s->avctx;
7315     H264Context *hx;
7316     int i;
7317
7318     if(context_count == 1) {
7319         decode_slice(avctx, h);
7320     } else {
7321         for(i = 1; i < context_count; i++) {
7322             hx = h->thread_context[i];
7323             hx->s.error_resilience = avctx->error_resilience;
7324             hx->s.error_count = 0;
7325         }
7326
7327         avctx->execute(avctx, (void *)decode_slice,
7328                        (void **)h->thread_context, NULL, context_count);
7329
7330         /* pull back stuff from slices to master context */
7331         hx = h->thread_context[context_count - 1];
7332         s->mb_x = hx->s.mb_x;
7333         s->mb_y = hx->s.mb_y;
7334         s->dropable = hx->s.dropable;
7335         s->picture_structure = hx->s.picture_structure;
7336         for(i = 1; i < context_count; i++)
7337             h->s.error_count += h->thread_context[i]->s.error_count;
7338     }
7339 }
7340
7341
7342 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7343     MpegEncContext * const s = &h->s;
7344     AVCodecContext * const avctx= s->avctx;
7345     int buf_index=0;
7346     H264Context *hx; ///< thread context
7347     int context_count = 0;
7348
7349     h->max_contexts = avctx->thread_count;
7350 #if 0
7351     int i;
7352     for(i=0; i<50; i++){
7353         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7354     }
7355 #endif
7356     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7357         h->current_slice = 0;
7358         if (!s->first_field)
7359             s->current_picture_ptr= NULL;
7360     }
7361
7362     for(;;){
7363         int consumed;
7364         int dst_length;
7365         int bit_length;
7366         const uint8_t *ptr;
7367         int i, nalsize = 0;
7368         int err;
7369
7370         if(h->is_avc) {
7371             if(buf_index >= buf_size) break;
7372             nalsize = 0;
7373             for(i = 0; i < h->nal_length_size; i++)
7374                 nalsize = (nalsize << 8) | buf[buf_index++];
7375             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7376                 if(nalsize == 1){
7377                     buf_index++;
7378                     continue;
7379                 }else{
7380                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7381                     break;
7382                 }
7383             }
7384         } else {
7385             // start code prefix search
7386             for(; buf_index + 3 < buf_size; buf_index++){
7387                 // This should always succeed in the first iteration.
7388                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7389                     break;
7390             }
7391
7392             if(buf_index+3 >= buf_size) break;
7393
7394             buf_index+=3;
7395         }
7396
7397         hx = h->thread_context[context_count];
7398
7399         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7400         if (ptr==NULL || dst_length < 0){
7401             return -1;
7402         }
7403         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7404             dst_length--;
7405         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7406
7407         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7408             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7409         }
7410
7411         if (h->is_avc && (nalsize != consumed)){
7412             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7413             consumed= nalsize;
7414         }
7415
7416         buf_index += consumed;
7417
7418         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7419            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7420             continue;
7421
7422       again:
7423         err = 0;
7424         switch(hx->nal_unit_type){
7425         case NAL_IDR_SLICE:
7426             if (h->nal_unit_type != NAL_IDR_SLICE) {
7427                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7428                 return -1;
7429             }
7430             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7431         case NAL_SLICE:
7432             init_get_bits(&hx->s.gb, ptr, bit_length);
7433             hx->intra_gb_ptr=
7434             hx->inter_gb_ptr= &hx->s.gb;
7435             hx->s.data_partitioning = 0;
7436
7437             if((err = decode_slice_header(hx, h)))
7438                break;
7439
7440             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7441             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7442                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7443                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7444                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7445                && avctx->skip_frame < AVDISCARD_ALL)
7446                 context_count++;
7447             break;
7448         case NAL_DPA:
7449             init_get_bits(&hx->s.gb, ptr, bit_length);
7450             hx->intra_gb_ptr=
7451             hx->inter_gb_ptr= NULL;
7452             hx->s.data_partitioning = 1;
7453
7454             err = decode_slice_header(hx, h);
7455             break;
7456         case NAL_DPB:
7457             init_get_bits(&hx->intra_gb, ptr, bit_length);
7458             hx->intra_gb_ptr= &hx->intra_gb;
7459             break;
7460         case NAL_DPC:
7461             init_get_bits(&hx->inter_gb, ptr, bit_length);
7462             hx->inter_gb_ptr= &hx->inter_gb;
7463
7464             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7465                && s->context_initialized
7466                && s->hurry_up < 5
7467                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7468                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7469                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7470                && avctx->skip_frame < AVDISCARD_ALL)
7471                 context_count++;
7472             break;
7473         case NAL_SEI:
7474             init_get_bits(&s->gb, ptr, bit_length);
7475             decode_sei(h);
7476             break;
7477         case NAL_SPS:
7478             init_get_bits(&s->gb, ptr, bit_length);
7479             decode_seq_parameter_set(h);
7480
7481             if(s->flags& CODEC_FLAG_LOW_DELAY)
7482                 s->low_delay=1;
7483
7484             if(avctx->has_b_frames < 2)
7485                 avctx->has_b_frames= !s->low_delay;
7486             break;
7487         case NAL_PPS:
7488             init_get_bits(&s->gb, ptr, bit_length);
7489
7490             decode_picture_parameter_set(h, bit_length);
7491
7492             break;
7493         case NAL_AUD:
7494         case NAL_END_SEQUENCE:
7495         case NAL_END_STREAM:
7496         case NAL_FILLER_DATA:
7497         case NAL_SPS_EXT:
7498         case NAL_AUXILIARY_SLICE:
7499             break;
7500         default:
7501             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7502         }
7503
7504         if(context_count == h->max_contexts) {
7505             execute_decode_slices(h, context_count);
7506             context_count = 0;
7507         }
7508
7509         if (err < 0)
7510             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7511         else if(err == 1) {
7512             /* Slice could not be decoded in parallel mode, copy down
7513              * NAL unit stuff to context 0 and restart. Note that
7514              * rbsp_buffer is not transferred, but since we no longer
7515              * run in parallel mode this should not be an issue. */
7516             h->nal_unit_type = hx->nal_unit_type;
7517             h->nal_ref_idc   = hx->nal_ref_idc;
7518             hx = h;
7519             goto again;
7520         }
7521     }
7522     if(context_count)
7523         execute_decode_slices(h, context_count);
7524     return buf_index;
7525 }
7526
7527 /**
7528  * returns the number of bytes consumed for building the current frame
7529  */
7530 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7531         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7532         if(pos+10>buf_size) pos=buf_size; // oops ;)
7533
7534         return pos;
7535 }
7536
7537 static int decode_frame(AVCodecContext *avctx,
7538                              void *data, int *data_size,
7539                              const uint8_t *buf, int buf_size)
7540 {
7541     H264Context *h = avctx->priv_data;
7542     MpegEncContext *s = &h->s;
7543     AVFrame *pict = data;
7544     int buf_index;
7545
7546     s->flags= avctx->flags;
7547     s->flags2= avctx->flags2;
7548
7549    /* end of stream, output what is still in the buffers */
7550     if (buf_size == 0) {
7551         Picture *out;
7552         int i, out_idx;
7553
7554 //FIXME factorize this with the output code below
7555         out = h->delayed_pic[0];
7556         out_idx = 0;
7557         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7558             if(h->delayed_pic[i]->poc < out->poc){
7559                 out = h->delayed_pic[i];
7560                 out_idx = i;
7561             }
7562
7563         for(i=out_idx; h->delayed_pic[i]; i++)
7564             h->delayed_pic[i] = h->delayed_pic[i+1];
7565
7566         if(out){
7567             *data_size = sizeof(AVFrame);
7568             *pict= *(AVFrame*)out;
7569         }
7570
7571         return 0;
7572     }
7573
7574     if(h->is_avc && !h->got_avcC) {
7575         int i, cnt, nalsize;
7576         unsigned char *p = avctx->extradata;
7577         if(avctx->extradata_size < 7) {
7578             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7579             return -1;
7580         }
7581         if(*p != 1) {
7582             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7583             return -1;
7584         }
7585         /* sps and pps in the avcC always have length coded with 2 bytes,
7586            so put a fake nal_length_size = 2 while parsing them */
7587         h->nal_length_size = 2;
7588         // Decode sps from avcC
7589         cnt = *(p+5) & 0x1f; // Number of sps
7590         p += 6;
7591         for (i = 0; i < cnt; i++) {
7592             nalsize = AV_RB16(p) + 2;
7593             if(decode_nal_units(h, p, nalsize) < 0) {
7594                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7595                 return -1;
7596             }
7597             p += nalsize;
7598         }
7599         // Decode pps from avcC
7600         cnt = *(p++); // Number of pps
7601         for (i = 0; i < cnt; i++) {
7602             nalsize = AV_RB16(p) + 2;
7603             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7604                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7605                 return -1;
7606             }
7607             p += nalsize;
7608         }
7609         // Now store right nal length size, that will be use to parse all other nals
7610         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7611         // Do not reparse avcC
7612         h->got_avcC = 1;
7613     }
7614
7615     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7616         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7617             return -1;
7618     }
7619
7620     buf_index=decode_nal_units(h, buf, buf_size);
7621     if(buf_index < 0)
7622         return -1;
7623
7624     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7625         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7626         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7627         return -1;
7628     }
7629
7630     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7631         Picture *out = s->current_picture_ptr;
7632         Picture *cur = s->current_picture_ptr;
7633         int i, pics, cross_idr, out_of_order, out_idx;
7634
7635         s->mb_y= 0;
7636
7637         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7638         s->current_picture_ptr->pict_type= s->pict_type;
7639
7640         if(!s->dropable) {
7641             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7642             h->prev_poc_msb= h->poc_msb;
7643             h->prev_poc_lsb= h->poc_lsb;
7644         }
7645         h->prev_frame_num_offset= h->frame_num_offset;
7646         h->prev_frame_num= h->frame_num;
7647
7648         /*
7649          * FIXME: Error handling code does not seem to support interlaced
7650          * when slices span multiple rows
7651          * The ff_er_add_slice calls don't work right for bottom
7652          * fields; they cause massive erroneous error concealing
7653          * Error marking covers both fields (top and bottom).
7654          * This causes a mismatched s->error_count
7655          * and a bad error table. Further, the error count goes to
7656          * INT_MAX when called for bottom field, because mb_y is
7657          * past end by one (callers fault) and resync_mb_y != 0
7658          * causes problems for the first MB line, too.
7659          */
7660         if (!FIELD_PICTURE)
7661             ff_er_frame_end(s);
7662
7663         MPV_frame_end(s);
7664
7665         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7666             /* Wait for second field. */
7667             *data_size = 0;
7668
7669         } else {
7670             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7671             /* Derive top_field_first from field pocs. */
7672             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7673
7674         //FIXME do something with unavailable reference frames
7675
7676             /* Sort B-frames into display order */
7677
7678             if(h->sps.bitstream_restriction_flag
7679                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7680                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7681                 s->low_delay = 0;
7682             }
7683
7684             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7685                && !h->sps.bitstream_restriction_flag){
7686                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7687                 s->low_delay= 0;
7688             }
7689
7690             pics = 0;
7691             while(h->delayed_pic[pics]) pics++;
7692
7693             assert(pics <= MAX_DELAYED_PIC_COUNT);
7694
7695             h->delayed_pic[pics++] = cur;
7696             if(cur->reference == 0)
7697                 cur->reference = DELAYED_PIC_REF;
7698
7699             out = h->delayed_pic[0];
7700             out_idx = 0;
7701             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7702                 if(h->delayed_pic[i]->poc < out->poc){
7703                     out = h->delayed_pic[i];
7704                     out_idx = i;
7705                 }
7706             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7707
7708             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7709
7710             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7711                 { }
7712             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7713                || (s->low_delay &&
7714                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7715                  || cur->pict_type == FF_B_TYPE)))
7716             {
7717                 s->low_delay = 0;
7718                 s->avctx->has_b_frames++;
7719             }
7720
7721             if(out_of_order || pics > s->avctx->has_b_frames){
7722                 out->reference &= ~DELAYED_PIC_REF;
7723                 for(i=out_idx; h->delayed_pic[i]; i++)
7724                     h->delayed_pic[i] = h->delayed_pic[i+1];
7725             }
7726             if(!out_of_order && pics > s->avctx->has_b_frames){
7727                 *data_size = sizeof(AVFrame);
7728
7729                 h->outputed_poc = out->poc;
7730                 *pict= *(AVFrame*)out;
7731             }else{
7732                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7733             }
7734         }
7735     }
7736
7737     assert(pict->data[0] || !*data_size);
7738     ff_print_debug_info(s, pict);
7739 //printf("out %d\n", (int)pict->data[0]);
7740 #if 0 //?
7741
7742     /* Return the Picture timestamp as the frame number */
7743     /* we subtract 1 because it is added on utils.c     */
7744     avctx->frame_number = s->picture_number - 1;
7745 #endif
7746     return get_consumed_bytes(s, buf_index, buf_size);
7747 }
#if 0
/**
 * Sets h->mb_avail[] for the macroblock at s->mb_x / s->mb_y.
 * Entries 0..2 cover the top-left, top and top-right neighbours, entry 3 the
 * left one; a neighbour counts as available if its h->slice_table[] entry
 * equals h->slice_num. (disabled code)
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row, nothing above */
        h->mb_avail[0]= h->mb_avail[1]= h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7767
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, only built with TEST defined.
 * Writes COUNT unsigned and COUNT signed exp golomb codes into a buffer,
 * reads them back and prints every value which does not match.
 * The 4x4 (I)DCT, quantizer and NAL layer tests are disabled (#if 0).
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    /* do not hand an uninitialized context to dsputil_init() */
    memset(&avctx, 0, sizeof(avctx));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written above was i - COUNT/2, report that one */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7942
7943
7944 static av_cold int decode_end(AVCodecContext *avctx)
7945 {
7946     /* Releases what the H264Context holds; referenced from h264_decoder. */
7947     H264Context *h = avctx->priv_data;
7948     int i;
7949
7950     for (i = 0; i < 2; i++)          /* rbsp_buffer[0], then rbsp_buffer[1] */
7951         av_freep(&h->rbsp_buffer[i]);
7952
7953     free_tables(h);                  /* FIXME: init-time state may need cleanup too */
7954     MPV_common_end(&h->s);           /* shut down the MpegEncContext embedded in h */
7955
7956     return 0;                        /* no failure path */
7957 }
7958
7959
7960 AVCodec h264_decoder = { /* descriptor wiring this file's decode_init/decode_end/decode_frame/flush_dpb into an AVCodec */
7961     "h264",                 /* short codec name */
7962     CODEC_TYPE_VIDEO,
7963     CODEC_ID_H264,
7964     sizeof(H264Context),    /* NOTE(review): presumably the size of the priv_data area that decode_end reads as H264Context -- confirm */
7965     decode_init,
7966     NULL,                   /* NOTE(review): presumably the encode slot, unused for a decoder -- confirm against AVCodec in avcodec.h */
7967     decode_end,             /* frees rbsp buffers and tables, then calls MPV_common_end */
7968     decode_frame,
7969     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY, /* flag set; the DRAW_HORIZ_BAND flag is commented out */
7970     .flush= flush_dpb,
7971     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7972 };
7973
7974 #include "svq3.c"