git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
/* File-scope VLC tables. Their initialisation and use are not visible in this
 * part of the file; judging by the names they hold the coeff_token,
 * total_zeros and run_before codes -- TODO confirm against the init code. */
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

/* Forward declarations of static functions that are used before their
 * definitions appear in this translation unit. */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static Picture * remove_long(H264Context *h, int i, int ref_mask);
  65
  66 static av_always_inline uint32_t pack16to32(int a, int b){
  67 #ifdef WORDS_BIGENDIAN
  68    return (b&0xFFFF) + (a<<16);
  69 #else
  70    return (a&0xFFFF) + (b<<16);
  71 #endif
  72 }
  73
  74 const uint8_t ff_rem6[52]={
  75 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  76 };
  77
  78 const uint8_t ff_div6[52]={
  79 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  80 };
  81
  82 static const int left_block_options[4][8]={
  83     {0,1,2,3,7,10,8,11},
  84     {2,2,3,3,8,11,8,11},
  85     {0,0,1,1,7,10,7,10},
  86     {0,2,0,2,7,10,7,10}
  87 };
  88
  89 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  90     MpegEncContext * const s = &h->s;
  91     const int mb_xy= h->mb_xy;
  92     int topleft_xy, top_xy, topright_xy, left_xy[2];
  93     int topleft_type, top_type, topright_type, left_type[2];
  94     int * left_block;
  95     int topleft_partition= -1;
  96     int i;
  97
  98     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  99
 100     //FIXME deblocking could skip the intra and nnz parts.
 101     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 102         return;
 103
 104     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 105      * stuff, I can't imagine that these complex rules are worth it. */
 106
 107     topleft_xy = top_xy - 1;
 108     topright_xy= top_xy + 1;
 109     left_xy[1] = left_xy[0] = mb_xy-1;
 110     left_block = left_block_options[0];
 111     if(FRAME_MBAFF){
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block = left_block_options[1];
 150                 } else {
 151                     left_block= left_block_options[2];
 152                 }
 153             } else {
 154                 left_xy[1] += s->mb_stride;
 155                 left_block = left_block_options[3];
 156             }
 157         }
 158     }
 159
 160     h->top_mb_xy = top_xy;
 161     h->left_mb_xy[0] = left_xy[0];
 162     h->left_mb_xy[1] = left_xy[1];
 163     if(for_deblock){
 164         topleft_type = 0;
 165         topright_type = 0;
 166         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 167         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 168         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 169
 170         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 171             int list;
 172             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 173             for(i=0; i<16; i++)
 174                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 175             for(list=0; list<h->list_count; list++){
 176                 if(USES_LIST(mb_type,list)){
 177                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 178                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 179                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 180                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 181                         dst[0] = src[0];
 182                         dst[1] = src[1];
 183                         dst[2] = src[2];
 184                         dst[3] = src[3];
 185                     }
 186                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 188                     ref += h->b8_stride;
 189                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 190                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 191                 }else{
 192                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 193                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 194                 }
 195             }
 196         }
 197     }else{
 198         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 199         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 200         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 201         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 202         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 203     }
 204
 205     if(IS_INTRA(mb_type)){
 206         h->topleft_samples_available=
 207         h->top_samples_available=
 208         h->left_samples_available= 0xFFFF;
 209         h->topright_samples_available= 0xEEEA;
 210
 211         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 212             h->topleft_samples_available= 0xB3FF;
 213             h->top_samples_available= 0x33FF;
 214             h->topright_samples_available= 0x26EA;
 215         }
 216         for(i=0; i<2; i++){
 217             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 218                 h->topleft_samples_available&= 0xDF5F;
 219                 h->left_samples_available&= 0x5F5F;
 220             }
 221         }
 222
 223         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 224             h->topleft_samples_available&= 0x7FFF;
 225
 226         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 227             h->topright_samples_available&= 0xFBFF;
 228
 229         if(IS_INTRA4x4(mb_type)){
 230             if(IS_INTRA4x4(top_type)){
 231                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 232                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 233                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 234                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 235             }else{
 236                 int pred;
 237                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 238                     pred= -1;
 239                 else{
 240                     pred= 2;
 241                 }
 242                 h->intra4x4_pred_mode_cache[4+8*0]=
 243                 h->intra4x4_pred_mode_cache[5+8*0]=
 244                 h->intra4x4_pred_mode_cache[6+8*0]=
 245                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 246             }
 247             for(i=0; i<2; i++){
 248                 if(IS_INTRA4x4(left_type[i])){
 249                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 250                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 251                 }else{
 252                     int pred;
 253                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 254                         pred= -1;
 255                     else{
 256                         pred= 2;
 257                     }
 258                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 259                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 260                 }
 261             }
 262         }
 263     }
 264
 265
 266 /*
 267 0 . T T. T T T T
 268 1 L . .L . . . .
 269 2 L . .L . . . .
 270 3 . T TL . . . .
 271 4 L . .L . . . .
 272 5 L . .. . . . .
 273 */
 274 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 275     if(top_type){
 276         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 277         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 278         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 279         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 280
 281         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 282         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 283
 284         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 285         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 286
 287     }else{
 288         h->non_zero_count_cache[4+8*0]=
 289         h->non_zero_count_cache[5+8*0]=
 290         h->non_zero_count_cache[6+8*0]=
 291         h->non_zero_count_cache[7+8*0]=
 292
 293         h->non_zero_count_cache[1+8*0]=
 294         h->non_zero_count_cache[2+8*0]=
 295
 296         h->non_zero_count_cache[1+8*3]=
 297         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 298
 299     }
 300
 301     for (i=0; i<2; i++) {
 302         if(left_type[i]){
 303             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 304             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 305             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 306             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 307         }else{
 308             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 309             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 310             h->non_zero_count_cache[0+8*1 +   8*i]=
 311             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 312         }
 313     }
 314
 315     if( h->pps.cabac ) {
 316         // top_cbp
 317         if(top_type) {
 318             h->top_cbp = h->cbp_table[top_xy];
 319         } else if(IS_INTRA(mb_type)) {
 320             h->top_cbp = 0x1C0;
 321         } else {
 322             h->top_cbp = 0;
 323         }
 324         // left_cbp
 325         if (left_type[0]) {
 326             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 327         } else if(IS_INTRA(mb_type)) {
 328             h->left_cbp = 0x1C0;
 329         } else {
 330             h->left_cbp = 0;
 331         }
 332         if (left_type[0]) {
 333             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 334         }
 335         if (left_type[1]) {
 336             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 337         }
 338     }
 339
 340 #if 1
 341     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 342         int list;
 343         for(list=0; list<h->list_count; list++){
 344             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 345                 /*if(!h->mv_cache_clean[list]){
 346                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 347                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 348                     h->mv_cache_clean[list]= 1;
 349                 }*/
 350                 continue;
 351             }
 352             h->mv_cache_clean[list]= 0;
 353
 354             if(USES_LIST(top_type, list)){
 355                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 356                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 357                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 358                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 359                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 360                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 361                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 362                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 363                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 364                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 365             }else{
 366                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 367                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 368                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 369                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 370                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 371             }
 372
 373             for(i=0; i<2; i++){
 374                 int cache_idx = scan8[0] - 1 + i*2*8;
 375                 if(USES_LIST(left_type[i], list)){
 376                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 377                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 378                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 379                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 380                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 381                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 382                 }else{
 383                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 384                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 385                     h->ref_cache[list][cache_idx  ]=
 386                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 387                 }
 388             }
 389
 390             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 391                 continue;
 392
 393             if(USES_LIST(topleft_type, list)){
 394                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 395                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 396                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 397                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 398             }else{
 399                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 400                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 401             }
 402
 403             if(USES_LIST(topright_type, list)){
 404                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 405                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 406                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 407                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 408             }else{
 409                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 410                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411             }
 412
 413             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 414                 continue;
 415
 416             h->ref_cache[list][scan8[5 ]+1] =
 417             h->ref_cache[list][scan8[7 ]+1] =
 418             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 419             h->ref_cache[list][scan8[4 ]] =
 420             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 421             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 422             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 423             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 424             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 425             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 426
 427             if( h->pps.cabac ) {
 428                 /* XXX beurk, Load mvd */
 429                 if(USES_LIST(top_type, list)){
 430                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 431                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 432                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 433                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 434                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 435                 }else{
 436                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 437                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 438                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 439                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 440                 }
 441                 if(USES_LIST(left_type[0], list)){
 442                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 443                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 444                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 445                 }else{
 446                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 447                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 448                 }
 449                 if(USES_LIST(left_type[1], list)){
 450                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 456                 }
 457                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 458                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 459                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 460                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 461                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 462
 463                 if(h->slice_type_nos == FF_B_TYPE){
 464                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 465
 466                     if(IS_DIRECT(top_type)){
 467                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 468                     }else if(IS_8X8(top_type)){
 469                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 470                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 471                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 472                     }else{
 473                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 474                     }
 475
 476                     if(IS_DIRECT(left_type[0]))
 477                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 478                     else if(IS_8X8(left_type[0]))
 479                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 480                     else
 481                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 482
 483                     if(IS_DIRECT(left_type[1]))
 484                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 485                     else if(IS_8X8(left_type[1]))
 486                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 487                     else
 488                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 489                 }
 490             }
 491
 492             if(FRAME_MBAFF){
 493 #define MAP_MVS\
 494                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 495                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 496                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 497                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 498                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 499                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 500                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 501                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 502                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 503                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 504                 if(MB_FIELD){
 505 #define MAP_F2F(idx, mb_type)\
 506                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 507                         h->ref_cache[list][idx] <<= 1;\
 508                         h->mv_cache[list][idx][1] /= 2;\
 509                         h->mvd_cache[list][idx][1] /= 2;\
 510                     }
 511                     MAP_MVS
 512 #undef MAP_F2F
 513                 }else{
 514 #define MAP_F2F(idx, mb_type)\
 515                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 516                         h->ref_cache[list][idx] >>= 1;\
 517                         h->mv_cache[list][idx][1] <<= 1;\
 518                         h->mvd_cache[list][idx][1] <<= 1;\
 519                     }
 520                     MAP_MVS
 521 #undef MAP_F2F
 522                 }
 523             }
 524         }
 525     }
 526 #endif
 527
 528     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 529 }
 530
 531 static inline void write_back_intra_pred_mode(H264Context *h){
 532     const int mb_xy= h->mb_xy;
 533
 534     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 535     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 536     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 537     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 538     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 539     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 540     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 541 }
 542
 543 /**
 544  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 545  */
 546 static inline int check_intra4x4_pred_mode(H264Context *h){
 547     MpegEncContext * const s = &h->s;
 548     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 549     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 550     int i;
 551
 552     if(!(h->top_samples_available&0x8000)){
 553         for(i=0; i<4; i++){
 554             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 555             if(status<0){
 556                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 557                 return -1;
 558             } else if(status){
 559                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 560             }
 561         }
 562     }
 563
 564     if(!(h->left_samples_available&0x8000)){
 565         for(i=0; i<4; i++){
 566             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 567             if(status<0){
 568                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 569                 return -1;
 570             } else if(status){
 571                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 572             }
 573         }
 574     }
 575
 576     return 0;
 577 } //FIXME cleanup like next
 578
 579 /**
 580  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 581  */
 582 static inline int check_intra_pred_mode(H264Context *h, int mode){
 583     MpegEncContext * const s = &h->s;
 584     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 585     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 586
 587     if(mode > 6U) {
 588         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 589         return -1;
 590     }
 591
 592     if(!(h->top_samples_available&0x8000)){
 593         mode= top[ mode ];
 594         if(mode<0){
 595             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 596             return -1;
 597         }
 598     }
 599
 600     if(!(h->left_samples_available&0x8000)){
 601         mode= left[ mode ];
 602         if(mode<0){
 603             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 604             return -1;
 605         }
 606     }
 607
 608     return mode;
 609 }
 610
 611 /**
 612  * gets the predicted intra4x4 prediction mode.
 613  */
 614 static inline int pred_intra_mode(H264Context *h, int n){
 615     const int index8= scan8[n];
 616     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 617     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 618     const int min= FFMIN(left, top);
 619
 620     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 621
 622     if(min<0) return DC_PRED;
 623     else      return min;
 624 }
 625
 626 static inline void write_back_non_zero_count(H264Context *h){
 627     const int mb_xy= h->mb_xy;
 628
 629     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 630     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 631     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 632     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 633     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 634     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 635     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 636
 637     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 638     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 639     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 640
 641     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 642     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 643     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 644
 645     if(FRAME_MBAFF){
 646         // store all luma nnzs, for deblocking
 647         int v = 0, i;
 648         for(i=0; i<16; i++)
 649             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 650         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 651     }
 652 }
 653
 654 /**
 655  * gets the predicted number of non-zero coefficients.
 656  * @param n block index
 657  */
 658 static inline int pred_non_zero_count(H264Context *h, int n){
 659     const int index8= scan8[n];
 660     const int left= h->non_zero_count_cache[index8 - 1];
 661     const int top = h->non_zero_count_cache[index8 - 8];
 662     int i= left + top;
 663
 664     if(i<64) i= (i+1)>>1;
 665
 666     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 667
 668     return i&31;
 669 }
 670
 671 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
         /* Selects the diagonal neighbour used for motion vector prediction.
          *
          * i is a position in the ref_cache/mv_cache layout (callers pass
          * scan8[...] values), not a block index. part_width is added to it to
          * reach the top-right entry of the row above.
          *
          * On return *C points at the chosen motion vector and the return value
          * is the reference stored for it. When the top-right reference is
          * PART_NOT_AVAILABLE the top-left entry (i - 8 - 1) is used instead.
          * Under FRAME_MBAFF the function may instead return early from inside
          * SET_DIAG_MV, with *C pointing at a scratch cache slot. */
 672     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 673     MpegEncContext *s = &h->s;
 674
 675     /* there is no consistent mapping of mvs to neighboring locations that will
 676      * make mbaff happy, so we can't move all this logic to fill_caches */
 677     if(FRAME_MBAFF){
 678         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 679         const int16_t *mv;
             /* mv_cache[list][scan8[0]-2] serves as a scratch slot: it is zeroed
              * here and *C is pointed at it before any of the special cases. */
 680         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 681         *C = h->mv_cache[list][scan8[0]-2];
 682
             /* SET_DIAG_MV(MV_OP, REF_OP, X4, Y4) always returns from this
              * function. It looks up the macroblock type covering the 4x4
              * position (X4, Y4) in the current picture and returns
              * LIST_NOT_USED if that macroblock does not use this list.
              * Otherwise it copies the stored MV into the scratch slot, applying
              * MV_OP to the vertical component, and returns the stored reference
              * index with REF_OP applied. It declares locals, so each use must
              * sit in its own braces. */
 683         if(!MB_FIELD
 684            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 685             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 686             if(IS_INTERLACED(mb_types[topright_xy])){
 687 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 688                 const int x4 = X4, y4 = Y4;\
 689                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 690                 if(!USES_LIST(mb_type,list))\
 691                     return LIST_NOT_USED;\
 692                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 693                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 694                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 695                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 696
 697                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 698             }
 699         }
             /* Top-right is unavailable and the block sits in cache column 4
              * ((i&7)==4) with an available left neighbour: take the value from
              * the left macroblock when its interlacing differs from ours. */
 700         if(topright_ref == PART_NOT_AVAILABLE
 701            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 702            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 703             if(!MB_FIELD
 704                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 705                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 706             }
 707             if(MB_FIELD
 708                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 709                && i >= scan8[0]+8){
 710                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 711                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 712             }
 713         }
 714 #undef SET_DIAG_MV
 715     }
 716
         /* Regular case: top-right if it is available, otherwise top-left. */
 717     if(topright_ref != PART_NOT_AVAILABLE){
 718         *C= h->mv_cache[list][ i - 8 + part_width ];
 719         return topright_ref;
 720     }else{
 721         tprintf(s->avctx, "topright MV not available\n");
 722
 723         *C= h->mv_cache[list][ i - 8 - 1 ];
 724         return h->ref_cache[list][ i - 8 - 1 ];
 725     }
 726 }
 727
 728 /**
 729  * gets the predicted MV.
 730  * @param n the block index
 731  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 732  * @param mx the x component of the predicted motion vector
 733  * @param my the y component of the predicted motion vector
 734  */
 735 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 736     const int index8= scan8[n];
 737     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 738     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 739     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 740     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 741     const int16_t * C;
 742     int diagonal_ref, match_count;
 743
 744     assert(part_width==1 || part_width==2 || part_width==4);
 745
 746 /* mv_cache
 747   B . . A T T T T
 748   U . . L . . , .
 749   U . . L . . . .
 750   U . . L . . , .
 751   . . . L . . . .
 752 */
 753
 754     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 755     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 756     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 757     if(match_count > 1){ //most common
 758         *mx= mid_pred(A[0], B[0], C[0]);
 759         *my= mid_pred(A[1], B[1], C[1]);
 760     }else if(match_count==1){
 761         if(left_ref==ref){
 762             *mx= A[0];
 763             *my= A[1];
 764         }else if(top_ref==ref){
 765             *mx= B[0];
 766             *my= B[1];
 767         }else{
 768             *mx= C[0];
 769             *my= C[1];
 770         }
 771     }else{
 772         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 773             *mx= A[0];
 774             *my= A[1];
 775         }else{
 776             *mx= mid_pred(A[0], B[0], C[0]);
 777             *my= mid_pred(A[1], B[1], C[1]);
 778         }
 779     }
 780
 781     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 782 }
 783
 784 /**
 785  * gets the directionally predicted 16x8 MV.
 786  * @param n the block index
 787  * @param mx the x component of the predicted motion vector
 788  * @param my the y component of the predicted motion vector
 789  */
 790 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 791     if(n==0){
 792         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 793         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 794
 795         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 796
 797         if(top_ref == ref){
 798             *mx= B[0];
 799             *my= B[1];
 800             return;
 801         }
 802     }else{
 803         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 804         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 805
 806         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 807
 808         if(left_ref == ref){
 809             *mx= A[0];
 810             *my= A[1];
 811             return;
 812         }
 813     }
 814
 815     //RARE
 816     pred_motion(h, n, 4, list, ref, mx, my);
 817 }
 818
 819 /**
 820  * gets the directionally predicted 8x16 MV.
 821  * @param n the block index
 822  * @param mx the x component of the predicted motion vector
 823  * @param my the y component of the predicted motion vector
 824  */
 825 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 826     if(n==0){
 827         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 828         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 829
 830         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 831
 832         if(left_ref == ref){
 833             *mx= A[0];
 834             *my= A[1];
 835             return;
 836         }
 837     }else{
 838         const int16_t * C;
 839         int diagonal_ref;
 840
 841         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 842
 843         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 844
 845         if(diagonal_ref == ref){
 846             *mx= C[0];
 847             *my= C[1];
 848             return;
 849         }
 850     }
 851
 852     //RARE
 853     pred_motion(h, n, 2, list, ref, mx, my);
 854 }
 855
 856 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 857     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 858     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 859
 860     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 861
 862     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 863        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 864        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 865
 866         *mx = *my = 0;
 867         return;
 868     }
 869
 870     pred_motion(h, 0, 4, 0, 0, mx, my);
 871
 872     return;
 873 }
 874
 875 static inline void direct_dist_scale_factor(H264Context * const h){
 876     MpegEncContext * const s = &h->s;
 877     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 878     const int poc1 = h->ref_list[1][0].poc;
 879     int i;
 880     for(i=0; i<h->ref_count[0]; i++){
 881         int poc0 = h->ref_list[0][i].poc;
 882         int td = av_clip(poc1 - poc0, -128, 127);
 883         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 884             h->dist_scale_factor[i] = 256;
 885         }else{
 886             int tb = av_clip(poc - poc0, -128, 127);
 887             int tx = (16384 + (FFABS(td) >> 1)) / td;
 888             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 889         }
 890     }
 891     if(FRAME_MBAFF){
 892         for(i=0; i<h->ref_count[0]; i++){
 893             h->dist_scale_factor_field[2*i] =
 894             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 895         }
 896     }
 897 }
 898 static inline void direct_ref_list_init(H264Context * const h){
 899     MpegEncContext * const s = &h->s;
 900     Picture * const ref1 = &h->ref_list[1][0];
 901     Picture * const cur = s->current_picture_ptr;
 902     int list, i, j;
 903     int sidx= s->picture_structure&1;
 904     if(cur->pict_type == FF_I_TYPE)
 905         cur->ref_count[sidx][0] = 0;
 906     if(cur->pict_type != FF_B_TYPE)
 907         cur->ref_count[sidx][1] = 0;
 908     for(list=0; list<2; list++){
 909         cur->ref_count[sidx][list] = h->ref_count[list];
 910         for(j=0; j<h->ref_count[list]; j++)
 911             cur->ref_poc[sidx][list][j] = h->ref_list[list][j].poc;
 912     }
 913     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 914         return;
 915     for(list=0; list<2; list++){
 916         for(i=0; i<ref1->ref_count[sidx][list]; i++){
 917             const int poc = ref1->ref_poc[sidx][list][i];
 918             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 919             for(j=0; j<h->ref_count[list]; j++)
 920                 if(h->ref_list[list][j].poc == poc){
 921                     h->map_col_to_list0[list][i] = j;
 922                     break;
 923                 }
 924         }
 925     }
 926     if(FRAME_MBAFF){
 927         for(list=0; list<2; list++){
 928             for(i=0; i<ref1->ref_count[sidx][list]; i++){
 929                 j = h->map_col_to_list0[list][i];
 930                 h->map_col_to_list0_field[list][2*i] = 2*j;
 931                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 932             }
 933         }
 934     }
 935 }
 936
 937 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
         /* Derives motion vectors and reference indices for a direct-predicted
          * macroblock, or for the direct 8x8 sub-blocks of an 8x8 macroblock,
          * and stores them in h->mv_cache and h->ref_cache.
          *
          * mb_type is in/out: the incoming value is only tested with IS_8X8();
          * the partitioning and list flags are then rewritten here.
          * h->sub_mb_type[] is updated for every sub-block that is predicted.
          *
          * Spatial prediction is used when h->direct_spatial_mv_pred is set,
          * temporal prediction otherwise. Both read the macroblock type, MVs and
          * reference indices stored in h->ref_list[1][0] (the colocated data). */
 938     MpegEncContext * const s = &h->s;
         /* fieldoff shifts the colocated position by whole macroblock rows; it
          * is 0 when picture_structure shares a bit with the reference value of
          * the first list 1 picture. */
 939     const int fieldoff= (s->picture_structure & h->ref_list[1][0].reference) ? 0 : (3-2*s->picture_structure);
 940     const int mb_xy =   h->mb_xy + s->mb_stride*fieldoff;
 941     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride + 2*h->b8_stride*fieldoff;
 942     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h-> b_stride + 4*h-> b_stride*fieldoff;
 943     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 944     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 945     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 946     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 947     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 948     const int is_b8x8 = IS_8X8(*mb_type);
 949     unsigned int sub_mb_type;
 950     int i8, i4;
 951
         /* Choose the partitioning from the colocated macroblock type. */
 952 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 953     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 954         /* FIXME save sub mb types from previous frames (or derive from MVs)
 955          * so we know exactly what block size to use */
 956         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 957         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 958     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 959         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 960         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 961     }else{
 962         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 963         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 964     }
 965     if(!is_b8x8)
 966         *mb_type |= MB_TYPE_DIRECT2;
 967     if(MB_FIELD)
 968         *mb_type |= MB_TYPE_INTERLACED;
 969
 970     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 971
 972     if(h->direct_spatial_mv_pred){
 973         int ref[2];
 974         int mv[2][2];
 975         int list;
 976
 977         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 978
 979         /* ref = min(neighbors) */
 980         for(list=0; list<2; list++){
 981             int refa = h->ref_cache[list][scan8[0] - 1];
 982             int refb = h->ref_cache[list][scan8[0] - 8];
 983             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
 984             if(refc == PART_NOT_AVAILABLE)
 985                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* The unsigned casts make negative references compare as the
                  * largest values, so the minimum is the smallest non-negative
                  * reference; it is negative only if all three are negative. */
 986             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
 987             if(ref[list] < 0)
 988                 ref[list] = -1;
 989         }
 990
             /* No usable reference in either list: use reference 0 with zero MVs
              * for both. Otherwise predict the MV of each list that has one. */
 991         if(ref[0] < 0 && ref[1] < 0){
 992             ref[0] = ref[1] = 0;
 993             mv[0][0] = mv[0][1] =
 994             mv[1][0] = mv[1][1] = 0;
 995         }else{
 996             for(list=0; list<2; list++){
 997                 if(ref[list] >= 0)
 998                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
 999                 else
1000                     mv[list][0] = mv[list][1] = 0;
1001             }
1002         }
1003
             /* Clear the flags of a list that ended up without a reference. */
1004         if(ref[1] < 0){
1005             if(!is_b8x8)
1006                 *mb_type &= ~MB_TYPE_L1;
1007             sub_mb_type &= ~MB_TYPE_L1;
1008         }else if(ref[0] < 0){
1009             if(!is_b8x8)
1010                 *mb_type &= ~MB_TYPE_L0;
1011             sub_mb_type &= ~MB_TYPE_L0;
1012         }
1013
             /* Current and colocated macroblocks differ in their interlaced
              * flag: force 8x8 partitioning and re-derive the colocated pointers
              * and strides relative to the macroblock pair. */
1014         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1015             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1016             int mb_types_col[2];
1017             int b8_stride = h->b8_stride;
1018             int b4_stride = h->b_stride;
1019
1020             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1021
1022             if(IS_INTERLACED(*mb_type)){
1023                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1024                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1025                 if(s->mb_y&1){
1026                     l1ref0 -= 2*b8_stride;
1027                     l1ref1 -= 2*b8_stride;
1028                     l1mv0 -= 4*b4_stride;
1029                     l1mv1 -= 4*b4_stride;
1030                 }
1031                 b8_stride *= 3;
1032                 b4_stride *= 6;
1033             }else{
                     /* col_parity is 1 when the second field POC of the list 1
                      * picture is at least as close to the current POC as the
                      * first one. */
1034                 int cur_poc = s->current_picture_ptr->poc;
1035                 int *col_poc = h->ref_list[1]->field_poc;
1036                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1037                 int dy = 2*col_parity - (s->mb_y&1);
1038                 mb_types_col[0] =
1039                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1040                 l1ref0 += dy*b8_stride;
1041                 l1ref1 += dy*b8_stride;
1042                 l1mv0 += 2*dy*b4_stride;
1043                 l1mv1 += 2*dy*b4_stride;
1044                 b8_stride = 0;
1045             }
1046
1047             for(i8=0; i8<4; i8++){
1048                 int x8 = i8&1;
1049                 int y8 = i8>>1;
1050                 int xy8 = x8+y8*b8_stride;
1051                 int xy4 = 3*x8+y8*b4_stride;
1052                 int a=0, b=0;
1053
                     /* in an 8x8 macroblock only the direct sub-blocks are handled */
1054                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1055                     continue;
1056                 h->sub_mb_type[i8] = sub_mb_type;
1057
1058                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1059                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* Colocated block is inter, uses reference 0 and its MV is
                      * within +-1 in both components: the predicted MV is kept
                      * only for lists whose reference is > 0, the others stay 0. */
1060                 if(!IS_INTRA(mb_types_col[y8])
1061                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1062                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1063                     if(ref[0] > 0)
1064                         a= pack16to32(mv[0][0],mv[0][1]);
1065                     if(ref[1] > 0)
1066                         b= pack16to32(mv[1][0],mv[1][1]);
1067                 }else{
1068                     a= pack16to32(mv[0][0],mv[0][1]);
1069                     b= pack16to32(mv[1][0],mv[1][1]);
1070                 }
1071                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1072                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1073             }
1074         }else if(IS_16X16(*mb_type)){
                 /* Whole macroblock handled at once, same test as above applied
                  * to the first colocated block. The list 1 fallback is skipped
                  * when x264_build is in the range 1..33. */
1075             int a=0, b=0;
1076
1077             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1078             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1079             if(!IS_INTRA(mb_type_col)
1080                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1081                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1082                        && (h->x264_build>33 || !h->x264_build)))){
1083                 if(ref[0] > 0)
1084                     a= pack16to32(mv[0][0],mv[0][1]);
1085                 if(ref[1] > 0)
1086                     b= pack16to32(mv[1][0],mv[1][1]);
1087             }else{
1088                 a= pack16to32(mv[0][0],mv[0][1]);
1089                 b= pack16to32(mv[1][0],mv[1][1]);
1090             }
1091             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1092             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1093         }else{
                 /* Per 8x8 block: write the predicted MVs and references first,
                  * then zero the MVs of lists with reference 0 wherever the
                  * colocated MV is within +-1. */
1094             for(i8=0; i8<4; i8++){
1095                 const int x8 = i8&1;
1096                 const int y8 = i8>>1;
1097
1098                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1099                     continue;
1100                 h->sub_mb_type[i8] = sub_mb_type;
1101
1102                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1103                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1104                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1105                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1106
1107                 /* col_zero_flag */
1108                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1109                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1110                                                   && (h->x264_build>33 || !h->x264_build)))){
1111                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1112                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one colocated MV, taken from the outer corner of
                              * the 8x8 block, decides for the whole block */
1113                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1114                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1115                             if(ref[0] == 0)
1116                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1117                             if(ref[1] == 0)
1118                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1119                         }
1120                     }else
1121                     for(i4=0; i4<4; i4++){
1122                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1123                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1124                             if(ref[0] == 0)
1125                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1126                             if(ref[1] == 0)
1127                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1128                         }
1129                     }
1130                 }
1131             }
1132         }
1133     }else{ /* direct temporal mv pred */
             /* Start with the frame tables; they are replaced by the *_field
              * variants below for interlaced macroblocks under FRAME_MBAFF. */
1134         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1135         const int *dist_scale_factor = h->dist_scale_factor;
1136
1137         if(FRAME_MBAFF){
1138             if(IS_INTERLACED(*mb_type)){
1139                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1140                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1141                 dist_scale_factor = h->dist_scale_factor_field;
1142             }
1143             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1144                 /* FIXME assumes direct_8x8_inference == 1 */
1145                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1146                 int mb_types_col[2];
1147                 int y_shift;
1148
1149                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1150                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1151                          | (*mb_type & MB_TYPE_INTERLACED);
1152                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1153
1154                 if(IS_INTERLACED(*mb_type)){
1155                     /* frame to field scaling */
1156                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1157                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1158                     if(s->mb_y&1){
1159                         l1ref0 -= 2*h->b8_stride;
1160                         l1ref1 -= 2*h->b8_stride;
1161                         l1mv0 -= 4*h->b_stride;
1162                         l1mv1 -= 4*h->b_stride;
1163                     }
1164                     y_shift = 0;
1165
1166                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1167                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1168                        && !is_b8x8)
1169                         *mb_type |= MB_TYPE_16x8;
1170                     else
1171                         *mb_type |= MB_TYPE_8x8;
1172                 }else{
1173                     /* field to frame scaling */
1174                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1175                      * but in MBAFF, top and bottom POC are equal */
1176                     int dy = (s->mb_y&1) ? 1 : 2;
1177                     mb_types_col[0] =
1178                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1179                     l1ref0 += dy*h->b8_stride;
1180                     l1ref1 += dy*h->b8_stride;
1181                     l1mv0 += 2*dy*h->b_stride;
1182                     l1mv1 += 2*dy*h->b_stride;
1183                     y_shift = 2;
1184
1185                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1186                        && !is_b8x8)
1187                         *mb_type |= MB_TYPE_16x16;
1188                     else
1189                         *mb_type |= MB_TYPE_8x8;
1190                 }
1191
1192                 for(i8=0; i8<4; i8++){
1193                     const int x8 = i8&1;
1194                     const int y8 = i8>>1;
1195                     int ref0, scale;
1196                     const int16_t (*l1mv)[2]= l1mv0;
1197
1198                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1199                         continue;
1200                     h->sub_mb_type[i8] = sub_mb_type;
1201
                         /* list 1 always gets reference 0; an intra colocated
                          * block yields reference 0 and zero MVs in both lists */
1202                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1203                     if(IS_INTRA(mb_types_col[y8])){
1204                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1205                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1206                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1207                         continue;
1208                     }
1209
                         /* Map the colocated reference to a list 0 index, falling
                          * back to the colocated list 1 reference and MVs when the
                          * list 0 reference is negative. */
1210                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1211                     if(ref0 >= 0)
1212                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1213                     else{
1214                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1215                         l1mv= l1mv1;
1216                     }
1217                     scale = dist_scale_factor[ref0];
1218                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1219
1220                     {
                             /* The vertical component is halved (y_shift == 0) or
                              * doubled (y_shift == 2) before scaling. The list 0 MV
                              * is the colocated MV times scale/256, rounded; the
                              * list 1 MV is that result minus the colocated MV. */
1221                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1222                         int my_col = (mv_col[1]<<y_shift)/2;
1223                         int mx = (scale * mv_col[0] + 128) >> 8;
1224                         int my = (scale * my_col + 128) >> 8;
1225                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1226                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1227                     }
1228                 }
1229                 return;
1230             }
1231         }
1232
1233         /* one-to-one mv scaling */
1234
1235         if(IS_16X16(*mb_type)){
1236             int ref, mv0, mv1;
1237
1238             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1239             if(IS_INTRA(mb_type_col)){
1240                 ref=mv0=mv1=0;
1241             }else{
1242                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1243                                                 : map_col_to_list0[1][l1ref1[0]];
1244                 const int scale = dist_scale_factor[ref0];
1245                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1246                 int mv_l0[2];
1247                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1248                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1249                 ref= ref0;
1250                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1251                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1252             }
1253             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1254             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1255             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1256         }else{
1257             for(i8=0; i8<4; i8++){
1258                 const int x8 = i8&1;
1259                 const int y8 = i8>>1;
1260                 int ref0, scale;
1261                 const int16_t (*l1mv)[2]= l1mv0;
1262
1263                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1264                     continue;
1265                 h->sub_mb_type[i8] = sub_mb_type;
1266                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1267                 if(IS_INTRA(mb_type_col)){
1268                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1269                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1270                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1271                     continue;
1272                 }
1273
1274                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1275                 if(ref0 >= 0)
1276                     ref0 = map_col_to_list0[0][ref0];
1277                 else{
1278                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1279                     l1mv= l1mv1;
1280                 }
1281                 scale = dist_scale_factor[ref0];
1282
1283                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one scaled MV per 8x8 block, or one per 4x4 block */
1284                 if(IS_SUB_8X8(sub_mb_type)){
1285                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1286                     int mx = (scale * mv_col[0] + 128) >> 8;
1287                     int my = (scale * mv_col[1] + 128) >> 8;
1288                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1289                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1290                 }else
1291                 for(i4=0; i4<4; i4++){
1292                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1293                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1294                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1295                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1296                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1297                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1298                 }
1299             }
1300         }
1301     }
1302 }
1303
1304 static inline void write_back_motion(H264Context *h, int mb_type){
1305     MpegEncContext * const s = &h->s;
1306     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1307     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1308     int list;
1309
1310     if(!USES_LIST(mb_type, 0))
1311         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1312
1313     for(list=0; list<h->list_count; list++){
1314         int y;
1315         if(!USES_LIST(mb_type, list))
1316             continue;
1317
1318         for(y=0; y<4; y++){
1319             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1320             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1321         }
1322         if( h->pps.cabac ) {
1323             if(IS_SKIP(mb_type))
1324                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1325             else
1326             for(y=0; y<4; y++){
1327                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1328                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1329             }
1330         }
1331
1332         {
1333             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1334             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1335             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1336             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1337             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1338         }
1339     }
1340
1341     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1342         if(IS_8X8(mb_type)){
1343             uint8_t *direct_table = &h->direct_table[b8_xy];
1344             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1345             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1346             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1347         }
1348     }
1349 }
1350
1351 /**
1352  * Decodes a network abstraction layer unit.
1353  * @param consumed is the number of bytes used as input
1354  * @param length is the length of the array
1355  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1356  * @returns decoded bytes, might be src+1 if no escapes
1357  */
1358 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1359     int i, si, di;
1360     uint8_t *dst;
1361     int bufidx;
1362
1363 //    src[0]&0x80;                //forbidden bit
1364     h->nal_ref_idc= src[0]>>5;
1365     h->nal_unit_type= src[0]&0x1F;
1366
1367     src++; length--;
1368 #if 0
1369     for(i=0; i<length; i++)
1370         printf("%2X ", src[i]);
1371 #endif
1372     for(i=0; i+1<length; i+=2){
1373         if(src[i]) continue;
1374         if(i>0 && src[i-1]==0) i--;
1375         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1376             if(src[i+2]!=3){
1377                 /* startcode, so we must be past the end */
1378                 length=i;
1379             }
1380             break;
1381         }
1382     }
1383
1384     if(i>=length-1){ //no escaped 0
1385         *dst_length= length;
1386         *consumed= length+1; //+1 for the header
1387         return src;
1388     }
1389
1390     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1391     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1392     dst= h->rbsp_buffer[bufidx];
1393
1394     if (dst == NULL){
1395         return NULL;
1396     }
1397
1398 //printf("decoding esc\n");
1399     si=di=0;
1400     while(si<length){
1401         //remove escapes (very rare 1:2^22)
1402         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1403             if(src[si+2]==3){ //escape
1404                 dst[di++]= 0;
1405                 dst[di++]= 0;
1406                 si+=3;
1407                 continue;
1408             }else //next start code
1409                 break;
1410         }
1411
1412         dst[di++]= src[si++];
1413     }
1414
1415     *dst_length= di;
1416     *consumed= si + 1;//+1 for the header
1417 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1418     return dst;
1419 }
1420
1421 /**
1422  * identifies the exact end of the bitstream
1423  * @return the length of the trailing, or 0 if damaged
1424  */
1425 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1426     int v= *src;
1427     int r;
1428
1429     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1430
1431     for(r=1; r<9; r++){
1432         if(v&1) return r;
1433         v>>=1;
1434     }
1435     return 0;
1436 }
1437
1438 /**
1439  * IDCT transforms the 16 dc values and dequantizes them.
1440  * @param qp quantization parameter
1441  */
1442 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1443 #define stride 16
1444     int i;
1445     int temp[16]; //FIXME check if this is a good idea
1446     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1447     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1448
1449 //memset(block, 64, 2*256);
1450 //return;
1451     for(i=0; i<4; i++){
1452         const int offset= y_offset[i];
1453         const int z0= block[offset+stride*0] + block[offset+stride*4];
1454         const int z1= block[offset+stride*0] - block[offset+stride*4];
1455         const int z2= block[offset+stride*1] - block[offset+stride*5];
1456         const int z3= block[offset+stride*1] + block[offset+stride*5];
1457
1458         temp[4*i+0]= z0+z3;
1459         temp[4*i+1]= z1+z2;
1460         temp[4*i+2]= z1-z2;
1461         temp[4*i+3]= z0-z3;
1462     }
1463
1464     for(i=0; i<4; i++){
1465         const int offset= x_offset[i];
1466         const int z0= temp[4*0+i] + temp[4*2+i];
1467         const int z1= temp[4*0+i] - temp[4*2+i];
1468         const int z2= temp[4*1+i] - temp[4*3+i];
1469         const int z3= temp[4*1+i] + temp[4*3+i];
1470
1471         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1472         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1473         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1474         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1475     }
1476 }
1477
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the stride macro defined by
 * h264_luma_dc_dequant_idct_c().
 * The quantization parameter is currently commented out of the signature. FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int tmp[16]; //FIXME check if this is a good idea
    int n;

    for(n=0; n<4; n++){
        const DCTELEM *in= block + y_offset[n];
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        tmp[4*n+0]= sum04  + sum15;
        tmp[4*n+1]= diff04 + diff15;
        tmp[4*n+2]= diff04 - diff15;
        tmp[4*n+3]= sum04  - sum15;
    }

    for(n=0; n<4; n++){
        DCTELEM *out= block + x_offset[n];
        const int sum02 = tmp[4*0+n] + tmp[4*2+n];
        const int diff02= tmp[4*0+n] - tmp[4*2+n];
        const int diff13= tmp[4*1+n] - tmp[4*3+n];
        const int sum13 = tmp[4*1+n] + tmp[4*3+n];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1517
1518 #undef xStride
1519 #undef stride
1520
1521 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1522     const int stride= 16*2;
1523     const int xStride= 16;
1524     int a,b,c,d,e;
1525
1526     a= block[stride*0 + xStride*0];
1527     b= block[stride*0 + xStride*1];
1528     c= block[stride*1 + xStride*0];
1529     d= block[stride*1 + xStride*1];
1530
1531     e= a-b;
1532     a= a+b;
1533     b= c-d;
1534     c= c+d;
1535
1536     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1537     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1538     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1539     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1540 }
1541
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Disabled code. The four values live at offsets 0, 16, 32 and 48 of block.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    const int in00= block[0];
    const int in01= block[col_step];
    const int in10= block[row_step];
    const int in11= block[row_step + col_step];
    const int row0_sum = in00 + in01;
    const int row0_diff= in00 - in01;
    const int row1_sum = in10 + in11;
    const int row1_diff= in10 - in11;

    block[0]                  = (row0_sum  + row1_sum );
    block[col_step]           = (row0_diff + row1_diff);
    block[row_step]           = (row0_sum  - row1_sum );
    block[row_step + col_step]= (row0_diff - row1_diff);
}
#endif
1564
1565 /**
1566  * gets the chroma qp.
1567  */
1568 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1569     return h->pps.chroma_qp_table[t][qscale];
1570 }
1571
1572 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1573 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1574 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1575     int i;
1576     const int * const quant_table= quant_coeff[qscale];
1577     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1578     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1579     const unsigned int threshold2= (threshold1<<1);
1580     int last_non_zero;
1581
1582     if(separate_dc){
1583         if(qscale<=18){
1584             //avoid overflows
1585             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1586             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1587             const unsigned int dc_threshold2= (dc_threshold1<<1);
1588
1589             int level= block[0]*quant_coeff[qscale+18][0];
1590             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1591                 if(level>0){
1592                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1593                     block[0]= level;
1594                 }else{
1595                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1596                     block[0]= -level;
1597                 }
1598 //                last_non_zero = i;
1599             }else{
1600                 block[0]=0;
1601             }
1602         }else{
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_table[0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }
1621         last_non_zero= 0;
1622         i=1;
1623     }else{
1624         last_non_zero= -1;
1625         i=0;
1626     }
1627
1628     for(; i<16; i++){
1629         const int j= scantable[i];
1630         int level= block[j]*quant_table[j];
1631
1632 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1633 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1634         if(((unsigned)(level+threshold1))>threshold2){
1635             if(level>0){
1636                 level= (bias + level)>>QUANT_SHIFT;
1637                 block[j]= level;
1638             }else{
1639                 level= (bias - level)>>QUANT_SHIFT;
1640                 block[j]= -level;
1641             }
1642             last_non_zero = i;
1643         }else{
1644             block[j]=0;
1645         }
1646     }
1647
1648     return last_non_zero;
1649 }
1650
1651 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1652                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1653                            int src_x_offset, int src_y_offset,
1654                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1655     MpegEncContext * const s = &h->s;
1656     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1657     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1658     const int luma_xy= (mx&3) + ((my&3)<<2);
1659     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1660     uint8_t * src_cb, * src_cr;
1661     int extra_width= h->emu_edge_width;
1662     int extra_height= h->emu_edge_height;
1663     int emu=0;
1664     const int full_mx= mx>>2;
1665     const int full_my= my>>2;
1666     const int pic_width  = 16*s->mb_width;
1667     const int pic_height = 16*s->mb_height >> MB_FIELD;
1668
1669     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1670         return;
1671
1672     if(mx&7) extra_width -= 3;
1673     if(my&7) extra_height -= 3;
1674
1675     if(   full_mx < 0-extra_width
1676        || full_my < 0-extra_height
1677        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1678        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1679         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1680             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1681         emu=1;
1682     }
1683
1684     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1685     if(!square){
1686         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1687     }
1688
1689     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1690
1691     if(MB_FIELD){
1692         // chroma offset when predicting from a field of opposite parity
1693         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1694         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1695     }
1696     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1697     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1698
1699     if(emu){
1700         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1701             src_cb= s->edge_emu_buffer;
1702     }
1703     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1704
1705     if(emu){
1706         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1707             src_cr= s->edge_emu_buffer;
1708     }
1709     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1710 }
1711
1712 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1713                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1714                            int x_offset, int y_offset,
1715                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1716                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1717                            int list0, int list1){
1718     MpegEncContext * const s = &h->s;
1719     qpel_mc_func *qpix_op=  qpix_put;
1720     h264_chroma_mc_func chroma_op= chroma_put;
1721
1722     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1723     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1724     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1725     x_offset += 8*s->mb_x;
1726     y_offset += 8*(s->mb_y >> MB_FIELD);
1727
1728     if(list0){
1729         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1730         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1731                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1732                            qpix_op, chroma_op);
1733
1734         qpix_op=  qpix_avg;
1735         chroma_op= chroma_avg;
1736     }
1737
1738     if(list1){
1739         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1740         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1741                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1742                            qpix_op, chroma_op);
1743     }
1744 }
1745
1746 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1747                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1748                            int x_offset, int y_offset,
1749                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1750                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1751                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1752                            int list0, int list1){
1753     MpegEncContext * const s = &h->s;
1754
1755     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1756     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1757     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1758     x_offset += 8*s->mb_x;
1759     y_offset += 8*(s->mb_y >> MB_FIELD);
1760
1761     if(list0 && list1){
1762         /* don't optimize for luma-only case, since B-frames usually
1763          * use implicit weights => chroma too. */
1764         uint8_t *tmp_cb = s->obmc_scratchpad;
1765         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1766         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1767         int refn0 = h->ref_cache[0][ scan8[n] ];
1768         int refn1 = h->ref_cache[1][ scan8[n] ];
1769
1770         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1771                     dest_y, dest_cb, dest_cr,
1772                     x_offset, y_offset, qpix_put, chroma_put);
1773         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1774                     tmp_y, tmp_cb, tmp_cr,
1775                     x_offset, y_offset, qpix_put, chroma_put);
1776
1777         if(h->use_weight == 2){
1778             int weight0 = h->implicit_weight[refn0][refn1];
1779             int weight1 = 64 - weight0;
1780             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1781             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1782             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1783         }else{
1784             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1785                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1786                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1787             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1788                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1789                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1790             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1791                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1792                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1793         }
1794     }else{
1795         int list = list1 ? 1 : 0;
1796         int refn = h->ref_cache[list][ scan8[n] ];
1797         Picture *ref= &h->ref_list[list][refn];
1798         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1799                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1800                     qpix_put, chroma_put);
1801
1802         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1803                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1804         if(h->use_weight_chroma){
1805             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1807             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1808                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1809         }
1810     }
1811 }
1812
1813 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1814                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1815                            int x_offset, int y_offset,
1816                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1817                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1818                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1819                            int list0, int list1){
1820     if((h->use_weight==2 && list0 && list1
1821         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1822        || h->use_weight==1)
1823         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1824                          x_offset, y_offset, qpix_put, chroma_put,
1825                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1826     else
1827         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1828                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1829 }
1830
1831 static inline void prefetch_motion(H264Context *h, int list){
1832     /* fetch pixels for estimated mv 4 macroblocks ahead
1833      * optimized for 64byte cache lines */
1834     MpegEncContext * const s = &h->s;
1835     const int refn = h->ref_cache[list][scan8[0]];
1836     if(refn >= 0){
1837         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1838         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1839         uint8_t **src= h->ref_list[list][refn].data;
1840         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1841         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1842         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1843         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1844     }
1845 }
1846
1847 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1848                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1849                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1850                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1851     MpegEncContext * const s = &h->s;
1852     const int mb_xy= h->mb_xy;
1853     const int mb_type= s->current_picture.mb_type[mb_xy];
1854
1855     assert(IS_INTER(mb_type));
1856
1857     prefetch_motion(h, 0);
1858
1859     if(IS_16X16(mb_type)){
1860         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1861                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1862                 &weight_op[0], &weight_avg[0],
1863                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1864     }else if(IS_16X8(mb_type)){
1865         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1866                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1867                 &weight_op[1], &weight_avg[1],
1868                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1869         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1870                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1871                 &weight_op[1], &weight_avg[1],
1872                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1873     }else if(IS_8X16(mb_type)){
1874         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1875                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1876                 &weight_op[2], &weight_avg[2],
1877                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1878         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1879                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1880                 &weight_op[2], &weight_avg[2],
1881                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1882     }else{
1883         int i;
1884
1885         assert(IS_8X8(mb_type));
1886
1887         for(i=0; i<4; i++){
1888             const int sub_mb_type= h->sub_mb_type[i];
1889             const int n= 4*i;
1890             int x_offset= (i&1)<<2;
1891             int y_offset= (i&2)<<1;
1892
1893             if(IS_SUB_8X8(sub_mb_type)){
1894                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1895                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1896                     &weight_op[3], &weight_avg[3],
1897                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1898             }else if(IS_SUB_8X4(sub_mb_type)){
1899                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1900                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1901                     &weight_op[4], &weight_avg[4],
1902                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1903                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1904                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1905                     &weight_op[4], &weight_avg[4],
1906                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1907             }else if(IS_SUB_4X8(sub_mb_type)){
1908                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1909                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1910                     &weight_op[5], &weight_avg[5],
1911                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1912                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1913                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1914                     &weight_op[5], &weight_avg[5],
1915                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916             }else{
1917                 int j;
1918                 assert(IS_SUB_4X4(sub_mb_type));
1919                 for(j=0; j<4; j++){
1920                     int sub_x_offset= x_offset + 2*(j&1);
1921                     int sub_y_offset= y_offset +   (j&2);
1922                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1923                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1924                         &weight_op[6], &weight_avg[6],
1925                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1926                 }
1927             }
1928         }
1929     }
1930
1931     prefetch_motion(h, 1);
1932 }
1933
1934 static av_cold void decode_init_vlc(void){
1935     static int done = 0;
1936
1937     if (!done) {
1938         int i;
1939         done = 1;
1940
1941         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1942                  &chroma_dc_coeff_token_len [0], 1, 1,
1943                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1944
1945         for(i=0; i<4; i++){
1946             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1947                      &coeff_token_len [i][0], 1, 1,
1948                      &coeff_token_bits[i][0], 1, 1, 1);
1949         }
1950
1951         for(i=0; i<3; i++){
1952             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1953                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1954                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1955         }
1956         for(i=0; i<15; i++){
1957             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1958                      &total_zeros_len [i][0], 1, 1,
1959                      &total_zeros_bits[i][0], 1, 1, 1);
1960         }
1961
1962         for(i=0; i<6; i++){
1963             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1964                      &run_len [i][0], 1, 1,
1965                      &run_bits[i][0], 1, 1, 1);
1966         }
1967         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1968                  &run_len [6][0], 1, 1,
1969                  &run_bits[6][0], 1, 1, 1);
1970     }
1971 }
1972
1973 static void free_tables(H264Context *h){
1974     int i;
1975     H264Context *hx;
1976     av_freep(&h->intra4x4_pred_mode);
1977     av_freep(&h->chroma_pred_mode_table);
1978     av_freep(&h->cbp_table);
1979     av_freep(&h->mvd_table[0]);
1980     av_freep(&h->mvd_table[1]);
1981     av_freep(&h->direct_table);
1982     av_freep(&h->non_zero_count);
1983     av_freep(&h->slice_table_base);
1984     h->slice_table= NULL;
1985
1986     av_freep(&h->mb2b_xy);
1987     av_freep(&h->mb2b8_xy);
1988
1989     for(i = 0; i < MAX_SPS_COUNT; i++)
1990         av_freep(h->sps_buffers + i);
1991
1992     for(i = 0; i < MAX_PPS_COUNT; i++)
1993         av_freep(h->pps_buffers + i);
1994
1995     for(i = 0; i < h->s.avctx->thread_count; i++) {
1996         hx = h->thread_context[i];
1997         if(!hx) continue;
1998         av_freep(&hx->top_borders[1]);
1999         av_freep(&hx->top_borders[0]);
2000         av_freep(&hx->s.obmc_scratchpad);
2001     }
2002 }
2003
2004 static void init_dequant8_coeff_table(H264Context *h){
2005     int i,q,x;
2006     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2007     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2008     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2009
2010     for(i=0; i<2; i++ ){
2011         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2012             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2013             break;
2014         }
2015
2016         for(q=0; q<52; q++){
2017             int shift = ff_div6[q];
2018             int idx = ff_rem6[q];
2019             for(x=0; x<64; x++)
2020                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2021                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2022                     h->pps.scaling_matrix8[i][x]) << shift;
2023         }
2024     }
2025 }
2026
2027 static void init_dequant4_coeff_table(H264Context *h){
2028     int i,j,q,x;
2029     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2030     for(i=0; i<6; i++ ){
2031         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2032         for(j=0; j<i; j++){
2033             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2034                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2035                 break;
2036             }
2037         }
2038         if(j<i)
2039             continue;
2040
2041         for(q=0; q<52; q++){
2042             int shift = ff_div6[q] + 2;
2043             int idx = ff_rem6[q];
2044             for(x=0; x<16; x++)
2045                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2046                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2047                     h->pps.scaling_matrix4[i][x]) << shift;
2048         }
2049     }
2050 }
2051
2052 static void init_dequant_tables(H264Context *h){
2053     int i,x;
2054     init_dequant4_coeff_table(h);
2055     if(h->pps.transform_8x8_mode)
2056         init_dequant8_coeff_table(h);
2057     if(h->sps.transform_bypass){
2058         for(i=0; i<6; i++)
2059             for(x=0; x<16; x++)
2060                 h->dequant4_coeff[i][0][x] = 1<<6;
2061         if(h->pps.transform_8x8_mode)
2062             for(i=0; i<2; i++)
2063                 for(x=0; x<64; x++)
2064                     h->dequant8_coeff[i][0][x] = 1<<6;
2065     }
2066 }
2067
2068
2069 /**
2070  * allocates tables.
2071  * needs width/height
2072  */
2073 static int alloc_tables(H264Context *h){
2074     MpegEncContext * const s = &h->s;
2075     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2076     int x,y;
2077
2078     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2079
2080     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2081     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2082     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2083
2084     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2085     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2087     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2088
2089     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2090     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2091
2092     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2093     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2094     for(y=0; y<s->mb_height; y++){
2095         for(x=0; x<s->mb_width; x++){
2096             const int mb_xy= x + y*s->mb_stride;
2097             const int b_xy = 4*x + 4*y*h->b_stride;
2098             const int b8_xy= 2*x + 2*y*h->b8_stride;
2099
2100             h->mb2b_xy [mb_xy]= b_xy;
2101             h->mb2b8_xy[mb_xy]= b8_xy;
2102         }
2103     }
2104
2105     s->obmc_scratchpad = NULL;
2106
2107     if(!h->dequant4_coeff[0])
2108         init_dequant_tables(h);
2109
2110     return 0;
2111 fail:
2112     free_tables(h);
2113     return -1;
2114 }
2115
2116 /**
2117  * Mimic alloc_tables(), but for every context thread.
2118  */
2119 static void clone_tables(H264Context *dst, H264Context *src){
2120     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2121     dst->non_zero_count           = src->non_zero_count;
2122     dst->slice_table              = src->slice_table;
2123     dst->cbp_table                = src->cbp_table;
2124     dst->mb2b_xy                  = src->mb2b_xy;
2125     dst->mb2b8_xy                 = src->mb2b8_xy;
2126     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2127     dst->mvd_table[0]             = src->mvd_table[0];
2128     dst->mvd_table[1]             = src->mvd_table[1];
2129     dst->direct_table             = src->direct_table;
2130
2131     dst->s.obmc_scratchpad = NULL;
2132     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2133 }
2134
2135 /**
2136  * Init context
2137  * Allocate buffers which are not shared amongst multiple threads.
2138  */
2139 static int context_init(H264Context *h){
2140     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2142
2143     return 0;
2144 fail:
2145     return -1; // free_tables will clean up for us
2146 }
2147
2148 static av_cold void common_init(H264Context *h){
2149     MpegEncContext * const s = &h->s;
2150
2151     s->width = s->avctx->width;
2152     s->height = s->avctx->height;
2153     s->codec_id= s->avctx->codec->id;
2154
2155     ff_h264_pred_init(&h->hpc, s->codec_id);
2156
2157     h->dequant_coeff_pps= -1;
2158     s->unrestricted_mv=1;
2159     s->decode=1; //FIXME
2160
2161     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2162     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2163 }
2164
2165 static av_cold int decode_init(AVCodecContext *avctx){
2166     H264Context *h= avctx->priv_data;
2167     MpegEncContext * const s = &h->s;
2168
2169     MPV_decode_defaults(s);
2170
2171     s->avctx = avctx;
2172     common_init(h);
2173
2174     s->out_format = FMT_H264;
2175     s->workaround_bugs= avctx->workaround_bugs;
2176
2177     // set defaults
2178 //    s->decode_mb= ff_h263_decode_mb;
2179     s->quarter_sample = 1;
2180     s->low_delay= 1;
2181
2182     if(avctx->codec_id == CODEC_ID_SVQ3)
2183         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2184     else
2185         avctx->pix_fmt= PIX_FMT_YUV420P;
2186
2187     decode_init_vlc();
2188
2189     if(avctx->extradata_size > 0 && avctx->extradata &&
2190        *(char *)avctx->extradata == 1){
2191         h->is_avc = 1;
2192         h->got_avcC = 0;
2193     } else {
2194         h->is_avc = 0;
2195     }
2196
2197     h->thread_context[0] = h;
2198     h->outputed_poc = INT_MIN;
2199     return 0;
2200 }
2201
2202 static int frame_start(H264Context *h){
2203     MpegEncContext * const s = &h->s;
2204     int i;
2205
2206     if(MPV_frame_start(s, s->avctx) < 0)
2207         return -1;
2208     ff_er_frame_start(s);
2209     /*
2210      * MPV_frame_start uses pict_type to derive key_frame.
2211      * This is incorrect for H.264; IDR markings must be used.
2212      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2213      * See decode_nal_units().
2214      */
2215     s->current_picture_ptr->key_frame= 0;
2216
2217     assert(s->linesize && s->uvlinesize);
2218
2219     for(i=0; i<16; i++){
2220         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2221         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2222     }
2223     for(i=0; i<4; i++){
2224         h->block_offset[16+i]=
2225         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2226         h->block_offset[24+16+i]=
2227         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2228     }
2229
2230     /* can't be in alloc_tables because linesize isn't known there.
2231      * FIXME: redo bipred weight to not require extra buffer? */
2232     for(i = 0; i < s->avctx->thread_count; i++)
2233         if(!h->thread_context[i]->s.obmc_scratchpad)
2234             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2235
2236     /* some macroblocks will be accessed before they're available */
2237     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2238         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2239
2240 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2241
2242     // We mark the current picture as non-reference after allocating it, so
2243     // that if we break out due to an error it can be released automatically
2244     // in the next MPV_frame_start().
2245     // SVQ3 as well as most other codecs have only last/next/current and thus
2246     // get released even with set reference, besides SVQ3 and others do not
2247     // mark frames as reference later "naturally".
2248     if(s->codec_id != CODEC_ID_SVQ3)
2249         s->current_picture_ptr->reference= 0;
2250
2251     s->current_picture_ptr->field_poc[0]=
2252     s->current_picture_ptr->field_poc[1]= INT_MAX;
2253     assert(s->current_picture_ptr->long_ref==0);
2254
2255     return 0;
2256 }
2257
2258 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2259     MpegEncContext * const s = &h->s;
2260     int i;
2261
2262     src_y  -=   linesize;
2263     src_cb -= uvlinesize;
2264     src_cr -= uvlinesize;
2265
2266     // There are two lines saved, the line above the the top macroblock of a pair,
2267     // and the line above the bottom macroblock
2268     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2269     for(i=1; i<17; i++){
2270         h->left_border[i]= src_y[15+i*  linesize];
2271     }
2272
2273     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2274     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2275
2276     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2277         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2278         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2279         for(i=1; i<9; i++){
2280             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2281             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2282         }
2283         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2284         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2285     }
2286 }
2287
2288 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2289     MpegEncContext * const s = &h->s;
2290     int temp8, i;
2291     uint64_t temp64;
2292     int deblock_left;
2293     int deblock_top;
2294     int mb_xy;
2295
2296     if(h->deblocking_filter == 2) {
2297         mb_xy = h->mb_xy;
2298         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2299         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2300     } else {
2301         deblock_left = (s->mb_x > 0);
2302         deblock_top =  (s->mb_y > 0);
2303     }
2304
2305     src_y  -=   linesize + 1;
2306     src_cb -= uvlinesize + 1;
2307     src_cr -= uvlinesize + 1;
2308
2309 #define XCHG(a,b,t,xchg)\
2310 t= a;\
2311 if(xchg)\
2312     a= b;\
2313 b= t;
2314
2315     if(deblock_left){
2316         for(i = !deblock_top; i<17; i++){
2317             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2318         }
2319     }
2320
2321     if(deblock_top){
2322         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2323         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2324         if(s->mb_x+1 < s->mb_width){
2325             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2326         }
2327     }
2328
2329     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2330         if(deblock_left){
2331             for(i = !deblock_top; i<9; i++){
2332                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2333                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2334             }
2335         }
2336         if(deblock_top){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2338             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2339         }
2340     }
2341 }
2342
2343 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2344     MpegEncContext * const s = &h->s;
2345     int i;
2346
2347     src_y  -= 2 *   linesize;
2348     src_cb -= 2 * uvlinesize;
2349     src_cr -= 2 * uvlinesize;
2350
2351     // There are two lines saved, the line above the the top macroblock of a pair,
2352     // and the line above the bottom macroblock
2353     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2354     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2355     for(i=2; i<34; i++){
2356         h->left_border[i]= src_y[15+i*  linesize];
2357     }
2358
2359     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2360     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2361     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2362     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2363
2364     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2365         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2366         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2367         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2368         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2369         for(i=2; i<18; i++){
2370             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2371             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2372         }
2373         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2374         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2375         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2376         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2377     }
2378 }
2379
2380 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2381     MpegEncContext * const s = &h->s;
2382     int temp8, i;
2383     uint64_t temp64;
2384     int deblock_left = (s->mb_x > 0);
2385     int deblock_top  = (s->mb_y > 1);
2386
2387     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2388
2389     src_y  -= 2 *   linesize + 1;
2390     src_cb -= 2 * uvlinesize + 1;
2391     src_cr -= 2 * uvlinesize + 1;
2392
2393 #define XCHG(a,b,t,xchg)\
2394 t= a;\
2395 if(xchg)\
2396     a= b;\
2397 b= t;
2398
2399     if(deblock_left){
2400         for(i = (!deblock_top)<<1; i<34; i++){
2401             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2402         }
2403     }
2404
2405     if(deblock_top){
2406         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2407         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2408         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2409         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2410         if(s->mb_x+1 < s->mb_width){
2411             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2412             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2413         }
2414     }
2415
2416     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2417         if(deblock_left){
2418             for(i = (!deblock_top) << 1; i<18; i++){
2419                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2420                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2421             }
2422         }
2423         if(deblock_top){
2424             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2425             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2426             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2427             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2428         }
2429     }
2430 }
2431
/**
 * Reconstruct the current macroblock into the current picture and, when
 * deblocking is enabled, run the loop filter on it.
 *
 * Steps, in order: compute the destination pointers, select the line sizes
 * and block offsets (frame or field macroblock), select the residual-add
 * functions, perform intra prediction or motion compensation, add the luma
 * and chroma residuals, then back up the borders and deblock.
 *
 * @param h      decoder context; the macroblock position is taken from
 *               s->mb_x, s->mb_y and h->mb_xy
 * @param simple if nonzero, every "!simple &&" path is skipped (field / MBAFF
 *               handling, I_PCM, the encoder check), the gray flag is not
 *               consulted and the H.264 code paths are always used
 */
2432 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2433     MpegEncContext * const s = &h->s;
2434     const int mb_x= s->mb_x;
2435     const int mb_y= s->mb_y;
2436     const int mb_xy= h->mb_xy;
2437     const int mb_type= s->current_picture.mb_type[mb_xy];
2438     uint8_t  *dest_y, *dest_cb, *dest_cr;
2439     int linesize, uvlinesize /*dct_offset*/;
2440     int i;
2441     int *block_offset = &h->block_offset[0];
2442     const unsigned int bottom = mb_y & 1;
2443     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2444     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2445     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2446
         /* top-left sample of this macroblock in each plane of the current picture */
2447     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2448     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2449     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2450
2451     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2452     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2453
         /* Field macroblock: the line stride is doubled and the second half of
          * block_offset (entries 24..) is used. */
2454     if (!simple && MB_FIELD) {
2455         linesize   = h->mb_linesize   = s->linesize * 2;
2456         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2457         block_offset = &h->block_offset[24];
             /* odd macroblock row: move back 15 luma / 7 chroma lines so the
              * macroblock starts one line below the row above */
2458         if(mb_y&1){ //FIXME move out of this function?
2459             dest_y -= s->linesize*15;
2460             dest_cb-= s->uvlinesize*7;
2461             dest_cr-= s->uvlinesize*7;
2462         }
             /* MBAFF: rewrite the cached reference indices as
              * (16+ref) ^ (mb_y&1) for every list this macroblock uses.
              * NOTE(review): presumably this selects per-field references in
              * hl_motion() -- confirm against that function. */
2463         if(FRAME_MBAFF) {
2464             int list;
2465             for(list=0; list<h->list_count; list++){
2466                 if(!USES_LIST(mb_type, list))
2467                     continue;
2468                 if(IS_16X16(mb_type)){
2469                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2470                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2471                 }else{
2472                     for(i=0; i<16; i+=4){
2473                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2474                         int ref = h->ref_cache[list][scan8[i]];
2475                         if(ref >= 0)
2476                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2477                     }
2478                 }
2479             }
2480         }
2481     } else {
2482         linesize   = h->mb_linesize   = s->linesize;
2483         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2484 //        dct_offset = s->linesize * 16;
2485     }
2486
         /* Select the luma residual-add routines: plain pixel addition for
          * transform bypass, otherwise the 8x8 or 4x4 IDCT. */
2487     if(transform_bypass){
2488         idct_dc_add =
2489         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2490     }else if(IS_8x8DCT(mb_type)){
2491         idct_dc_add = s->dsp.h264_idct8_dc_add;
2492         idct_add = s->dsp.h264_idct8_add;
2493     }else{
2494         idct_dc_add = s->dsp.h264_idct_dc_add;
2495         idct_add = s->dsp.h264_idct_add;
2496     }
2497
         /* MBAFF intra macroblock with deblocking enabled: exchange the pair's
          * border samples with the saved copies (xchg=1) before predicting.
          * Skipped for a bottom macroblock whose top neighbour in the pair is
          * also intra. */
2498     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2499        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2500         int mbt_y = mb_y&~1;
2501         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2502         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2503         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2504         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2505     }
2506
         /* I_PCM: the samples are copied row by row straight out of h->mb */
2507     if (!simple && IS_INTRA_PCM(mb_type)) {
2508         for (i=0; i<16; i++) {
2509             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2510         }
2511         for (i=0; i<8; i++) {
2512             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2513             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2514         }
2515     } else {
2516         if(IS_INTRA(mb_type)){
                 /* non-MBAFF: exchange the border samples with the saved copies
                  * (xchg=1) for prediction; the matching call with xchg=0
                  * follows the prediction below */
2517             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2518                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2519
                 /* chroma intra prediction, same mode for Cb and Cr */
2520             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2521                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2522                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2523             }
2524
2525             if(IS_INTRA4x4(mb_type)){
2526                 if(simple || !s->encoding){
                         /* with the 8x8 transform: predict and add the residual
                          * for each of the four 8x8 luma blocks in turn */
2527                     if(IS_8x8DCT(mb_type)){
2528                         for(i=0; i<16; i+=4){
2529                             uint8_t * const ptr= dest_y + block_offset[i];
2530                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2531                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2532                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2533                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2534                             if(nnz){
2535                                 if(nnz == 1 && h->mb[i*16])
2536                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2537                                 else
2538                                     idct_add(ptr, h->mb + i*16, linesize);
2539                             }
2540                         }
2541                     }else
                         /* otherwise: predict and add the residual for each of
                          * the sixteen 4x4 luma blocks in turn */
2542                     for(i=0; i<16; i++){
2543                         uint8_t * const ptr= dest_y + block_offset[i];
2544                         uint8_t *topright;
2545                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2546                         int nnz, tr;
2547
                             /* only these two modes read top-right samples; if
                              * they are unavailable, the sample at
                              * ptr[3 - linesize] is replicated four times */
2548                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2549                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2550                             assert(mb_y || linesize <= block_offset[i]);
2551                             if(!topright_avail){
2552                                 tr= ptr[3 - linesize]*0x01010101;
2553                                 topright= (uint8_t*) &tr;
2554                             }else
2555                                 topright= ptr + 4 - linesize;
2556                         }else
2557                             topright= NULL;
2558
2559                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2560                         nnz = h->non_zero_count_cache[ scan8[i] ];
2561                         if(nnz){
2562                             if(is_h264){
2563                                 if(nnz == 1 && h->mb[i*16])
2564                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2565                                 else
2566                                     idct_add(ptr, h->mb + i*16, linesize);
2567                             }else
2568                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2569                         }
2570                     }
2571                 }
2572             }else{
                     /* other intra types: 16x16 prediction, then the luma DC
                      * dequant/transform; the AC residual is added further down */
2573                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2574                 if(is_h264){
2575                     if(!transform_bypass)
2576                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2577                 }else
2578                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2579             }
                 /* second border exchange of the pair started above (xchg=0) */
2580             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2581                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2582         }else if(is_h264){
                 /* non-intra macroblock: motion compensation */
2583             hl_motion(h, dest_y, dest_cb, dest_cr,
2584                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2585                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2586                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2587         }
2588
2589
             /* Luma residual for everything except intra 4x4, which was already
              * handled block by block above. */
2590         if(!IS_INTRA4x4(mb_type)){
2591             if(is_h264){
2592                 if(IS_INTRA16x16(mb_type)){
2593                     for(i=0; i<16; i++){
2594                         if(h->non_zero_count_cache[ scan8[i] ])
2595                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2596                         else if(h->mb[i*16])
2597                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2598                     }
2599                 }else{
                         /* step over four 4x4 indices at a time for 8x8 blocks */
2600                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2601                     for(i=0; i<16; i+=di){
2602                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2603                         if(nnz){
2604                             if(nnz==1 && h->mb[i*16])
2605                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2606                             else
2607                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2608                         }
2609                     }
2610                 }
2611             }else{
2612                 for(i=0; i<16; i++){
2613                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2614                         uint8_t * const ptr= dest_y + block_offset[i];
2615                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2616                     }
2617                 }
2618             }
2619         }
2620
             /* Chroma residual: blocks 16..19 go to Cb and 20..23 to Cr
              * (selected by bit 2 of i). */
2621         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2622             uint8_t *dest[2] = {dest_cb, dest_cr};
2623             if(transform_bypass){
2624                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2625             }else{
2626                 idct_add = s->dsp.h264_idct_add;
2627                 idct_dc_add = s->dsp.h264_idct_dc_add;
                     /* chroma DC for Cb then Cr; the dequant table index
                      * depends on whether the macroblock is intra */
2628                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2629                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2630             }
2631             if(is_h264){
2632                 for(i=16; i<16+8; i++){
2633                     if(h->non_zero_count_cache[ scan8[i] ])
2634                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2635                     else if(h->mb[i*16])
2636                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2637                 }
2638             }else{
2639                 for(i=16; i<16+8; i++){
2640                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2641                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2642                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2643                     }
2644                 }
2645             }
2646         }
2647     }
         /* Deblocking. In MBAFF nothing happens until the bottom macroblock of
          * a pair has been reconstructed; then the pair's borders are backed up
          * and both macroblocks are filtered, top first. Otherwise the single
          * macroblock is backed up and filtered immediately. */
2648     if(h->deblocking_filter) {
2649         if (!simple && FRAME_MBAFF) {
2650             //FIXME try deblocking one mb at a time?
2651             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 /* these locals shadow the outer mb_y / mb_xy and refer to the
                  * row above the current one */
2652             const int mb_y = s->mb_y - 1;
2653             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2654             const int mb_xy= mb_x + mb_y*s->mb_stride;
2655             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2656             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2657             if (!bottom) return;
2658             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2659             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2660             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2661
2662             if(IS_INTRA(mb_type_top | mb_type_bottom))
2663                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2664
2665             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2666             // deblock a pair
2667             // top
                 /* temporarily point the context at the top macroblock */
2668             s->mb_y--; h->mb_xy -= s->mb_stride;
2669             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2670             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2671             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2672             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2673             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2674             // bottom
                 /* restore the context position to the bottom macroblock */
2675             s->mb_y++; h->mb_xy += s->mb_stride;
2676             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2677             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2678             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2679             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2680             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2681         } else {
2682             tprintf(h->s.avctx, "call filter_mb\n");
                 /* save the borders before filtering modifies them */
2683             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2684             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2685             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2686             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2687             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2688         }
2689     }
2690 }
2691
2692 /**
2693  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2694  */
2695 static void hl_decode_mb_simple(H264Context *h){
2696     hl_decode_mb_internal(h, 1);
2697 }
2698
2699 /**
2700  * Process a macroblock; this handles edge cases, such as interlacing.
2701  */
2702 static void av_noinline hl_decode_mb_complex(H264Context *h){
2703     hl_decode_mb_internal(h, 0);
2704 }
2705
2706 static void hl_decode_mb(H264Context *h){
2707     MpegEncContext * const s = &h->s;
2708     const int mb_xy= h->mb_xy;
2709     const int mb_type= s->current_picture.mb_type[mb_xy];
2710     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2711                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2712
2713     if(ENABLE_H264_ENCODER && !s->decode)
2714         return;
2715
2716     if (is_complex)
2717         hl_decode_mb_complex(h);
2718     else hl_decode_mb_simple(h);
2719 }
2720
2721 static void pic_as_field(Picture *pic, const int parity){
2722     int i;
2723     for (i = 0; i < 4; ++i) {
2724         if (parity == PICT_BOTTOM_FIELD)
2725             pic->data[i] += pic->linesize[i];
2726         pic->reference = parity;
2727         pic->linesize[i] *= 2;
2728     }
2729     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2730 }
2731
2732 static int split_field_copy(Picture *dest, Picture *src,
2733                             int parity, int id_add){
2734     int match = !!(src->reference & parity);
2735
2736     if (match) {
2737         *dest = *src;
2738         if(parity != PICT_FRAME){
2739             pic_as_field(dest, parity);
2740             dest->pic_id *= 2;
2741             dest->pic_id += id_add;
2742         }
2743     }
2744
2745     return match;
2746 }
2747
2748 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2749     int i[2]={0};
2750     int index=0;
2751
2752     while(i[0]<len || i[1]<len){
2753         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2754             i[0]++;
2755         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2756             i[1]++;
2757         if(i[0] < len){
2758             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2759             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2760         }
2761         if(i[1] < len){
2762             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2763             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2764         }
2765     }
2766
2767     return index;
2768 }
2769
2770 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2771     int i, best_poc;
2772     int out_i= 0;
2773
2774     for(;;){
2775         best_poc= dir ? INT_MIN : INT_MAX;
2776
2777         for(i=0; i<len; i++){
2778             const int poc= src[i]->poc;
2779             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2780                 best_poc= poc;
2781                 sorted[out_i]= src[i];
2782             }
2783         }
2784         if(best_poc == (dir ? INT_MIN : INT_MAX))
2785             break;
2786         limit= sorted[out_i++]->poc - dir;
2787     }
2788     return out_i;
2789 }
2790
2791 /**
2792  * fills the default_ref_list.
2793  */
2794 static int fill_default_ref_list(H264Context *h){
2795     MpegEncContext * const s = &h->s;
2796     int i, len;
2797
2798     if(h->slice_type_nos==FF_B_TYPE){
2799         Picture *sorted[32];
2800         int cur_poc, list;
2801         int lens[2];
2802
2803         if(FIELD_PICTURE)
2804             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2805         else
2806             cur_poc= s->current_picture_ptr->poc;
2807
2808         for(list= 0; list<2; list++){
2809             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2810             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2811             assert(len<=32);
2812             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2813             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2814             assert(len<=32);
2815
2816             if(len < h->ref_count[list])
2817                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2818             lens[list]= len;
2819         }
2820
2821         if(lens[0] == lens[1] && lens[1] > 1){
2822             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2823             if(i == lens[0])
2824                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2825         }
2826     }else{
2827         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2828         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2829         assert(len <= 32);
2830         if(len < h->ref_count[0])
2831             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2832     }
2833 #ifdef TRACE
2834     for (i=0; i<h->ref_count[0]; i++) {
2835         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2836     }
2837     if(h->slice_type_nos==FF_B_TYPE){
2838         for (i=0; i<h->ref_count[1]; i++) {
2839             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2840         }
2841     }
2842 #endif
2843     return 0;
2844 }
2845
2846 static void print_short_term(H264Context *h);
2847 static void print_long_term(H264Context *h);
2848
2849 /**
2850  * Extract structure information about the picture described by pic_num in
2851  * the current decoding context (frame or field). Note that pic_num is
2852  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2853  * @param pic_num picture number for which to extract structure information
2854  * @param structure one of PICT_XXX describing structure of picture
2855  *                      with pic_num
2856  * @return frame number (short term) or long term index of picture
2857  *         described by pic_num
2858  */
2859 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2860     MpegEncContext * const s = &h->s;
2861
2862     *structure = s->picture_structure;
2863     if(FIELD_PICTURE){
2864         if (!(pic_num & 1))
2865             /* opposite field */
2866             *structure ^= PICT_FRAME;
2867         pic_num >>= 1;
2868     }
2869
2870     return pic_num;
2871 }
2872
2873 static int decode_ref_pic_list_reordering(H264Context *h){
2874     MpegEncContext * const s = &h->s;
2875     int list, index, pic_structure;
2876
2877     print_short_term(h);
2878     print_long_term(h);
2879
2880     for(list=0; list<h->list_count; list++){
2881         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2882
2883         if(get_bits1(&s->gb)){
2884             int pred= h->curr_pic_num;
2885
2886             for(index=0; ; index++){
2887                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2888                 unsigned int pic_id;
2889                 int i;
2890                 Picture *ref = NULL;
2891
2892                 if(reordering_of_pic_nums_idc==3)
2893                     break;
2894
2895                 if(index >= h->ref_count[list]){
2896                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2897                     return -1;
2898                 }
2899
2900                 if(reordering_of_pic_nums_idc<3){
2901                     if(reordering_of_pic_nums_idc<2){
2902                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2903                         int frame_num;
2904
2905                         if(abs_diff_pic_num > h->max_pic_num){
2906                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2907                             return -1;
2908                         }
2909
2910                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2911                         else                                pred+= abs_diff_pic_num;
2912                         pred &= h->max_pic_num - 1;
2913
2914                         frame_num = pic_num_extract(h, pred, &pic_structure);
2915
2916                         for(i= h->short_ref_count-1; i>=0; i--){
2917                             ref = h->short_ref[i];
2918                             assert(ref->reference);
2919                             assert(!ref->long_ref);
2920                             if(
2921                                    ref->frame_num == frame_num &&
2922                                    (ref->reference & pic_structure)
2923                               )
2924                                 break;
2925                         }
2926                         if(i>=0)
2927                             ref->pic_id= pred;
2928                     }else{
2929                         int long_idx;
2930                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2931
2932                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2933
2934                         if(long_idx>31){
2935                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2936                             return -1;
2937                         }
2938                         ref = h->long_ref[long_idx];
2939                         assert(!(ref && !ref->reference));
2940                         if(ref && (ref->reference & pic_structure)){
2941                             ref->pic_id= pic_id;
2942                             assert(ref->long_ref);
2943                             i=0;
2944                         }else{
2945                             i=-1;
2946                         }
2947                     }
2948
2949                     if (i < 0) {
2950                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2951                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2952                     } else {
2953                         for(i=index; i+1<h->ref_count[list]; i++){
2954                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2955                                 break;
2956                         }
2957                         for(; i > index; i--){
2958                             h->ref_list[list][i]= h->ref_list[list][i-1];
2959                         }
2960                         h->ref_list[list][index]= *ref;
2961                         if (FIELD_PICTURE){
2962                             pic_as_field(&h->ref_list[list][index], pic_structure);
2963                         }
2964                     }
2965                 }else{
2966                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2967                     return -1;
2968                 }
2969             }
2970         }
2971     }
2972     for(list=0; list<h->list_count; list++){
2973         for(index= 0; index < h->ref_count[list]; index++){
2974             if(!h->ref_list[list][index].data[0]){
2975                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2976                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2977             }
2978         }
2979     }
2980
2981     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
2982         direct_dist_scale_factor(h);
2983     direct_ref_list_init(h);
2984     return 0;
2985 }
2986
2987 static void fill_mbaff_ref_list(H264Context *h){
2988     int list, i, j;
2989     for(list=0; list<2; list++){ //FIXME try list_count
2990         for(i=0; i<h->ref_count[list]; i++){
2991             Picture *frame = &h->ref_list[list][i];
2992             Picture *field = &h->ref_list[list][16+2*i];
2993             field[0] = *frame;
2994             for(j=0; j<3; j++)
2995                 field[0].linesize[j] <<= 1;
2996             field[0].reference = PICT_TOP_FIELD;
2997             field[1] = field[0];
2998             for(j=0; j<3; j++)
2999                 field[1].data[j] += frame->linesize[j];
3000             field[1].reference = PICT_BOTTOM_FIELD;
3001
3002             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3003             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3004             for(j=0; j<2; j++){
3005                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3006                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3007             }
3008         }
3009     }
3010     for(j=0; j<h->ref_count[1]; j++){
3011         for(i=0; i<h->ref_count[0]; i++)
3012             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3013         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3014         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3015     }
3016 }
3017
3018 static int pred_weight_table(H264Context *h){
3019     MpegEncContext * const s = &h->s;
3020     int list, i;
3021     int luma_def, chroma_def;
3022
3023     h->use_weight= 0;
3024     h->use_weight_chroma= 0;
3025     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3026     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3027     luma_def = 1<<h->luma_log2_weight_denom;
3028     chroma_def = 1<<h->chroma_log2_weight_denom;
3029
3030     for(list=0; list<2; list++){
3031         for(i=0; i<h->ref_count[list]; i++){
3032             int luma_weight_flag, chroma_weight_flag;
3033
3034             luma_weight_flag= get_bits1(&s->gb);
3035             if(luma_weight_flag){
3036                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3037                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3038                 if(   h->luma_weight[list][i] != luma_def
3039                    || h->luma_offset[list][i] != 0)
3040                     h->use_weight= 1;
3041             }else{
3042                 h->luma_weight[list][i]= luma_def;
3043                 h->luma_offset[list][i]= 0;
3044             }
3045
3046             if(CHROMA){
3047                 chroma_weight_flag= get_bits1(&s->gb);
3048                 if(chroma_weight_flag){
3049                     int j;
3050                     for(j=0; j<2; j++){
3051                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3052                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3053                         if(   h->chroma_weight[list][i][j] != chroma_def
3054                         || h->chroma_offset[list][i][j] != 0)
3055                             h->use_weight_chroma= 1;
3056                     }
3057                 }else{
3058                     int j;
3059                     for(j=0; j<2; j++){
3060                         h->chroma_weight[list][i][j]= chroma_def;
3061                         h->chroma_offset[list][i][j]= 0;
3062                     }
3063                 }
3064             }
3065         }
3066         if(h->slice_type_nos != FF_B_TYPE) break;
3067     }
3068     h->use_weight= h->use_weight || h->use_weight_chroma;
3069     return 0;
3070 }
3071
3072 static void implicit_weight_table(H264Context *h){
3073     MpegEncContext * const s = &h->s;
3074     int ref0, ref1;
3075     int cur_poc = s->current_picture_ptr->poc;
3076
3077     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3078        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3079         h->use_weight= 0;
3080         h->use_weight_chroma= 0;
3081         return;
3082     }
3083
3084     h->use_weight= 2;
3085     h->use_weight_chroma= 2;
3086     h->luma_log2_weight_denom= 5;
3087     h->chroma_log2_weight_denom= 5;
3088
3089     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3090         int poc0 = h->ref_list[0][ref0].poc;
3091         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3092             int poc1 = h->ref_list[1][ref1].poc;
3093             int td = av_clip(poc1 - poc0, -128, 127);
3094             if(td){
3095                 int tb = av_clip(cur_poc - poc0, -128, 127);
3096                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3097                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3098                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3099                     h->implicit_weight[ref0][ref1] = 32;
3100                 else
3101                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3102             }else
3103                 h->implicit_weight[ref0][ref1] = 32;
3104         }
3105     }
3106 }
3107
3108 /**
3109  * Mark a picture as no longer needed for reference. The refmask
3110  * argument allows unreferencing of individual fields or the whole frame.
3111  * If the picture becomes entirely unreferenced, but is being held for
3112  * display purposes, it is marked as such.
3113  * @param refmask mask of fields to unreference; the mask is bitwise
3114  *                anded with the reference marking of pic
3115  * @return non-zero if pic becomes entirely unreferenced (except possibly
3116  *         for display purposes) zero if one of the fields remains in
3117  *         reference
3118  */
3119 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3120     int i;
3121     if (pic->reference &= refmask) {
3122         return 0;
3123     } else {
3124         for(i = 0; h->delayed_pic[i]; i++)
3125             if(pic == h->delayed_pic[i]){
3126                 pic->reference=DELAYED_PIC_REF;
3127                 break;
3128             }
3129         return 1;
3130     }
3131 }
3132
3133 /**
3134  * instantaneous decoder refresh.
3135  */
3136 static void idr(H264Context *h){
3137     int i;
3138
3139     for(i=0; i<16; i++){
3140         remove_long(h, i, 0);
3141     }
3142     assert(h->long_ref_count==0);
3143
3144     for(i=0; i<h->short_ref_count; i++){
3145         unreference_pic(h, h->short_ref[i], 0);
3146         h->short_ref[i]= NULL;
3147     }
3148     h->short_ref_count=0;
3149     h->prev_frame_num= 0;
3150     h->prev_frame_num_offset= 0;
3151     h->prev_poc_msb=
3152     h->prev_poc_lsb= 0;
3153 }
3154
3155 /* forget old pics after a seek */
3156 static void flush_dpb(AVCodecContext *avctx){
3157     H264Context *h= avctx->priv_data;
3158     int i;
3159     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3160         if(h->delayed_pic[i])
3161             h->delayed_pic[i]->reference= 0;
3162         h->delayed_pic[i]= NULL;
3163     }
3164     h->outputed_poc= INT_MIN;
3165     idr(h);
3166     if(h->s.current_picture_ptr)
3167         h->s.current_picture_ptr->reference= 0;
3168     h->s.first_field= 0;
3169     ff_mpeg_flush(avctx);
3170 }
3171
3172 /**
3173  * Find a Picture in the short term reference list by frame number.
3174  * @param frame_num frame number to search for
3175  * @param idx the index into h->short_ref where returned picture is found
3176  *            undefined if no picture found.
3177  * @return pointer to the found picture, or NULL if no pic with the provided
3178  *                 frame number is found
3179  */
3180 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3181     MpegEncContext * const s = &h->s;
3182     int i;
3183
3184     for(i=0; i<h->short_ref_count; i++){
3185         Picture *pic= h->short_ref[i];
3186         if(s->avctx->debug&FF_DEBUG_MMCO)
3187             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3188         if(pic->frame_num == frame_num) {
3189             *idx = i;
3190             return pic;
3191         }
3192     }
3193     return NULL;
3194 }
3195
3196 /**
3197  * Remove a picture from the short term reference list by its index in
3198  * that list.  This does no checking on the provided index; it is assumed
3199  * to be valid. Other list entries are shifted down.
3200  * @param i index into h->short_ref of picture to remove.
3201  */
3202 static void remove_short_at_index(H264Context *h, int i){
3203     assert(i >= 0 && i < h->short_ref_count);
3204     h->short_ref[i]= NULL;
3205     if (--h->short_ref_count)
3206         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3207 }
3208
3209 /**
3210  *
3211  * @return the removed picture or NULL if an error occurs
3212  */
3213 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3214     MpegEncContext * const s = &h->s;
3215     Picture *pic;
3216     int i;
3217
3218     if(s->avctx->debug&FF_DEBUG_MMCO)
3219         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3220
3221     pic = find_short(h, frame_num, &i);
3222     if (pic){
3223         if(unreference_pic(h, pic, ref_mask))
3224         remove_short_at_index(h, i);
3225     }
3226
3227     return pic;
3228 }
3229
3230 /**
3231  * Remove a picture from the long term reference list by its index in
3232  * that list.
3233  * @return the removed picture or NULL if an error occurs
3234  */
3235 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3236     Picture *pic;
3237
3238     pic= h->long_ref[i];
3239     if (pic){
3240         if(unreference_pic(h, pic, ref_mask)){
3241             assert(h->long_ref[i]->long_ref == 1);
3242             h->long_ref[i]->long_ref= 0;
3243             h->long_ref[i]= NULL;
3244             h->long_ref_count--;
3245         }
3246     }
3247
3248     return pic;
3249 }
3250
3251 /**
3252  * print short term list
3253  */
3254 static void print_short_term(H264Context *h) {
3255     uint32_t i;
3256     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3257         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3258         for(i=0; i<h->short_ref_count; i++){
3259             Picture *pic= h->short_ref[i];
3260             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3261         }
3262     }
3263 }
3264
3265 /**
3266  * print long term list
3267  */
3268 static void print_long_term(H264Context *h) {
3269     uint32_t i;
3270     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3271         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3272         for(i = 0; i < 16; i++){
3273             Picture *pic= h->long_ref[i];
3274             if (pic) {
3275                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3276             }
3277         }
3278     }
3279 }
3280
3281 /**
3282  * Executes the reference picture marking (memory management control operations).
3283  */
3284 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3285     MpegEncContext * const s = &h->s;
3286     int i, j;
3287     int current_ref_assigned=0;
3288     Picture *pic;
3289
3290     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3291         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3292
3293     for(i=0; i<mmco_count; i++){
3294         int structure, frame_num;
3295         if(s->avctx->debug&FF_DEBUG_MMCO)
3296             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3297
3298         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3299            || mmco[i].opcode == MMCO_SHORT2LONG){
3300             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3301             pic = find_short(h, frame_num, &j);
3302             if(!pic){
3303                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3304                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3305                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3306                 continue;
3307             }
3308         }
3309
3310         switch(mmco[i].opcode){
3311         case MMCO_SHORT2UNUSED:
3312             if(s->avctx->debug&FF_DEBUG_MMCO)
3313                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3314             remove_short(h, frame_num, structure ^ PICT_FRAME);
3315             break;
3316         case MMCO_SHORT2LONG:
3317                 if (h->long_ref[mmco[i].long_arg] != pic)
3318                     remove_long(h, mmco[i].long_arg, 0);
3319
3320                 remove_short_at_index(h, j);
3321                 h->long_ref[ mmco[i].long_arg ]= pic;
3322                 if (h->long_ref[ mmco[i].long_arg ]){
3323                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3324                     h->long_ref_count++;
3325                 }
3326             break;
3327         case MMCO_LONG2UNUSED:
3328             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3329             pic = h->long_ref[j];
3330             if (pic) {
3331                 remove_long(h, j, structure ^ PICT_FRAME);
3332             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3333                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3334             break;
3335         case MMCO_LONG:
3336                     // Comment below left from previous code as it is an interresting note.
3337                     /* First field in pair is in short term list or
3338                      * at a different long term index.
3339                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3340                      * Report the problem and keep the pair where it is,
3341                      * and mark this field valid.
3342                      */
3343
3344             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3345                 remove_long(h, mmco[i].long_arg, 0);
3346
3347                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3348                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3349                 h->long_ref_count++;
3350             }
3351
3352             s->current_picture_ptr->reference |= s->picture_structure;
3353             current_ref_assigned=1;
3354             break;
3355         case MMCO_SET_MAX_LONG:
3356             assert(mmco[i].long_arg <= 16);
3357             // just remove the long term which index is greater than new max
3358             for(j = mmco[i].long_arg; j<16; j++){
3359                 remove_long(h, j, 0);
3360             }
3361             break;
3362         case MMCO_RESET:
3363             while(h->short_ref_count){
3364                 remove_short(h, h->short_ref[0]->frame_num, 0);
3365             }
3366             for(j = 0; j < 16; j++) {
3367                 remove_long(h, j, 0);
3368             }
3369             s->current_picture_ptr->poc=
3370             s->current_picture_ptr->field_poc[0]=
3371             s->current_picture_ptr->field_poc[1]=
3372             h->poc_lsb=
3373             h->poc_msb=
3374             h->frame_num=
3375             s->current_picture_ptr->frame_num= 0;
3376             break;
3377         default: assert(0);
3378         }
3379     }
3380
3381     if (!current_ref_assigned) {
3382         /* Second field of complementary field pair; the first field of
3383          * which is already referenced. If short referenced, it
3384          * should be first entry in short_ref. If not, it must exist
3385          * in long_ref; trying to put it on the short list here is an
3386          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3387          */
3388         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3389             /* Just mark the second field valid */
3390             s->current_picture_ptr->reference = PICT_FRAME;
3391         } else if (s->current_picture_ptr->long_ref) {
3392             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3393                                              "assignment for second field "
3394                                              "in complementary field pair "
3395                                              "(first field is long term)\n");
3396         } else {
3397             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3398             if(pic){
3399                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3400             }
3401
3402             if(h->short_ref_count)
3403                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3404
3405             h->short_ref[0]= s->current_picture_ptr;
3406             h->short_ref_count++;
3407             s->current_picture_ptr->reference |= s->picture_structure;
3408         }
3409     }
3410
3411     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3412
3413         /* We have too many reference frames, probably due to corrupted
3414          * stream. Need to discard one frame. Prevents overrun of the
3415          * short_ref and long_ref buffers.
3416          */
3417         av_log(h->s.avctx, AV_LOG_ERROR,
3418                "number of reference frames exceeds max (probably "
3419                "corrupt input), discarding one\n");
3420
3421         if (h->long_ref_count && !h->short_ref_count) {
3422             for (i = 0; i < 16; ++i)
3423                 if (h->long_ref[i])
3424                     break;
3425
3426             assert(i < 16);
3427             remove_long(h, i, 0);
3428         } else {
3429             pic = h->short_ref[h->short_ref_count - 1];
3430             remove_short(h, pic->frame_num, 0);
3431         }
3432     }
3433
3434     print_short_term(h);
3435     print_long_term(h);
3436     return 0;
3437 }
3438
3439 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3440     MpegEncContext * const s = &h->s;
3441     int i;
3442
3443     h->mmco_index= 0;
3444     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3445         s->broken_link= get_bits1(gb) -1;
3446         if(get_bits1(gb)){
3447             h->mmco[0].opcode= MMCO_LONG;
3448             h->mmco[0].long_arg= 0;
3449             h->mmco_index= 1;
3450         }
3451     }else{
3452         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3453             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3454                 MMCOOpcode opcode= get_ue_golomb(gb);
3455
3456                 h->mmco[i].opcode= opcode;
3457                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3458                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3459 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3460                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3461                         return -1;
3462                     }*/
3463                 }
3464                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3465                     unsigned int long_arg= get_ue_golomb(gb);
3466                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3467                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3468                         return -1;
3469                     }
3470                     h->mmco[i].long_arg= long_arg;
3471                 }
3472
3473                 if(opcode > (unsigned)MMCO_LONG){
3474                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3475                     return -1;
3476                 }
3477                 if(opcode == MMCO_END)
3478                     break;
3479             }
3480             h->mmco_index= i;
3481         }else{
3482             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3483
3484             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3485                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3486                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3487                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3488                 h->mmco_index= 1;
3489                 if (FIELD_PICTURE) {
3490                     h->mmco[0].short_pic_num *= 2;
3491                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3492                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3493                     h->mmco_index= 2;
3494                 }
3495             }
3496         }
3497     }
3498
3499     return 0;
3500 }
3501
3502 static int init_poc(H264Context *h){
3503     MpegEncContext * const s = &h->s;
3504     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3505     int field_poc[2];
3506     Picture *cur = s->current_picture_ptr;
3507
3508     h->frame_num_offset= h->prev_frame_num_offset;
3509     if(h->frame_num < h->prev_frame_num)
3510         h->frame_num_offset += max_frame_num;
3511
3512     if(h->sps.poc_type==0){
3513         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3514
3515         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3516             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3517         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3518             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3519         else
3520             h->poc_msb = h->prev_poc_msb;
3521 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3522         field_poc[0] =
3523         field_poc[1] = h->poc_msb + h->poc_lsb;
3524         if(s->picture_structure == PICT_FRAME)
3525             field_poc[1] += h->delta_poc_bottom;
3526     }else if(h->sps.poc_type==1){
3527         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3528         int i;
3529
3530         if(h->sps.poc_cycle_length != 0)
3531             abs_frame_num = h->frame_num_offset + h->frame_num;
3532         else
3533             abs_frame_num = 0;
3534
3535         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3536             abs_frame_num--;
3537
3538         expected_delta_per_poc_cycle = 0;
3539         for(i=0; i < h->sps.poc_cycle_length; i++)
3540             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3541
3542         if(abs_frame_num > 0){
3543             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3544             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3545
3546             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3547             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3548                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3549         } else
3550             expectedpoc = 0;
3551
3552         if(h->nal_ref_idc == 0)
3553             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3554
3555         field_poc[0] = expectedpoc + h->delta_poc[0];
3556         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3557
3558         if(s->picture_structure == PICT_FRAME)
3559             field_poc[1] += h->delta_poc[1];
3560     }else{
3561         int poc= 2*(h->frame_num_offset + h->frame_num);
3562
3563         if(!h->nal_ref_idc)
3564             poc--;
3565
3566         field_poc[0]= poc;
3567         field_poc[1]= poc;
3568     }
3569
3570     if(s->picture_structure != PICT_BOTTOM_FIELD)
3571         s->current_picture_ptr->field_poc[0]= field_poc[0];
3572     if(s->picture_structure != PICT_TOP_FIELD)
3573         s->current_picture_ptr->field_poc[1]= field_poc[1];
3574     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3575
3576     return 0;
3577 }
3578
3579
3580 /**
3581  * initialize scan tables
3582  */
3583 static void init_scan_tables(H264Context *h){
3584     MpegEncContext * const s = &h->s;
3585     int i;
3586     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3587         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3588         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3589     }else{
3590         for(i=0; i<16; i++){
3591 #define T(x) (x>>2) | ((x<<2) & 0xF)
3592             h->zigzag_scan[i] = T(zigzag_scan[i]);
3593             h-> field_scan[i] = T( field_scan[i]);
3594 #undef T
3595         }
3596     }
3597     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3598         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3599         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3600         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3601         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3602     }else{
3603         for(i=0; i<64; i++){
3604 #define T(x) (x>>3) | ((x&7)<<3)
3605             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3606             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3607             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3608             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3609 #undef T
3610         }
3611     }
3612     if(h->sps.transform_bypass){ //FIXME same ugly
3613         h->zigzag_scan_q0          = zigzag_scan;
3614         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3615         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3616         h->field_scan_q0           = field_scan;
3617         h->field_scan8x8_q0        = field_scan8x8;
3618         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3619     }else{
3620         h->zigzag_scan_q0          = h->zigzag_scan;
3621         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3622         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3623         h->field_scan_q0           = h->field_scan;
3624         h->field_scan8x8_q0        = h->field_scan8x8;
3625         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3626     }
3627 }
3628
3629 /**
3630  * Replicates H264 "master" context to thread contexts.
3631  */
3632 static void clone_slice(H264Context *dst, H264Context *src)
3633 {
3634     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3635     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3636     dst->s.current_picture      = src->s.current_picture;
3637     dst->s.linesize             = src->s.linesize;
3638     dst->s.uvlinesize           = src->s.uvlinesize;
3639     dst->s.first_field          = src->s.first_field;
3640
3641     dst->prev_poc_msb           = src->prev_poc_msb;
3642     dst->prev_poc_lsb           = src->prev_poc_lsb;
3643     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3644     dst->prev_frame_num         = src->prev_frame_num;
3645     dst->short_ref_count        = src->short_ref_count;
3646
3647     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3648     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3649     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3650     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3651
3652     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3653     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3654 }
3655
3656 /**
3657  * decodes a slice header.
3658  * This will also call MPV_common_init() and frame_start() as needed.
3659  *
3660  * @param h h264context
3661  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3662  *
3663  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3664  */
3665 static int decode_slice_header(H264Context *h, H264Context *h0){
3666     MpegEncContext * const s = &h->s;
3667     MpegEncContext * const s0 = &h0->s;
3668     unsigned int first_mb_in_slice;
3669     unsigned int pps_id;
3670     int num_ref_idx_active_override_flag;
3671     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3672     unsigned int slice_type, tmp, i, j;
3673     int default_ref_list_done = 0;
3674     int last_pic_structure;
3675
3676     s->dropable= h->nal_ref_idc == 0;
3677
3678     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3679         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3680         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3681     }else{
3682         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3683         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3684     }
3685
3686     first_mb_in_slice= get_ue_golomb(&s->gb);
3687
3688     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3689         h0->current_slice = 0;
3690         if (!s0->first_field)
3691             s->current_picture_ptr= NULL;
3692     }
3693
3694     slice_type= get_ue_golomb(&s->gb);
3695     if(slice_type > 9){
3696         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3697         return -1;
3698     }
3699     if(slice_type > 4){
3700         slice_type -= 5;
3701         h->slice_type_fixed=1;
3702     }else
3703         h->slice_type_fixed=0;
3704
3705     slice_type= slice_type_map[ slice_type ];
3706     if (slice_type == FF_I_TYPE
3707         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3708         default_ref_list_done = 1;
3709     }
3710     h->slice_type= slice_type;
3711     h->slice_type_nos= slice_type & 3;
3712
3713     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3714     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3715         av_log(h->s.avctx, AV_LOG_ERROR,
3716                "B picture before any references, skipping\n");
3717         return -1;
3718     }
3719
3720     pps_id= get_ue_golomb(&s->gb);
3721     if(pps_id>=MAX_PPS_COUNT){
3722         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3723         return -1;
3724     }
3725     if(!h0->pps_buffers[pps_id]) {
3726         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3727         return -1;
3728     }
3729     h->pps= *h0->pps_buffers[pps_id];
3730
3731     if(!h0->sps_buffers[h->pps.sps_id]) {
3732         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3733         return -1;
3734     }
3735     h->sps = *h0->sps_buffers[h->pps.sps_id];
3736
3737     if(h == h0 && h->dequant_coeff_pps != pps_id){
3738         h->dequant_coeff_pps = pps_id;
3739         init_dequant_tables(h);
3740     }
3741
3742     s->mb_width= h->sps.mb_width;
3743     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3744
3745     h->b_stride=  s->mb_width*4;
3746     h->b8_stride= s->mb_width*2;
3747
3748     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3749     if(h->sps.frame_mbs_only_flag)
3750         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3751     else
3752         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3753
3754     if (s->context_initialized
3755         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3756         if(h != h0)
3757             return -1;   // width / height changed during parallelized decoding
3758         free_tables(h);
3759         MPV_common_end(s);
3760     }
3761     if (!s->context_initialized) {
3762         if(h != h0)
3763             return -1;  // we cant (re-)initialize context during parallel decoding
3764         if (MPV_common_init(s) < 0)
3765             return -1;
3766         s->first_field = 0;
3767
3768         init_scan_tables(h);
3769         alloc_tables(h);
3770
3771         for(i = 1; i < s->avctx->thread_count; i++) {
3772             H264Context *c;
3773             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3774             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3775             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3776             c->sps = h->sps;
3777             c->pps = h->pps;
3778             init_scan_tables(c);
3779             clone_tables(c, h);
3780         }
3781
3782         for(i = 0; i < s->avctx->thread_count; i++)
3783             if(context_init(h->thread_context[i]) < 0)
3784                 return -1;
3785
3786         s->avctx->width = s->width;
3787         s->avctx->height = s->height;
3788         s->avctx->sample_aspect_ratio= h->sps.sar;
3789         if(!s->avctx->sample_aspect_ratio.den)
3790             s->avctx->sample_aspect_ratio.den = 1;
3791
3792         if(h->sps.timing_info_present_flag){
3793             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3794             if(h->x264_build > 0 && h->x264_build < 44)
3795                 s->avctx->time_base.den *= 2;
3796             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3797                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3798         }
3799     }
3800
3801     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3802
3803     h->mb_mbaff = 0;
3804     h->mb_aff_frame = 0;
3805     last_pic_structure = s0->picture_structure;
3806     if(h->sps.frame_mbs_only_flag){
3807         s->picture_structure= PICT_FRAME;
3808     }else{
3809         if(get_bits1(&s->gb)) { //field_pic_flag
3810             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3811         } else {
3812             s->picture_structure= PICT_FRAME;
3813             h->mb_aff_frame = h->sps.mb_aff;
3814         }
3815     }
3816     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3817
3818     if(h0->current_slice == 0){
3819         while(h->frame_num !=  h->prev_frame_num &&
3820               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3821             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3822             frame_start(h);
3823             h->prev_frame_num++;
3824             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3825             s->current_picture_ptr->frame_num= h->prev_frame_num;
3826             execute_ref_pic_marking(h, NULL, 0);
3827         }
3828
3829         /* See if we have a decoded first field looking for a pair... */
3830         if (s0->first_field) {
3831             assert(s0->current_picture_ptr);
3832             assert(s0->current_picture_ptr->data[0]);
3833             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3834
3835             /* figure out if we have a complementary field pair */
3836             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3837                 /*
3838                  * Previous field is unmatched. Don't display it, but let it
3839                  * remain for reference if marked as such.
3840                  */
3841                 s0->current_picture_ptr = NULL;
3842                 s0->first_field = FIELD_PICTURE;
3843
3844             } else {
3845                 if (h->nal_ref_idc &&
3846                         s0->current_picture_ptr->reference &&
3847                         s0->current_picture_ptr->frame_num != h->frame_num) {
3848                     /*
3849                      * This and previous field were reference, but had
3850                      * different frame_nums. Consider this field first in
3851                      * pair. Throw away previous field except for reference
3852                      * purposes.
3853                      */
3854                     s0->first_field = 1;
3855                     s0->current_picture_ptr = NULL;
3856
3857                 } else {
3858                     /* Second field in complementary pair */
3859                     s0->first_field = 0;
3860                 }
3861             }
3862
3863         } else {
3864             /* Frame or first field in a potentially complementary pair */
3865             assert(!s0->current_picture_ptr);
3866             s0->first_field = FIELD_PICTURE;
3867         }
3868
3869         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3870             s0->first_field = 0;
3871             return -1;
3872         }
3873     }
3874     if(h != h0)
3875         clone_slice(h, h0);
3876
3877     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3878
3879     assert(s->mb_num == s->mb_width * s->mb_height);
3880     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3881        first_mb_in_slice                    >= s->mb_num){
3882         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3883         return -1;
3884     }
3885     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3886     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3887     if (s->picture_structure == PICT_BOTTOM_FIELD)
3888         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3889     assert(s->mb_y < s->mb_height);
3890
3891     if(s->picture_structure==PICT_FRAME){
3892         h->curr_pic_num=   h->frame_num;
3893         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3894     }else{
3895         h->curr_pic_num= 2*h->frame_num + 1;
3896         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3897     }
3898
3899     if(h->nal_unit_type == NAL_IDR_SLICE){
3900         get_ue_golomb(&s->gb); /* idr_pic_id */
3901     }
3902
3903     if(h->sps.poc_type==0){
3904         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3905
3906         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3907             h->delta_poc_bottom= get_se_golomb(&s->gb);
3908         }
3909     }
3910
3911     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3912         h->delta_poc[0]= get_se_golomb(&s->gb);
3913
3914         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3915             h->delta_poc[1]= get_se_golomb(&s->gb);
3916     }
3917
3918     init_poc(h);
3919
3920     if(h->pps.redundant_pic_cnt_present){
3921         h->redundant_pic_count= get_ue_golomb(&s->gb);
3922     }
3923
3924     //set defaults, might be overridden a few lines later
3925     h->ref_count[0]= h->pps.ref_count[0];
3926     h->ref_count[1]= h->pps.ref_count[1];
3927
3928     if(h->slice_type_nos != FF_I_TYPE){
3929         if(h->slice_type_nos == FF_B_TYPE){
3930             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3931         }
3932         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3933
3934         if(num_ref_idx_active_override_flag){
3935             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3936             if(h->slice_type_nos==FF_B_TYPE)
3937                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3938
3939             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3940                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3941                 h->ref_count[0]= h->ref_count[1]= 1;
3942                 return -1;
3943             }
3944         }
3945         if(h->slice_type_nos == FF_B_TYPE)
3946             h->list_count= 2;
3947         else
3948             h->list_count= 1;
3949     }else
3950         h->list_count= 0;
3951
3952     if(!default_ref_list_done){
3953         fill_default_ref_list(h);
3954     }
3955
3956     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3957         return -1;
3958
3959     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3960        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3961         pred_weight_table(h);
3962     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3963         implicit_weight_table(h);
3964     else
3965         h->use_weight = 0;
3966
3967     if(h->nal_ref_idc)
3968         decode_ref_pic_marking(h0, &s->gb);
3969
3970     if(FRAME_MBAFF)
3971         fill_mbaff_ref_list(h);
3972
3973     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3974         tmp = get_ue_golomb(&s->gb);
3975         if(tmp > 2){
3976             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3977             return -1;
3978         }
3979         h->cabac_init_idc= tmp;
3980     }
3981
3982     h->last_qscale_diff = 0;
3983     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3984     if(tmp>51){
3985         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3986         return -1;
3987     }
3988     s->qscale= tmp;
3989     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3990     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3991     //FIXME qscale / qp ... stuff
3992     if(h->slice_type == FF_SP_TYPE){
3993         get_bits1(&s->gb); /* sp_for_switch_flag */
3994     }
3995     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3996         get_se_golomb(&s->gb); /* slice_qs_delta */
3997     }
3998
3999     h->deblocking_filter = 1;
4000     h->slice_alpha_c0_offset = 0;
4001     h->slice_beta_offset = 0;
4002     if( h->pps.deblocking_filter_parameters_present ) {
4003         tmp= get_ue_golomb(&s->gb);
4004         if(tmp > 2){
4005             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4006             return -1;
4007         }
4008         h->deblocking_filter= tmp;
4009         if(h->deblocking_filter < 2)
4010             h->deblocking_filter^= 1; // 1<->0
4011
4012         if( h->deblocking_filter ) {
4013             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4014             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4015         }
4016     }
4017
4018     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4019        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4020        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4021        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4022         h->deblocking_filter= 0;
4023
4024     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4025         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4026             /* Cheat slightly for speed:
4027                Do not bother to deblock across slices. */
4028             h->deblocking_filter = 2;
4029         } else {
4030             h0->max_contexts = 1;
4031             if(!h0->single_decode_warning) {
4032                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4033                 h0->single_decode_warning = 1;
4034             }
4035             if(h != h0)
4036                 return 1; // deblocking switched inside frame
4037         }
4038     }
4039
4040 #if 0 //FMO
4041     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4042         slice_group_change_cycle= get_bits(&s->gb, ?);
4043 #endif
4044
4045     h0->last_slice_type = slice_type;
4046     h->slice_num = ++h0->current_slice;
4047
4048     for(j=0; j<2; j++){
4049         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4050         ref2frm[0]=
4051         ref2frm[1]= -1;
4052         for(i=0; i<48; i++)
4053             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4054                           +(h->ref_list[j][i].reference&3);
4055     }
4056
4057     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4058     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4059
4060     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4061         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4062                h->slice_num,
4063                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4064                first_mb_in_slice,
4065                av_get_pict_type_char(h->slice_type),
4066                pps_id, h->frame_num,
4067                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4068                h->ref_count[0], h->ref_count[1],
4069                s->qscale,
4070                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4071                h->use_weight,
4072                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4073                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4074                );
4075     }
4076
4077     return 0;
4078 }
4079
4080 /**
4081  *
4082  */
4083 static inline int get_level_prefix(GetBitContext *gb){
4084     unsigned int buf;
4085     int log;
4086
4087     OPEN_READER(re, gb);
4088     UPDATE_CACHE(re, gb);
4089     buf=GET_CACHE(re, gb);
4090
4091     log= 32 - av_log2(buf);
4092 #ifdef TRACE
4093     print_bin(buf>>(32-log), log);
4094     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4095 #endif
4096
4097     LAST_SKIP_BITS(re, gb, log);
4098     CLOSE_READER(re, gb);
4099
4100     return log-1;
4101 }
4102
4103 static inline int get_dct8x8_allowed(H264Context *h){
4104     int i;
4105     for(i=0; i<4; i++){
4106         if(!IS_SUB_8X8(h->sub_mb_type[i])
4107            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4108             return 0;
4109     }
4110     return 1;
4111 }
4112
4113 /**
4114  * decodes a residual block.
4115  * @param n block index
4116  * @param scantable scantable
4117  * @param max_coeff number of coefficients in the block
4118  * @return <0 if an error occurred
4119  */
4120 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4121     MpegEncContext * const s = &h->s;
4122     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4123     int level[16];
4124     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4125
4126     //FIXME put trailing_onex into the context
4127
4128     if(n == CHROMA_DC_BLOCK_INDEX){
4129         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4130         total_coeff= coeff_token>>2;
4131     }else{
4132         if(n == LUMA_DC_BLOCK_INDEX){
4133             total_coeff= pred_non_zero_count(h, 0);
4134             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4135             total_coeff= coeff_token>>2;
4136         }else{
4137             total_coeff= pred_non_zero_count(h, n);
4138             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4139             total_coeff= coeff_token>>2;
4140             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4141         }
4142     }
4143
4144     //FIXME set last_non_zero?
4145
4146     if(total_coeff==0)
4147         return 0;
4148     if(total_coeff > (unsigned)max_coeff) {
4149         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4150         return -1;
4151     }
4152
4153     trailing_ones= coeff_token&3;
4154     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4155     assert(total_coeff<=16);
4156
4157     for(i=0; i<trailing_ones; i++){
4158         level[i]= 1 - 2*get_bits1(gb);
4159     }
4160
4161     if(i<total_coeff) {
4162         int level_code, mask;
4163         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4164         int prefix= get_level_prefix(gb);
4165
4166         //first coefficient has suffix_length equal to 0 or 1
4167         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4168             if(suffix_length)
4169                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4170             else
4171                 level_code= (prefix<<suffix_length); //part
4172         }else if(prefix==14){
4173             if(suffix_length)
4174                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4175             else
4176                 level_code= prefix + get_bits(gb, 4); //part
4177         }else{
4178             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4179             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4180             if(prefix>=16)
4181                 level_code += (1<<(prefix-3))-4096;
4182         }
4183
4184         if(trailing_ones < 3) level_code += 2;
4185
4186         suffix_length = 1;
4187         if(level_code > 5)
4188             suffix_length++;
4189         mask= -(level_code&1);
4190         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4191         i++;
4192
4193         //remaining coefficients have suffix_length > 0
4194         for(;i<total_coeff;i++) {
4195             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4196             prefix = get_level_prefix(gb);
4197             if(prefix<15){
4198                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4199             }else{
4200                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4201                 if(prefix>=16)
4202                     level_code += (1<<(prefix-3))-4096;
4203             }
4204             mask= -(level_code&1);
4205             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4206             if(level_code > suffix_limit[suffix_length])
4207                 suffix_length++;
4208         }
4209     }
4210
4211     if(total_coeff == max_coeff)
4212         zeros_left=0;
4213     else{
4214         if(n == CHROMA_DC_BLOCK_INDEX)
4215             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4216         else
4217             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4218     }
4219
4220     coeff_num = zeros_left + total_coeff - 1;
4221     j = scantable[coeff_num];
4222     if(n > 24){
4223         block[j] = level[0];
4224         for(i=1;i<total_coeff;i++) {
4225             if(zeros_left <= 0)
4226                 run_before = 0;
4227             else if(zeros_left < 7){
4228                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4229             }else{
4230                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4231             }
4232             zeros_left -= run_before;
4233             coeff_num -= 1 + run_before;
4234             j= scantable[ coeff_num ];
4235
4236             block[j]= level[i];
4237         }
4238     }else{
4239         block[j] = (level[0] * qmul[j] + 32)>>6;
4240         for(i=1;i<total_coeff;i++) {
4241             if(zeros_left <= 0)
4242                 run_before = 0;
4243             else if(zeros_left < 7){
4244                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4245             }else{
4246                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4247             }
4248             zeros_left -= run_before;
4249             coeff_num -= 1 + run_before;
4250             j= scantable[ coeff_num ];
4251
4252             block[j]= (level[i] * qmul[j] + 32)>>6;
4253         }
4254     }
4255
4256     if(zeros_left<0){
4257         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4258         return -1;
4259     }
4260
4261     return 0;
4262 }
4263
4264 static void predict_field_decoding_flag(H264Context *h){
4265     MpegEncContext * const s = &h->s;
4266     const int mb_xy= h->mb_xy;
4267     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4268                 ? s->current_picture.mb_type[mb_xy-1]
4269                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4270                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4271                 : 0;
4272     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4273 }
4274
4275 /**
4276  * decodes a P_SKIP or B_SKIP macroblock
4277  */
4278 static void decode_mb_skip(H264Context *h){
4279     MpegEncContext * const s = &h->s;
4280     const int mb_xy= h->mb_xy;
4281     int mb_type=0;
4282
4283     memset(h->non_zero_count[mb_xy], 0, 16);
4284     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4285
4286     if(MB_FIELD)
4287         mb_type|= MB_TYPE_INTERLACED;
4288
4289     if( h->slice_type_nos == FF_B_TYPE )
4290     {
4291         // just for fill_caches. pred_direct_motion will set the real mb_type
4292         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4293
4294         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4295         pred_direct_motion(h, &mb_type);
4296         mb_type|= MB_TYPE_SKIP;
4297     }
4298     else
4299     {
4300         int mx, my;
4301         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4302
4303         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4304         pred_pskip_motion(h, &mx, &my);
4305         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4306         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4307     }
4308
4309     write_back_motion(h, mb_type);
4310     s->current_picture.mb_type[mb_xy]= mb_type;
4311     s->current_picture.qscale_table[mb_xy]= s->qscale;
4312     h->slice_table[ mb_xy ]= h->slice_num;
4313     h->prev_mb_skipped= 1;
4314 }
4315
4316 /**
4317  * Decodes one macroblock of a CAVLC coded slice at (s->mb_x, s->mb_y).
      * Parsing order: skip run / MBAFF field flag, mb_type, prediction data
      * (intra prediction modes, or reference indices followed by motion vector
      * differences), coded block pattern, optional 8x8 transform flag, dquant
      * and finally the residual blocks via decode_residual().
      * Skipped macroblocks are delegated to decode_mb_skip().
4318  * @returns 0 if OK, -1 if an error is noticed (every error path in this
      *          function returns -1)
4319  */
4320 static int decode_mb_cavlc(H264Context *h){
4321     MpegEncContext * const s = &h->s;
4322     int mb_xy;
4323     int partition_count;
4324     unsigned int mb_type, cbp;
4325     int dct8x8_allowed= h->pps.transform_8x8_mode;
4326
         // linear index of this macroblock, also cached in the context
4327     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4328
4329     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4330
4331     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4332     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4333                 down the code */
         // Non-intra slices code runs of skipped macroblocks as a single count;
         // mb_skip_run == -1 means no count is pending and one must be read.
4334     if(h->slice_type_nos != FF_I_TYPE){
4335         if(s->mb_skip_run==-1)
4336             s->mb_skip_run= get_ue_golomb(&s->gb);
4337
4338         if (s->mb_skip_run--) {
                 // MBAFF, even mb_y: the field flag is read from the bitstream when
                 // this is the last skipped MB of the run, otherwise it is predicted
4339             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4340                 if(s->mb_skip_run==0)
4341                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4342                 else
4343                     predict_field_decoding_flag(h);
4344             }
4345             decode_mb_skip(h);
4346             return 0;
4347         }
4348     }
4349     if(FRAME_MBAFF){
4350         if( (s->mb_y&1) == 0 )
4351             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4352     }
4353
4354     h->prev_mb_skipped= 0;
4355
         // mb_type is coded relative to the slice type: values below 23 (B) or
         // 5 (P) select an inter type from the table, larger values are intra
         // types offset by that amount.
4356     mb_type= get_ue_golomb(&s->gb);
4357     if(h->slice_type_nos == FF_B_TYPE){
4358         if(mb_type < 23){
4359             partition_count= b_mb_type_info[mb_type].partition_count;
4360             mb_type=         b_mb_type_info[mb_type].type;
4361         }else{
4362             mb_type -= 23;
4363             goto decode_intra_mb;
4364         }
4365     }else if(h->slice_type_nos == FF_P_TYPE){
4366         if(mb_type < 5){
4367             partition_count= p_mb_type_info[mb_type].partition_count;
4368             mb_type=         p_mb_type_info[mb_type].type;
4369         }else{
4370             mb_type -= 5;
4371             goto decode_intra_mb;
4372         }
4373     }else{
4374        assert(h->slice_type_nos == FF_I_TYPE);
4375         if(h->slice_type == FF_SI_TYPE && mb_type)
4376             mb_type--;
4377 decode_intra_mb:
             // range check before mb_type is used as an index into i_mb_type_info
4378         if(mb_type > 25){
4379             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4380             return -1;
4381         }
4382         partition_count=0;
4383         cbp= i_mb_type_info[mb_type].cbp;
4384         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4385         mb_type= i_mb_type_info[mb_type].type;
4386     }
4387
4388     if(MB_FIELD)
4389         mb_type |= MB_TYPE_INTERLACED;
4390
4391     h->slice_table[ mb_xy ]= h->slice_num;
4392
         // PCM macroblocks carry raw samples; this path returns before any
         // cache is filled and before ref_count is touched.
4393     if(IS_INTRA_PCM(mb_type)){
4394         unsigned int x;
4395
4396         // We assume these blocks are very rare so we do not optimize it.
4397         align_get_bits(&s->gb);
4398
4399         // The pixels are stored in the same order as levels in h->mb array.
4400         for(x=0; x < (CHROMA ? 384 : 256); x++){
4401             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4402         }
4403
4404         // In deblocking, the quantizer is 0
4405         s->current_picture.qscale_table[mb_xy]= 0;
4406         // All coeffs are present
4407         memset(h->non_zero_count[mb_xy], 16, 16);
4408
4409         s->current_picture.mb_type[mb_xy]= mb_type;
4410         return 0;
4411     }
4412
         // ref_count is doubled for MBAFF macroblocks here and halved again at
         // the very end of the function.
         // NOTE(review): the `return -1` error paths in between leave it doubled -
         // confirm that callers re-initialise ref_count after an error.
4413     if(MB_MBAFF){
4414         h->ref_count[0] <<= 1;
4415         h->ref_count[1] <<= 1;
4416     }
4417
4418     fill_caches(h, mb_type, 0);
4419
4420     //mb_pred
4421     if(IS_INTRA(mb_type)){
4422         int pred_mode;
4423 //            init_top_left_availability(h);
4424         if(IS_INTRA4x4(mb_type)){
4425             int i;
4426             int di = 1;
                 // a 1-bit flag selects the 8x8 transform; in that case one
                 // prediction mode is read per group of 4 blocks (di = 4)
4427             if(dct8x8_allowed && get_bits1(&s->gb)){
4428                 mb_type |= MB_TYPE_8x8DCT;
4429                 di = 4;
4430             }
4431
4432 //                fill_intra4x4_pred_table(h);
4433             for(i=0; i<16; i+=di){
4434                 int mode= pred_intra_mode(h, i);
4435
                     // bit set: keep the predicted mode; otherwise a 3-bit
                     // remainder follows that skips over the predicted value
4436                 if(!get_bits1(&s->gb)){
4437                     const int rem_mode= get_bits(&s->gb, 3);
4438                     mode = rem_mode + (rem_mode >= mode);
4439                 }
4440
4441                 if(di==4)
4442                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4443                 else
4444                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4445             }
4446             write_back_intra_pred_mode(h);
4447             if( check_intra4x4_pred_mode(h) < 0)
4448                 return -1;
4449         }else{
4450             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4451             if(h->intra16x16_pred_mode < 0)
4452                 return -1;
4453         }
4454         if(CHROMA){
4455             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4456             if(pred_mode < 0)
4457                 return -1;
4458             h->chroma_pred_mode= pred_mode;
4459         }
4460     }else if(partition_count==4){
             // four sub-macroblocks, each with its own sub_mb_type
4461         int i, j, sub_partition_count[4], list, ref[2][4];
4462
4463         if(h->slice_type_nos == FF_B_TYPE){
4464             for(i=0; i<4; i++){
4465                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4466                 if(h->sub_mb_type[i] >=13){
4467                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4468                     return -1;
4469                 }
4470                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4471                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4472             }
4473             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4474                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4475                 pred_direct_motion(h, &mb_type);
4476                 h->ref_cache[0][scan8[4]] =
4477                 h->ref_cache[1][scan8[4]] =
4478                 h->ref_cache[0][scan8[12]] =
4479                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4480             }
4481         }else{
4482             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4483             for(i=0; i<4; i++){
4484                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4485                 if(h->sub_mb_type[i] >=4){
4486                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4487                     return -1;
4488                 }
4489                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4490                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4491             }
4492         }
4493
             // first pass: reference indices of all sub-macroblocks, list by list
4494         for(list=0; list<h->list_count; list++){
4495             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4496             for(i=0; i<4; i++){
4497                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4498                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4499                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4500                     if(tmp>=ref_count){
4501                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4502                         return -1;
4503                     }
4504                     ref[list][i]= tmp;
4505                 }else{
4506                  //FIXME
4507                     ref[list][i] = -1;
4508                 }
4509             }
4510         }
4511
4512         if(dct8x8_allowed)
4513             dct8x8_allowed = get_dct8x8_allowed(h);
4514
             // second pass: motion vectors (prediction + coded difference) for
             // every sub-partition, written straight into mv_cache
4515         for(list=0; list<h->list_count; list++){
4516             for(i=0; i<4; i++){
4517                 if(IS_DIRECT(h->sub_mb_type[i])) {
4518                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4519                     continue;
4520                 }
4521                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4522                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4523
4524                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4525                     const int sub_mb_type= h->sub_mb_type[i];
4526                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4527                     for(j=0; j<sub_partition_count[i]; j++){
4528                         int mx, my;
4529                         const int index= 4*i + block_width*j;
4530                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4531                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4532                         mx += get_se_golomb(&s->gb);
4533                         my += get_se_golomb(&s->gb);
4534                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4535
                             // replicate the vector over the cache entries covered
                             // by this sub-partition shape
4536                         if(IS_SUB_8X8(sub_mb_type)){
4537                             mv_cache[ 1 ][0]=
4538                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4539                             mv_cache[ 1 ][1]=
4540                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4541                         }else if(IS_SUB_8X4(sub_mb_type)){
4542                             mv_cache[ 1 ][0]= mx;
4543                             mv_cache[ 1 ][1]= my;
4544                         }else if(IS_SUB_4X8(sub_mb_type)){
4545                             mv_cache[ 8 ][0]= mx;
4546                             mv_cache[ 8 ][1]= my;
4547                         }
4548                         mv_cache[ 0 ][0]= mx;
4549                         mv_cache[ 0 ][1]= my;
4550                     }
4551                 }else{
                         // list not used by this sub-macroblock: zero its four vectors
4552                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4553                     p[0] = p[1]=
4554                     p[8] = p[9]= 0;
4555                 }
4556             }
4557         }
4558     }else if(IS_DIRECT(mb_type)){
4559         pred_direct_motion(h, &mb_type);
4560         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4561     }else{
             // 16x16, 16x8 and 8x16 partitions: for each shape all reference
             // indices are read first, then all motion vector differences
4562         int list, mx, my, i;
4563          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4564         if(IS_16X16(mb_type)){
4565             for(list=0; list<h->list_count; list++){
4566                     unsigned int val;
4567                     if(IS_DIR(mb_type, 0, list)){
4568                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4569                         if(val >= h->ref_count[list]){
4570                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4571                             return -1;
4572                         }
4573                     }else
4574                         val= LIST_NOT_USED&0xFF;
4575                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4576             }
4577             for(list=0; list<h->list_count; list++){
4578                 unsigned int val;
4579                 if(IS_DIR(mb_type, 0, list)){
4580                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4581                     mx += get_se_golomb(&s->gb);
4582                     my += get_se_golomb(&s->gb);
4583                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4584
4585                     val= pack16to32(mx,my);
4586                 }else
4587                     val=0;
4588                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4589             }
4590         }
4591         else if(IS_16X8(mb_type)){
4592             for(list=0; list<h->list_count; list++){
4593                     for(i=0; i<2; i++){
4594                         unsigned int val;
4595                         if(IS_DIR(mb_type, i, list)){
4596                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4597                             if(val >= h->ref_count[list]){
4598                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4599                                 return -1;
4600                             }
4601                         }else
4602                             val= LIST_NOT_USED&0xFF;
4603                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4604                     }
4605             }
4606             for(list=0; list<h->list_count; list++){
4607                 for(i=0; i<2; i++){
4608                     unsigned int val;
4609                     if(IS_DIR(mb_type, i, list)){
4610                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4611                         mx += get_se_golomb(&s->gb);
4612                         my += get_se_golomb(&s->gb);
4613                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4614
4615                         val= pack16to32(mx,my);
4616                     }else
4617                         val=0;
4618                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4619                 }
4620             }
4621         }else{
4622             assert(IS_8X16(mb_type));
4623             for(list=0; list<h->list_count; list++){
4624                     for(i=0; i<2; i++){
4625                         unsigned int val;
4626                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4627                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4628                             if(val >= h->ref_count[list]){
4629                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4630                                 return -1;
4631                             }
4632                         }else
4633                             val= LIST_NOT_USED&0xFF;
4634                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4635                     }
4636             }
4637             for(list=0; list<h->list_count; list++){
4638                 for(i=0; i<2; i++){
4639                     unsigned int val;
4640                     if(IS_DIR(mb_type, i, list)){
4641                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4642                         mx += get_se_golomb(&s->gb);
4643                         my += get_se_golomb(&s->gb);
4644                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4645
4646                         val= pack16to32(mx,my);
4647                     }else
4648                         val=0;
4649                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4650                 }
4651             }
4652         }
4653     }
4654
4655     if(IS_INTER(mb_type))
4656         write_back_motion(h, mb_type);
4657
         // Intra16x16 took its cbp from i_mb_type_info above; every other type
         // reads a code (0..47) that is mapped through a lookup table.
4658     if(!IS_INTRA16x16(mb_type)){
4659         cbp= get_ue_golomb(&s->gb);
4660         if(cbp > 47){
4661             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4662             return -1;
4663         }
4664
4665         if(CHROMA){
4666             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4667             else                     cbp= golomb_to_inter_cbp   [cbp];
4668         }else{
4669             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4670             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4671         }
4672     }
4673     h->cbp = cbp;
4674
         // non-intra macroblocks with luma coefficients may still select the
         // 8x8 transform with a 1-bit flag
4675     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4676         if(get_bits1(&s->gb)){
4677             mb_type |= MB_TYPE_8x8DCT;
4678             h->cbp_table[mb_xy]= cbp;
4679         }
4680     }
4681     s->current_picture.mb_type[mb_xy]= mb_type;
4682
         // residual data is present if any cbp bit is set, and always for Intra16x16
4683     if(cbp || IS_INTRA16x16(mb_type)){
4684         int i8x8, i4x4, chroma_idx;
4685         int dquant;
4686         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4687         const uint8_t *scan, *scan8x8, *dc_scan;
4688
4689 //        fill_non_zero_count_cache(h);
4690
             // scan tables: field or zigzag order, with separate variants for qscale 0
4691         if(IS_INTERLACED(mb_type)){
4692             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4693             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4694             dc_scan= luma_dc_field_scan;
4695         }else{
4696             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4697             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4698             dc_scan= luma_dc_zigzag_scan;
4699         }
4700
4701         dquant= get_se_golomb(&s->gb);
4702
4703         if( dquant > 25 || dquant < -26 ){
4704             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4705             return -1;
4706         }
4707
             // apply the delta and wrap the result back into 0..51
4708         s->qscale += dquant;
4709         if(((unsigned)s->qscale) > 51){
4710             if(s->qscale<0) s->qscale+= 52;
4711             else            s->qscale-= 52;
4712         }
4713
4714         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4715         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4716         if(IS_INTRA16x16(mb_type)){
                 // luma DC block first, then (if coded) 16 AC blocks of 15 coefficients
4717             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4718                 return -1; //FIXME continue if partitioned and other return -1 too
4719             }
4720
4721             assert((cbp&15) == 0 || (cbp&15) == 15);
4722
4723             if(cbp&15){
4724                 for(i8x8=0; i8x8<4; i8x8++){
4725                     for(i4x4=0; i4x4<4; i4x4++){
4726                         const int index= i4x4 + 4*i8x8;
4727                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4728                             return -1;
4729                         }
4730                     }
4731                 }
4732             }else{
4733                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4734             }
4735         }else{
                 // one cbp bit per 8x8 luma block
4736             for(i8x8=0; i8x8<4; i8x8++){
4737                 if(cbp & (1<<i8x8)){
4738                     if(IS_8x8DCT(mb_type)){
                             // the 8x8 block is read as four interleaved groups of 16
                             // coefficients into one buffer; their counts are summed
                             // into the first cache entry afterwards
4739                         DCTELEM *buf = &h->mb[64*i8x8];
4740                         uint8_t *nnz;
4741                         for(i4x4=0; i4x4<4; i4x4++){
4742                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4743                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4744                                 return -1;
4745                         }
4746                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4747                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4748                     }else{
4749                         for(i4x4=0; i4x4<4; i4x4++){
4750                             const int index= i4x4 + 4*i8x8;
4751
4752                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4753                                 return -1;
4754                             }
4755                         }
4756                     }
4757                 }else{
4758                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4759                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4760                 }
4761             }
4762         }
4763
             // cbp bits 4-5 cover chroma: either bit set means the two chroma DC
             // blocks are coded, bit 5 additionally means the chroma AC blocks are
4764         if(cbp&0x30){
4765             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4766                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4767                     return -1;
4768                 }
4769         }
4770
4771         if(cbp&0x20){
4772             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4773                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4774                 for(i4x4=0; i4x4<4; i4x4++){
4775                     const int index= 16 + 4*chroma_idx + i4x4;
4776                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4777                         return -1;
4778                     }
4779                 }
4780             }
4781         }else{
4782             uint8_t * const nnz= &h->non_zero_count_cache[0];
4783             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4784             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4785         }
4786     }else{
             // nothing coded: clear all luma and chroma non-zero counts in the cache
4787         uint8_t * const nnz= &h->non_zero_count_cache[0];
4788         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4789         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4790         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4791     }
4792     s->current_picture.qscale_table[mb_xy]= s->qscale;
4793     write_back_non_zero_count(h);
4794
         // undo the MBAFF doubling of ref_count done before fill_caches()
4795     if(MB_MBAFF){
4796         h->ref_count[0] >>= 1;
4797         h->ref_count[1] >>= 1;
4798     }
4799
4800     return 0;
4801 }
4802
4803 static int decode_cabac_field_decoding_flag(H264Context *h) {
4804     MpegEncContext * const s = &h->s;
4805     const int mb_x = s->mb_x;
4806     const int mb_y = s->mb_y & ~1;
4807     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4808     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4809
4810     unsigned int ctx = 0;
4811
4812     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4813         ctx += 1;
4814     }
4815     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4816         ctx += 1;
4817     }
4818
4819     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4820 }
4821
4822 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4823     uint8_t *state= &h->cabac_state[ctx_base];
4824     int mb_type;
4825
4826     if(intra_slice){
4827         MpegEncContext * const s = &h->s;
4828         const int mba_xy = h->left_mb_xy[0];
4829         const int mbb_xy = h->top_mb_xy;
4830         int ctx=0;
4831         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4832             ctx++;
4833         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4834             ctx++;
4835         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4836             return 0;   /* I4x4 */
4837         state += 2;
4838     }else{
4839         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4840             return 0;   /* I4x4 */
4841     }
4842
4843     if( get_cabac_terminate( &h->cabac ) )
4844         return 25;  /* PCM */
4845
4846     mb_type = 1; /* I16x16 */
4847     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4848     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4849         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4850     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4851     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4852     return mb_type;
4853 }
4854
4855 static int decode_cabac_mb_type( H264Context *h ) {
4856     MpegEncContext * const s = &h->s;
4857
4858     if( h->slice_type_nos == FF_I_TYPE ) {
4859         return decode_cabac_intra_mb_type(h, 3, 1);
4860     } else if( h->slice_type_nos == FF_P_TYPE ) {
4861         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4862             /* P-type */
4863             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4864                 /* P_L0_D16x16, P_8x8 */
4865                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4866             } else {
4867                 /* P_L0_D8x16, P_L0_D16x8 */
4868                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4869             }
4870         } else {
4871             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4872         }
4873     } else if( h->slice_type_nos == FF_B_TYPE ) {
4874         const int mba_xy = h->left_mb_xy[0];
4875         const int mbb_xy = h->top_mb_xy;
4876         int ctx = 0;
4877         int bits;
4878
4879         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4880             ctx++;
4881         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4882             ctx++;
4883
4884         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4885             return 0; /* B_Direct_16x16 */
4886
4887         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4888             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4889         }
4890
4891         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4892         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4893         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4894         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4895         if( bits < 8 )
4896             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4897         else if( bits == 13 ) {
4898             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4899         } else if( bits == 14 )
4900             return 11; /* B_L1_L0_8x16 */
4901         else if( bits == 15 )
4902             return 22; /* B_8x8 */
4903
4904         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4905         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4906     } else {
4907         /* TODO SI/SP frames? */
4908         return -1;
4909     }
4910 }
4911
     /**
      * Decodes the skip flag of the macroblock at (mb_x, mb_y) with CABAC.
      * The context is incremented once for each of two neighbours (mba_xy and
      * mbb_xy) that belongs to the current slice and is not itself skipped;
      * B slices use the same contexts offset by 13.
      * @return the bin returned by get_cabac_noinline()
      */
4912 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4913     MpegEncContext * const s = &h->s;
4914     int mba_xy, mbb_xy;
4915     int ctx = 0;
4916
4917     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
             // start from the even row (mb_y & ~1) and pick the neighbours
             // depending on the current and neighbouring field/frame state
4918         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4919         mba_xy = mb_xy - 1;
             // odd row: move one row down when the left neighbour is in this
             // slice and its interlaced state equals MB_FIELD
4920         if( (mb_y&1)
4921             && h->slice_table[mba_xy] == h->slice_num
4922             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4923             mba_xy += s->mb_stride;
4924         if( MB_FIELD ){
4925             mbb_xy = mb_xy - s->mb_stride;
                 // even row: step one more row up when that neighbour is in this
                 // slice and interlaced
4926             if( !(mb_y&1)
4927                 && h->slice_table[mbb_xy] == h->slice_num
4928                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4929                 mbb_xy -= s->mb_stride;
4930         }else
4931             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4932     }else{
             // non-MBAFF: plain left neighbour, and the neighbour one row up
             // (stride shifted left by FIELD_PICTURE)
4933         int mb_xy = h->mb_xy;
4934         mba_xy = mb_xy - 1;
4935         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4936     }
4937
4938     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4939         ctx++;
4940     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4941         ctx++;
4942
4943     if( h->slice_type_nos == FF_B_TYPE )
4944         ctx += 13;
4945     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4946 }
4947
4948 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4949     int mode = 0;
4950
4951     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4952         return pred_mode;
4953
4954     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4955     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4956     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4957
4958     if( mode >= pred_mode )
4959         return mode + 1;
4960     else
4961         return mode;
4962 }
4963
4964 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4965     const int mba_xy = h->left_mb_xy[0];
4966     const int mbb_xy = h->top_mb_xy;
4967
4968     int ctx = 0;
4969
4970     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4971     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4972         ctx++;
4973
4974     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4975         ctx++;
4976
4977     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4978         return 0;
4979
4980     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4981         return 1;
4982     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4983         return 2;
4984     else
4985         return 3;
4986 }
4987
4988 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4989     int cbp_b, cbp_a, ctx, cbp = 0;
4990
4991     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4992     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4993
4994     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4995     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4996     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4997     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4998     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4999     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5000     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5001     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5002     return cbp;
5003 }
5004 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5005     int ctx;
5006     int cbp_a, cbp_b;
5007
5008     cbp_a = (h->left_cbp>>4)&0x03;
5009     cbp_b = (h-> top_cbp>>4)&0x03;
5010
5011     ctx = 0;
5012     if( cbp_a > 0 ) ctx++;
5013     if( cbp_b > 0 ) ctx += 2;
5014     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5015         return 0;
5016
5017     ctx = 4;
5018     if( cbp_a == 2 ) ctx++;
5019     if( cbp_b == 2 ) ctx += 2;
5020     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5021 }
5022 static int decode_cabac_mb_dqp( H264Context *h) {
5023     int   ctx = 0;
5024     int   val = 0;
5025
5026     if( h->last_qscale_diff != 0 )
5027         ctx++;
5028
5029     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5030         if( ctx < 2 )
5031             ctx = 2;
5032         else
5033             ctx = 3;
5034         val++;
5035         if(val > 102) //prevent infinite loop
5036             return INT_MIN;
5037     }
5038
5039     if( val&0x01 )
5040         return (val + 1)/2;
5041     else
5042         return -(val + 1)/2;
5043 }
5044 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5045     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5046         return 0;   /* 8x8 */
5047     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5048         return 1;   /* 8x4 */
5049     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5050         return 2;   /* 4x8 */
5051     return 3;       /* 4x4 */
5052 }
5053 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5054     int type;
5055     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5056         return 0;   /* B_Direct_8x8 */
5057     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5058         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5059     type = 3;
5060     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5061         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5062             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5063         type += 4;
5064     }
5065     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5066     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5067     return type;
5068 }
5069
5070 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5071     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5072 }
5073
5074 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5075     int refa = h->ref_cache[list][scan8[n] - 1];
5076     int refb = h->ref_cache[list][scan8[n] - 8];
5077     int ref  = 0;
5078     int ctx  = 0;
5079
5080     if( h->slice_type_nos == FF_B_TYPE) {
5081         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5082             ctx++;
5083         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5084             ctx += 2;
5085     } else {
5086         if( refa > 0 )
5087             ctx++;
5088         if( refb > 0 )
5089             ctx += 2;
5090     }
5091
5092     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5093         ref++;
5094         if( ctx < 4 )
5095             ctx = 4;
5096         else
5097             ctx = 5;
5098         if(ref >= 32 /*h->ref_list[list]*/){
5099             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5100             return 0; //FIXME we should return -1 and check the return everywhere
5101         }
5102     }
5103     return ref;
5104 }
5105
5106 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5107     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5108                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5109     int ctxbase = (l == 0) ? 40 : 47;
5110     int ctx, mvd;
5111
5112     if( amvd < 3 )
5113         ctx = 0;
5114     else if( amvd > 32 )
5115         ctx = 2;
5116     else
5117         ctx = 1;
5118
5119     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5120         return 0;
5121
5122     mvd= 1;
5123     ctx= 3;
5124     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5125         mvd++;
5126         if( ctx < 6 )
5127             ctx++;
5128     }
5129
5130     if( mvd >= 9 ) {
5131         int k = 3;
5132         while( get_cabac_bypass( &h->cabac ) ) {
5133             mvd += 1 << k;
5134             k++;
5135             if(k>24){
5136                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5137                 return INT_MIN;
5138             }
5139         }
5140         while( k-- ) {
5141             if( get_cabac_bypass( &h->cabac ) )
5142                 mvd += 1 << k;
5143         }
5144     }
5145     return get_cabac_bypass_sign( &h->cabac, -mvd );
5146 }
5147
5148 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5149     int nza, nzb;
5150     int ctx = 0;
5151
5152     if( is_dc ) {
5153         if( cat == 0 ) {
5154             nza = h->left_cbp&0x100;
5155             nzb = h-> top_cbp&0x100;
5156         } else {
5157             nza = (h->left_cbp>>(6+idx))&0x01;
5158             nzb = (h-> top_cbp>>(6+idx))&0x01;
5159         }
5160     } else {
5161         if( cat == 4 ) {
5162             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5163             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5164         } else {
5165             assert(cat == 1 || cat == 2);
5166             nza = h->non_zero_count_cache[scan8[idx] - 1];
5167             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5168         }
5169     }
5170
5171     if( nza > 0 )
5172         ctx++;
5173
5174     if( nzb > 0 )
5175         ctx += 2;
5176
5177     return ctx + 4 * cat;
5178 }
5179
5180 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5181     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5182     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5183     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5184     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5185 };
5186
5187 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5188     static const int significant_coeff_flag_offset[2][6] = {
5189       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5190       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5191     };
5192     static const int last_coeff_flag_offset[2][6] = {
5193       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5194       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5195     };
5196     static const int coeff_abs_level_m1_offset[6] = {
5197         227+0, 227+10, 227+20, 227+30, 227+39, 426
5198     };
5199     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5200       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5201         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5202         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5203        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5204       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5205         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5206         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5207         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5208     };
5209     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5210      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5211      * map node ctx => cabac ctx for level=1 */
5212     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5213     /* map node ctx => cabac ctx for level>1 */
5214     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5215     static const uint8_t coeff_abs_level_transition[2][8] = {
5216     /* update node ctx after decoding a level=1 */
5217         { 1, 2, 3, 3, 4, 5, 6, 7 },
5218     /* update node ctx after decoding a level>1 */
5219         { 4, 4, 4, 4, 5, 6, 7, 7 }
5220     };
5221
5222     int index[64];
5223
5224     int av_unused last;
5225     int coeff_count = 0;
5226     int node_ctx = 0;
5227
5228     uint8_t *significant_coeff_ctx_base;
5229     uint8_t *last_coeff_ctx_base;
5230     uint8_t *abs_level_m1_ctx_base;
5231
5232 #ifndef ARCH_X86
5233 #define CABAC_ON_STACK
5234 #endif
5235 #ifdef CABAC_ON_STACK
5236 #define CC &cc
5237     CABACContext cc;
5238     cc.range     = h->cabac.range;
5239     cc.low       = h->cabac.low;
5240     cc.bytestream= h->cabac.bytestream;
5241 #else
5242 #define CC &h->cabac
5243 #endif
5244
5245
5246     /* cat: 0-> DC 16x16  n = 0
5247      *      1-> AC 16x16  n = luma4x4idx
5248      *      2-> Luma4x4   n = luma4x4idx
5249      *      3-> DC Chroma n = iCbCr
5250      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5251      *      5-> Luma8x8   n = 4 * luma8x8idx
5252      */
5253
5254     /* read coded block flag */
5255     if( is_dc || cat != 5 ) {
5256         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5257             if( !is_dc ) {
5258                 if( cat == 4 )
5259                     h->non_zero_count_cache[scan8[16+n]] = 0;
5260                 else
5261                     h->non_zero_count_cache[scan8[n]] = 0;
5262             }
5263
5264 #ifdef CABAC_ON_STACK
5265             h->cabac.range     = cc.range     ;
5266             h->cabac.low       = cc.low       ;
5267             h->cabac.bytestream= cc.bytestream;
5268 #endif
5269             return;
5270         }
5271     }
5272
5273     significant_coeff_ctx_base = h->cabac_state
5274         + significant_coeff_flag_offset[MB_FIELD][cat];
5275     last_coeff_ctx_base = h->cabac_state
5276         + last_coeff_flag_offset[MB_FIELD][cat];
5277     abs_level_m1_ctx_base = h->cabac_state
5278         + coeff_abs_level_m1_offset[cat];
5279
5280     if( !is_dc && cat == 5 ) {
5281 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5282         for(last= 0; last < coefs; last++) { \
5283             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5284             if( get_cabac( CC, sig_ctx )) { \
5285                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5286                 index[coeff_count++] = last; \
5287                 if( get_cabac( CC, last_ctx ) ) { \
5288                     last= max_coeff; \
5289                     break; \
5290                 } \
5291             } \
5292         }\
5293         if( last == max_coeff -1 ) {\
5294             index[coeff_count++] = last;\
5295         }
5296         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5297 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5298         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5299     } else {
5300         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5301 #else
5302         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5303     } else {
5304         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5305 #endif
5306     }
5307     assert(coeff_count > 0);
5308
5309     if( is_dc ) {
5310         if( cat == 0 )
5311             h->cbp_table[h->mb_xy] |= 0x100;
5312         else
5313             h->cbp_table[h->mb_xy] |= 0x40 << n;
5314     } else {
5315         if( cat == 5 )
5316             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5317         else if( cat == 4 )
5318             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5319         else {
5320             assert( cat == 1 || cat == 2 );
5321             h->non_zero_count_cache[scan8[n]] = coeff_count;
5322         }
5323     }
5324
5325     do {
5326         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5327
5328         int j= scantable[index[--coeff_count]];
5329
5330         if( get_cabac( CC, ctx ) == 0 ) {
5331             node_ctx = coeff_abs_level_transition[0][node_ctx];
5332             if( is_dc ) {
5333                 block[j] = get_cabac_bypass_sign( CC, -1);
5334             }else{
5335                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5336             }
5337         } else {
5338             int coeff_abs = 2;
5339             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5340             node_ctx = coeff_abs_level_transition[1][node_ctx];
5341
5342             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5343                 coeff_abs++;
5344             }
5345
5346             if( coeff_abs >= 15 ) {
5347                 int j = 0;
5348                 while( get_cabac_bypass( CC ) ) {
5349                     j++;
5350                 }
5351
5352                 coeff_abs=1;
5353                 while( j-- ) {
5354                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5355                 }
5356                 coeff_abs+= 14;
5357             }
5358
5359             if( is_dc ) {
5360                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5361             }else{
5362                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5363             }
5364         }
5365     } while( coeff_count );
5366 #ifdef CABAC_ON_STACK
5367             h->cabac.range     = cc.range     ;
5368             h->cabac.low       = cc.low       ;
5369             h->cabac.bytestream= cc.bytestream;
5370 #endif
5371
5372 }
5373
5374 #ifndef CONFIG_SMALL
5375 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5376     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5377 }
5378
5379 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5380     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5381 }
5382 #endif
5383
5384 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5385 #ifdef CONFIG_SMALL
5386     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5387 #else
5388     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5389     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5390 #endif
5391 }
5392
5393 static inline void compute_mb_neighbors(H264Context *h)
5394 {
5395     MpegEncContext * const s = &h->s;
5396     const int mb_xy  = h->mb_xy;
5397     h->top_mb_xy     = mb_xy - s->mb_stride;
5398     h->left_mb_xy[0] = mb_xy - 1;
5399     if(FRAME_MBAFF){
5400         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5401         const int top_pair_xy      = pair_xy     - s->mb_stride;
5402         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5403         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5404         const int curr_mb_frame_flag = !MB_FIELD;
5405         const int bottom = (s->mb_y & 1);
5406         if (bottom
5407                 ? !curr_mb_frame_flag // bottom macroblock
5408                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5409                 ) {
5410             h->top_mb_xy -= s->mb_stride;
5411         }
5412         if (left_mb_frame_flag != curr_mb_frame_flag) {
5413             h->left_mb_xy[0] = pair_xy - 1;
5414         }
5415     } else if (FIELD_PICTURE) {
5416         h->top_mb_xy -= s->mb_stride;
5417     }
5418     return;
5419 }
5420
5421 /**
5422  * decodes a macroblock
5423  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5424  */
5425 static int decode_mb_cabac(H264Context *h) {
5426     MpegEncContext * const s = &h->s;
5427     int mb_xy;
5428     int mb_type, partition_count, cbp = 0;
5429     int dct8x8_allowed= h->pps.transform_8x8_mode;
5430
5431     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5432
5433     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5434
5435     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5436     if( h->slice_type_nos != FF_I_TYPE ) {
5437         int skip;
5438         /* a skipped mb needs the aff flag from the following mb */
5439         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5440             predict_field_decoding_flag(h);
5441         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5442             skip = h->next_mb_skipped;
5443         else
5444             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5445         /* read skip flags */
5446         if( skip ) {
5447             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5448                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5449                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5450                 if(h->next_mb_skipped)
5451                     predict_field_decoding_flag(h);
5452                 else
5453                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5454             }
5455
5456             decode_mb_skip(h);
5457
5458             h->cbp_table[mb_xy] = 0;
5459             h->chroma_pred_mode_table[mb_xy] = 0;
5460             h->last_qscale_diff = 0;
5461
5462             return 0;
5463
5464         }
5465     }
5466     if(FRAME_MBAFF){
5467         if( (s->mb_y&1) == 0 )
5468             h->mb_mbaff =
5469             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5470     }
5471
5472     h->prev_mb_skipped = 0;
5473
5474     compute_mb_neighbors(h);
5475     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5476         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5477         return -1;
5478     }
5479
5480     if( h->slice_type_nos == FF_B_TYPE ) {
5481         if( mb_type < 23 ){
5482             partition_count= b_mb_type_info[mb_type].partition_count;
5483             mb_type=         b_mb_type_info[mb_type].type;
5484         }else{
5485             mb_type -= 23;
5486             goto decode_intra_mb;
5487         }
5488     } else if( h->slice_type_nos == FF_P_TYPE ) {
5489         if( mb_type < 5) {
5490             partition_count= p_mb_type_info[mb_type].partition_count;
5491             mb_type=         p_mb_type_info[mb_type].type;
5492         } else {
5493             mb_type -= 5;
5494             goto decode_intra_mb;
5495         }
5496     } else {
5497         if(h->slice_type == FF_SI_TYPE && mb_type)
5498             mb_type--;
5499         assert(h->slice_type_nos == FF_I_TYPE);
5500 decode_intra_mb:
5501         partition_count = 0;
5502         cbp= i_mb_type_info[mb_type].cbp;
5503         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5504         mb_type= i_mb_type_info[mb_type].type;
5505     }
5506     if(MB_FIELD)
5507         mb_type |= MB_TYPE_INTERLACED;
5508
5509     h->slice_table[ mb_xy ]= h->slice_num;
5510
5511     if(IS_INTRA_PCM(mb_type)) {
5512         const uint8_t *ptr;
5513
5514         // We assume these blocks are very rare so we do not optimize it.
5515         // FIXME The two following lines get the bitstream position in the cabac
5516         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5517         ptr= h->cabac.bytestream;
5518         if(h->cabac.low&0x1) ptr--;
5519         if(CABAC_BITS==16){
5520             if(h->cabac.low&0x1FF) ptr--;
5521         }
5522
5523         // The pixels are stored in the same order as levels in h->mb array.
5524         memcpy(h->mb, ptr, 256); ptr+=256;
5525         if(CHROMA){
5526             memcpy(h->mb+128, ptr, 128); ptr+=128;
5527         }
5528
5529         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5530
5531         // All blocks are present
5532         h->cbp_table[mb_xy] = 0x1ef;
5533         h->chroma_pred_mode_table[mb_xy] = 0;
5534         // In deblocking, the quantizer is 0
5535         s->current_picture.qscale_table[mb_xy]= 0;
5536         // All coeffs are present
5537         memset(h->non_zero_count[mb_xy], 16, 16);
5538         s->current_picture.mb_type[mb_xy]= mb_type;
5539         h->last_qscale_diff = 0;
5540         return 0;
5541     }
5542
5543     if(MB_MBAFF){
5544         h->ref_count[0] <<= 1;
5545         h->ref_count[1] <<= 1;
5546     }
5547
5548     fill_caches(h, mb_type, 0);
5549
5550     if( IS_INTRA( mb_type ) ) {
5551         int i, pred_mode;
5552         if( IS_INTRA4x4( mb_type ) ) {
5553             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5554                 mb_type |= MB_TYPE_8x8DCT;
5555                 for( i = 0; i < 16; i+=4 ) {
5556                     int pred = pred_intra_mode( h, i );
5557                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5558                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5559                 }
5560             } else {
5561                 for( i = 0; i < 16; i++ ) {
5562                     int pred = pred_intra_mode( h, i );
5563                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5564
5565                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5566                 }
5567             }
5568             write_back_intra_pred_mode(h);
5569             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5570         } else {
5571             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5572             if( h->intra16x16_pred_mode < 0 ) return -1;
5573         }
5574         if(CHROMA){
5575             h->chroma_pred_mode_table[mb_xy] =
5576             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5577
5578             pred_mode= check_intra_pred_mode( h, pred_mode );
5579             if( pred_mode < 0 ) return -1;
5580             h->chroma_pred_mode= pred_mode;
5581         }
5582     } else if( partition_count == 4 ) {
5583         int i, j, sub_partition_count[4], list, ref[2][4];
5584
5585         if( h->slice_type_nos == FF_B_TYPE ) {
5586             for( i = 0; i < 4; i++ ) {
5587                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5588                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5589                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5590             }
5591             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5592                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5593                 pred_direct_motion(h, &mb_type);
5594                 h->ref_cache[0][scan8[4]] =
5595                 h->ref_cache[1][scan8[4]] =
5596                 h->ref_cache[0][scan8[12]] =
5597                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5598                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5599                     for( i = 0; i < 4; i++ )
5600                         if( IS_DIRECT(h->sub_mb_type[i]) )
5601                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5602                 }
5603             }
5604         } else {
5605             for( i = 0; i < 4; i++ ) {
5606                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5607                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5608                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5609             }
5610         }
5611
5612         for( list = 0; list < h->list_count; list++ ) {
5613                 for( i = 0; i < 4; i++ ) {
5614                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5615                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5616                         if( h->ref_count[list] > 1 )
5617                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5618                         else
5619                             ref[list][i] = 0;
5620                     } else {
5621                         ref[list][i] = -1;
5622                     }
5623                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5624                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5625                 }
5626         }
5627
5628         if(dct8x8_allowed)
5629             dct8x8_allowed = get_dct8x8_allowed(h);
5630
5631         for(list=0; list<h->list_count; list++){
5632             for(i=0; i<4; i++){
5633                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5634                 if(IS_DIRECT(h->sub_mb_type[i])){
5635                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5636                     continue;
5637                 }
5638
5639                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5640                     const int sub_mb_type= h->sub_mb_type[i];
5641                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5642                     for(j=0; j<sub_partition_count[i]; j++){
5643                         int mpx, mpy;
5644                         int mx, my;
5645                         const int index= 4*i + block_width*j;
5646                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5647                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5648                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5649
5650                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5651                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5652                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5653
5654                         if(IS_SUB_8X8(sub_mb_type)){
5655                             mv_cache[ 1 ][0]=
5656                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5657                             mv_cache[ 1 ][1]=
5658                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5659
5660                             mvd_cache[ 1 ][0]=
5661                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5662                             mvd_cache[ 1 ][1]=
5663                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5664                         }else if(IS_SUB_8X4(sub_mb_type)){
5665                             mv_cache[ 1 ][0]= mx;
5666                             mv_cache[ 1 ][1]= my;
5667
5668                             mvd_cache[ 1 ][0]= mx - mpx;
5669                             mvd_cache[ 1 ][1]= my - mpy;
5670                         }else if(IS_SUB_4X8(sub_mb_type)){
5671                             mv_cache[ 8 ][0]= mx;
5672                             mv_cache[ 8 ][1]= my;
5673
5674                             mvd_cache[ 8 ][0]= mx - mpx;
5675                             mvd_cache[ 8 ][1]= my - mpy;
5676                         }
5677                         mv_cache[ 0 ][0]= mx;
5678                         mv_cache[ 0 ][1]= my;
5679
5680                         mvd_cache[ 0 ][0]= mx - mpx;
5681                         mvd_cache[ 0 ][1]= my - mpy;
5682                     }
5683                 }else{
5684                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5685                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5686                     p[0] = p[1] = p[8] = p[9] = 0;
5687                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5688                 }
5689             }
5690         }
5691     } else if( IS_DIRECT(mb_type) ) {
5692         pred_direct_motion(h, &mb_type);
5693         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5694         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5695         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5696     } else {
5697         int list, mx, my, i, mpx, mpy;
5698         if(IS_16X16(mb_type)){
5699             for(list=0; list<h->list_count; list++){
5700                 if(IS_DIR(mb_type, 0, list)){
5701                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5702                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5703                 }else
5704                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5705             }
5706             for(list=0; list<h->list_count; list++){
5707                 if(IS_DIR(mb_type, 0, list)){
5708                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5709
5710                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5711                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5712                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5713
5714                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5715                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5716                 }else
5717                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5718             }
5719         }
5720         else if(IS_16X8(mb_type)){
5721             for(list=0; list<h->list_count; list++){
5722                     for(i=0; i<2; i++){
5723                         if(IS_DIR(mb_type, i, list)){
5724                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5725                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5726                         }else
5727                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5728                     }
5729             }
5730             for(list=0; list<h->list_count; list++){
5731                 for(i=0; i<2; i++){
5732                     if(IS_DIR(mb_type, i, list)){
5733                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5734                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5735                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5736                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5737
5738                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5739                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5740                     }else{
5741                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5742                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5743                     }
5744                 }
5745             }
5746         }else{
5747             assert(IS_8X16(mb_type));
5748             for(list=0; list<h->list_count; list++){
5749                     for(i=0; i<2; i++){
5750                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5751                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5752                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5753                         }else
5754                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5755                     }
5756             }
5757             for(list=0; list<h->list_count; list++){
5758                 for(i=0; i<2; i++){
5759                     if(IS_DIR(mb_type, i, list)){
5760                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5761                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5762                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5763
5764                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5765                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5766                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5767                     }else{
5768                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5769                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5770                     }
5771                 }
5772             }
5773         }
5774     }
5775
5776    if( IS_INTER( mb_type ) ) {
5777         h->chroma_pred_mode_table[mb_xy] = 0;
5778         write_back_motion( h, mb_type );
5779    }
5780
5781     if( !IS_INTRA16x16( mb_type ) ) {
5782         cbp  = decode_cabac_mb_cbp_luma( h );
5783         if(CHROMA)
5784             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5785     }
5786
5787     h->cbp_table[mb_xy] = h->cbp = cbp;
5788
5789     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5790         if( decode_cabac_mb_transform_size( h ) )
5791             mb_type |= MB_TYPE_8x8DCT;
5792     }
5793     s->current_picture.mb_type[mb_xy]= mb_type;
5794
5795     if( cbp || IS_INTRA16x16( mb_type ) ) {
5796         const uint8_t *scan, *scan8x8, *dc_scan;
5797         const uint32_t *qmul;
5798         int dqp;
5799
5800         if(IS_INTERLACED(mb_type)){
5801             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5802             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5803             dc_scan= luma_dc_field_scan;
5804         }else{
5805             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5806             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5807             dc_scan= luma_dc_zigzag_scan;
5808         }
5809
5810         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5811         if( dqp == INT_MIN ){
5812             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5813             return -1;
5814         }
5815         s->qscale += dqp;
5816         if(((unsigned)s->qscale) > 51){
5817             if(s->qscale<0) s->qscale+= 52;
5818             else            s->qscale-= 52;
5819         }
5820         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5821         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5822
5823         if( IS_INTRA16x16( mb_type ) ) {
5824             int i;
5825             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5826             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5827
5828             if( cbp&15 ) {
5829                 qmul = h->dequant4_coeff[0][s->qscale];
5830                 for( i = 0; i < 16; i++ ) {
5831                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5832                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5833                 }
5834             } else {
5835                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5836             }
5837         } else {
5838             int i8x8, i4x4;
5839             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5840                 if( cbp & (1<<i8x8) ) {
5841                     if( IS_8x8DCT(mb_type) ) {
5842                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5843                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5844                     } else {
5845                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5846                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5847                             const int index = 4*i8x8 + i4x4;
5848                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5849 //START_TIMER
5850                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5851 //STOP_TIMER("decode_residual")
5852                         }
5853                     }
5854                 } else {
5855                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5856                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5857                 }
5858             }
5859         }
5860
5861         if( cbp&0x30 ){
5862             int c;
5863             for( c = 0; c < 2; c++ ) {
5864                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5865                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5866             }
5867         }
5868
5869         if( cbp&0x20 ) {
5870             int c, i;
5871             for( c = 0; c < 2; c++ ) {
5872                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5873                 for( i = 0; i < 4; i++ ) {
5874                     const int index = 16 + 4 * c + i;
5875                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5876                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5877                 }
5878             }
5879         } else {
5880             uint8_t * const nnz= &h->non_zero_count_cache[0];
5881             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5882             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5883         }
5884     } else {
5885         uint8_t * const nnz= &h->non_zero_count_cache[0];
5886         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5887         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5888         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5889         h->last_qscale_diff = 0;
5890     }
5891
5892     s->current_picture.qscale_table[mb_xy]= s->qscale;
5893     write_back_non_zero_count(h);
5894
5895     if(MB_MBAFF){
5896         h->ref_count[0] >>= 1;
5897         h->ref_count[1] >>= 1;
5898     }
5899
5900     return 0;
5901 }
5902
5903
5904 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5905     int i, d;
5906     const int index_a = qp + h->slice_alpha_c0_offset;
5907     const int alpha = (alpha_table+52)[index_a];
5908     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5909
5910     if( bS[0] < 4 ) {
5911         int8_t tc[4];
5912         for(i=0; i<4; i++)
5913             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5914         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5915     } else {
5916         /* 16px edge length, because bS=4 is triggered by being at
5917          * the edge of an intra MB, so all 4 bS are the same */
5918             for( d = 0; d < 16; d++ ) {
5919                 const int p0 = pix[-1];
5920                 const int p1 = pix[-2];
5921                 const int p2 = pix[-3];
5922
5923                 const int q0 = pix[0];
5924                 const int q1 = pix[1];
5925                 const int q2 = pix[2];
5926
5927                 if( FFABS( p0 - q0 ) < alpha &&
5928                     FFABS( p1 - p0 ) < beta &&
5929                     FFABS( q1 - q0 ) < beta ) {
5930
5931                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5932                         if( FFABS( p2 - p0 ) < beta)
5933                         {
5934                             const int p3 = pix[-4];
5935                             /* p0', p1', p2' */
5936                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5937                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5938                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5939                         } else {
5940                             /* p0' */
5941                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5942                         }
5943                         if( FFABS( q2 - q0 ) < beta)
5944                         {
5945                             const int q3 = pix[3];
5946                             /* q0', q1', q2' */
5947                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5948                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5949                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5950                         } else {
5951                             /* q0' */
5952                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5953                         }
5954                     }else{
5955                         /* p0', q0' */
5956                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5957                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5958                     }
5959                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5960                 }
5961                 pix += stride;
5962             }
5963     }
5964 }
5965 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5966     int i;
5967     const int index_a = qp + h->slice_alpha_c0_offset;
5968     const int alpha = (alpha_table+52)[index_a];
5969     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5970
5971     if( bS[0] < 4 ) {
5972         int8_t tc[4];
5973         for(i=0; i<4; i++)
5974             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5975         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5976     } else {
5977         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5978     }
5979 }
5980
5981 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5982     int i;
5983     for( i = 0; i < 16; i++, pix += stride) {
5984         int index_a;
5985         int alpha;
5986         int beta;
5987
5988         int qp_index;
5989         int bS_index = (i >> 1);
5990         if (!MB_FIELD) {
5991             bS_index &= ~1;
5992             bS_index |= (i & 1);
5993         }
5994
5995         if( bS[bS_index] == 0 ) {
5996             continue;
5997         }
5998
5999         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6000         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6001         alpha = (alpha_table+52)[index_a];
6002         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6003
6004         if( bS[bS_index] < 4 ) {
6005             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6006             const int p0 = pix[-1];
6007             const int p1 = pix[-2];
6008             const int p2 = pix[-3];
6009             const int q0 = pix[0];
6010             const int q1 = pix[1];
6011             const int q2 = pix[2];
6012
6013             if( FFABS( p0 - q0 ) < alpha &&
6014                 FFABS( p1 - p0 ) < beta &&
6015                 FFABS( q1 - q0 ) < beta ) {
6016                 int tc = tc0;
6017                 int i_delta;
6018
6019                 if( FFABS( p2 - p0 ) < beta ) {
6020                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6021                     tc++;
6022                 }
6023                 if( FFABS( q2 - q0 ) < beta ) {
6024                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6025                     tc++;
6026                 }
6027
6028                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6029                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6030                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6031                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6032             }
6033         }else{
6034             const int p0 = pix[-1];
6035             const int p1 = pix[-2];
6036             const int p2 = pix[-3];
6037
6038             const int q0 = pix[0];
6039             const int q1 = pix[1];
6040             const int q2 = pix[2];
6041
6042             if( FFABS( p0 - q0 ) < alpha &&
6043                 FFABS( p1 - p0 ) < beta &&
6044                 FFABS( q1 - q0 ) < beta ) {
6045
6046                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6047                     if( FFABS( p2 - p0 ) < beta)
6048                     {
6049                         const int p3 = pix[-4];
6050                         /* p0', p1', p2' */
6051                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6052                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6053                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6054                     } else {
6055                         /* p0' */
6056                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6057                     }
6058                     if( FFABS( q2 - q0 ) < beta)
6059                     {
6060                         const int q3 = pix[3];
6061                         /* q0', q1', q2' */
6062                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6063                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6064                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6065                     } else {
6066                         /* q0' */
6067                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6068                     }
6069                 }else{
6070                     /* p0', q0' */
6071                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6072                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6073                 }
6074                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6075             }
6076         }
6077     }
6078 }
6079 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6080     int i;
6081     for( i = 0; i < 8; i++, pix += stride) {
6082         int index_a;
6083         int alpha;
6084         int beta;
6085
6086         int qp_index;
6087         int bS_index = i;
6088
6089         if( bS[bS_index] == 0 ) {
6090             continue;
6091         }
6092
6093         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6094         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6095         alpha = (alpha_table+52)[index_a];
6096         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6097
6098         if( bS[bS_index] < 4 ) {
6099             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6100             const int p0 = pix[-1];
6101             const int p1 = pix[-2];
6102             const int q0 = pix[0];
6103             const int q1 = pix[1];
6104
6105             if( FFABS( p0 - q0 ) < alpha &&
6106                 FFABS( p1 - p0 ) < beta &&
6107                 FFABS( q1 - q0 ) < beta ) {
6108                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6109
6110                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6111                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6112                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6113             }
6114         }else{
6115             const int p0 = pix[-1];
6116             const int p1 = pix[-2];
6117             const int q0 = pix[0];
6118             const int q1 = pix[1];
6119
6120             if( FFABS( p0 - q0 ) < alpha &&
6121                 FFABS( p1 - p0 ) < beta &&
6122                 FFABS( q1 - q0 ) < beta ) {
6123
6124                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6125                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6126                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6127             }
6128         }
6129     }
6130 }
6131
6132 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6133     int i, d;
6134     const int index_a = qp + h->slice_alpha_c0_offset;
6135     const int alpha = (alpha_table+52)[index_a];
6136     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6137     const int pix_next  = stride;
6138
6139     if( bS[0] < 4 ) {
6140         int8_t tc[4];
6141         for(i=0; i<4; i++)
6142             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6143         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6144     } else {
6145         /* 16px edge length, see filter_mb_edgev */
6146             for( d = 0; d < 16; d++ ) {
6147                 const int p0 = pix[-1*pix_next];
6148                 const int p1 = pix[-2*pix_next];
6149                 const int p2 = pix[-3*pix_next];
6150                 const int q0 = pix[0];
6151                 const int q1 = pix[1*pix_next];
6152                 const int q2 = pix[2*pix_next];
6153
6154                 if( FFABS( p0 - q0 ) < alpha &&
6155                     FFABS( p1 - p0 ) < beta &&
6156                     FFABS( q1 - q0 ) < beta ) {
6157
6158                     const int p3 = pix[-4*pix_next];
6159                     const int q3 = pix[ 3*pix_next];
6160
6161                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6162                         if( FFABS( p2 - p0 ) < beta) {
6163                             /* p0', p1', p2' */
6164                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6165                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6166                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6167                         } else {
6168                             /* p0' */
6169                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6170                         }
6171                         if( FFABS( q2 - q0 ) < beta) {
6172                             /* q0', q1', q2' */
6173                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6174                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6175                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6176                         } else {
6177                             /* q0' */
6178                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6179                         }
6180                     }else{
6181                         /* p0', q0' */
6182                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6183                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6184                     }
6185                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6186                 }
6187                 pix++;
6188             }
6189     }
6190 }
6191
6192 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6193     int i;
6194     const int index_a = qp + h->slice_alpha_c0_offset;
6195     const int alpha = (alpha_table+52)[index_a];
6196     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6197
6198     if( bS[0] < 4 ) {
6199         int8_t tc[4];
6200         for(i=0; i<4; i++)
6201             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6202         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6203     } else {
6204         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6205     }
6206 }
6207
6208 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6209     MpegEncContext * const s = &h->s;
6210     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6211     int mb_xy, mb_type;
6212     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6213
6214     mb_xy = h->mb_xy;
6215
6216     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6217 1 ||
6218        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6219                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6220         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6221         return;
6222     }
6223     assert(!FRAME_MBAFF);
6224
6225     mb_type = s->current_picture.mb_type[mb_xy];
6226     qp = s->current_picture.qscale_table[mb_xy];
6227     qp0 = s->current_picture.qscale_table[mb_xy-1];
6228     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6229     qpc = get_chroma_qp( h, 0, qp );
6230     qpc0 = get_chroma_qp( h, 0, qp0 );
6231     qpc1 = get_chroma_qp( h, 0, qp1 );
6232     qp0 = (qp + qp0 + 1) >> 1;
6233     qp1 = (qp + qp1 + 1) >> 1;
6234     qpc0 = (qpc + qpc0 + 1) >> 1;
6235     qpc1 = (qpc + qpc1 + 1) >> 1;
6236     qp_thresh = 15 - h->slice_alpha_c0_offset;
6237     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6238        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6239         return;
6240
6241     if( IS_INTRA(mb_type) ) {
6242         int16_t bS4[4] = {4,4,4,4};
6243         int16_t bS3[4] = {3,3,3,3};
6244         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6245         if( IS_8x8DCT(mb_type) ) {
6246             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6247             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6248             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6249             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6250         } else {
6251             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6252             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6253             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6254             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6255             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6256             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6257             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6258             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6259         }
6260         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6261         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6262         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6263         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6264         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6265         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6266         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6267         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6268         return;
6269     } else {
6270         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6271         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6272         int edges;
6273         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6274             edges = 4;
6275             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6276         } else {
6277             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6278                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6279             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6280                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6281                              ? 3 : 0;
6282             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6283             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6284             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6285                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6286         }
6287         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6288             bSv[0][0] = 0x0004000400040004ULL;
6289         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6290             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6291
6292 #define FILTER(hv,dir,edge)\
6293         if(bSv[dir][edge]) {\
6294             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6295             if(!(edge&1)) {\
6296                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6297                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6298             }\
6299         }
6300         if( edges == 1 ) {
6301             FILTER(v,0,0);
6302             FILTER(h,1,0);
6303         } else if( IS_8x8DCT(mb_type) ) {
6304             FILTER(v,0,0);
6305             FILTER(v,0,2);
6306             FILTER(h,1,0);
6307             FILTER(h,1,2);
6308         } else {
6309             FILTER(v,0,0);
6310             FILTER(v,0,1);
6311             FILTER(v,0,2);
6312             FILTER(v,0,3);
6313             FILTER(h,1,0);
6314             FILTER(h,1,1);
6315             FILTER(h,1,2);
6316             FILTER(h,1,3);
6317         }
6318 #undef FILTER
6319     }
6320 }
6321
6322 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6323     MpegEncContext * const s = &h->s;
6324     const int mb_xy= mb_x + mb_y*s->mb_stride;
6325     const int mb_type = s->current_picture.mb_type[mb_xy];
6326     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6327     int first_vertical_edge_done = 0;
6328     int dir;
6329
6330     //for sufficiently low qp, filtering wouldn't do anything
6331     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6332     if(!FRAME_MBAFF){
6333         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6334         int qp = s->current_picture.qscale_table[mb_xy];
6335         if(qp <= qp_thresh
6336            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6337            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6338             return;
6339         }
6340     }
6341
6342     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6343     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6344         int top_type, left_type[2];
6345         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6346         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6347         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6348
6349         if(IS_8x8DCT(top_type)){
6350             h->non_zero_count_cache[4+8*0]=
6351             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6352             h->non_zero_count_cache[6+8*0]=
6353             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6354         }
6355         if(IS_8x8DCT(left_type[0])){
6356             h->non_zero_count_cache[3+8*1]=
6357             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6358         }
6359         if(IS_8x8DCT(left_type[1])){
6360             h->non_zero_count_cache[3+8*3]=
6361             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6362         }
6363
6364         if(IS_8x8DCT(mb_type)){
6365             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6366             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6367
6368             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6369             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6370
6371             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6372             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6373
6374             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6375             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6376         }
6377     }
6378
6379     if (FRAME_MBAFF
6380             // left mb is in picture
6381             && h->slice_table[mb_xy-1] != 255
6382             // and current and left pair do not have the same interlaced type
6383             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6384             // and left mb is in the same slice if deblocking_filter == 2
6385             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6386         /* First vertical edge is different in MBAFF frames
6387          * There are 8 different bS to compute and 2 different Qp
6388          */
6389         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6390         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6391         int16_t bS[8];
6392         int qp[2];
6393         int bqp[2];
6394         int rqp[2];
6395         int mb_qp, mbn0_qp, mbn1_qp;
6396         int i;
6397         first_vertical_edge_done = 1;
6398
6399         if( IS_INTRA(mb_type) )
6400             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6401         else {
6402             for( i = 0; i < 8; i++ ) {
6403                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6404
6405                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6406                     bS[i] = 4;
6407                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6408                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6409                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6410                     bS[i] = 2;
6411                 else
6412                     bS[i] = 1;
6413             }
6414         }
6415
6416         mb_qp = s->current_picture.qscale_table[mb_xy];
6417         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6418         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6419         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6420         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6421                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6422         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6423                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6424         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6425         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6426                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6427         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6428                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6429
6430         /* Filter edge */
6431         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6432         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6433         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6434         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6435         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6436     }
6437     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6438     for( dir = 0; dir < 2; dir++ )
6439     {
6440         int edge;
6441         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6442         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6443         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6444         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6445         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6446
6447         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6448                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6449         // how often to recheck mv-based bS when iterating between edges
6450         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6451                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6452         // how often to recheck mv-based bS when iterating along each edge
6453         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6454
6455         if (first_vertical_edge_done) {
6456             start = 1;
6457             first_vertical_edge_done = 0;
6458         }
6459
6460         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6461             start = 1;
6462
6463         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6464             && !IS_INTERLACED(mb_type)
6465             && IS_INTERLACED(mbm_type)
6466             ) {
6467             // This is a special case in the norm where the filtering must
6468             // be done twice (one each of the field) even if we are in a
6469             // frame macroblock.
6470             //
6471             static const int nnz_idx[4] = {4,5,6,3};
6472             unsigned int tmp_linesize   = 2 *   linesize;
6473             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6474             int mbn_xy = mb_xy - 2 * s->mb_stride;
6475             int qp;
6476             int i, j;
6477             int16_t bS[4];
6478
6479             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6480                 if( IS_INTRA(mb_type) ||
6481                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6482                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6483                 } else {
6484                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6485                     for( i = 0; i < 4; i++ ) {
6486                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6487                             mbn_nnz[nnz_idx[i]] != 0 )
6488                             bS[i] = 2;
6489                         else
6490                             bS[i] = 1;
6491                     }
6492                 }
6493                 // Do not use s->qscale as luma quantizer because it has not the same
6494                 // value in IPCM macroblocks.
6495                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6496                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6497                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6498                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6499                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6500                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6501                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6502                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6503             }
6504
6505             start = 1;
6506         }
6507
6508         /* Calculate bS */
6509         for( edge = start; edge < edges; edge++ ) {
6510             /* mbn_xy: neighbor macroblock */
6511             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6512             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6513             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6514             int16_t bS[4];
6515             int qp;
6516
6517             if( (edge&1) && IS_8x8DCT(mb_type) )
6518                 continue;
6519
6520             if( IS_INTRA(mb_type) ||
6521                 IS_INTRA(mbn_type) ) {
6522                 int value;
6523                 if (edge == 0) {
6524                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6525                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6526                     ) {
6527                         value = 4;
6528                     } else {
6529                         value = 3;
6530                     }
6531                 } else {
6532                     value = 3;
6533                 }
6534                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6535             } else {
6536                 int i, l;
6537                 int mv_done;
6538
6539                 if( edge & mask_edge ) {
6540                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6541                     mv_done = 1;
6542                 }
6543                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6544                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6545                     mv_done = 1;
6546                 }
6547                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6548                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6549                     int bn_idx= b_idx - (dir ? 8:1);
6550                     int v = 0;
6551
6552                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6553                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6554                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6555                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6556                     }
6557
6558                     if(h->slice_type_nos == FF_B_TYPE && v){
6559                         v=0;
6560                         for( l = 0; !v && l < 2; l++ ) {
6561                             int ln= 1-l;
6562                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6563                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6564                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6565                         }
6566                     }
6567
6568                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6569                     mv_done = 1;
6570                 }
6571                 else
6572                     mv_done = 0;
6573
6574                 for( i = 0; i < 4; i++ ) {
6575                     int x = dir == 0 ? edge : i;
6576                     int y = dir == 0 ? i    : edge;
6577                     int b_idx= 8 + 4 + x + 8*y;
6578                     int bn_idx= b_idx - (dir ? 8:1);
6579
6580                     if( h->non_zero_count_cache[b_idx] != 0 ||
6581                         h->non_zero_count_cache[bn_idx] != 0 ) {
6582                         bS[i] = 2;
6583                     }
6584                     else if(!mv_done)
6585                     {
6586                         bS[i] = 0;
6587                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6588                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6589                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6590                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6591                                 bS[i] = 1;
6592                                 break;
6593                             }
6594                         }
6595
6596                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6597                             bS[i] = 0;
6598                             for( l = 0; l < 2; l++ ) {
6599                                 int ln= 1-l;
6600                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6601                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6602                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6603                                     bS[i] = 1;
6604                                     break;
6605                                 }
6606                             }
6607                         }
6608                     }
6609                 }
6610
6611                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6612                     continue;
6613             }
6614
6615             /* Filter edge */
6616             // Do not use s->qscale as luma quantizer because it has not the same
6617             // value in IPCM macroblocks.
6618             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6619             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6620             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6621             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6622             if( dir == 0 ) {
6623                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6624                 if( (edge&1) == 0 ) {
6625                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6626                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6627                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6628                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6629                 }
6630             } else {
6631                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6632                 if( (edge&1) == 0 ) {
6633                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6634                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6635                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6636                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6637                 }
6638             }
6639         }
6640     }
6641 }
6642
6643 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6644     MpegEncContext * const s = &h->s;
6645     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6646
6647     s->mb_skip_run= -1;
6648
6649     if( h->pps.cabac ) {
6650         int i;
6651
6652         /* realign */
6653         align_get_bits( &s->gb );
6654
6655         /* init cabac */
6656         ff_init_cabac_states( &h->cabac);
6657         ff_init_cabac_decoder( &h->cabac,
6658                                s->gb.buffer + get_bits_count(&s->gb)/8,
6659                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6660         /* calculate pre-state */
6661         for( i= 0; i < 460; i++ ) {
6662             int pre;
6663             if( h->slice_type_nos == FF_I_TYPE )
6664                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6665             else
6666                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6667
6668             if( pre <= 63 )
6669                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6670             else
6671                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6672         }
6673
6674         for(;;){
6675 //START_TIMER
6676             int ret = decode_mb_cabac(h);
6677             int eos;
6678 //STOP_TIMER("decode_mb_cabac")
6679
6680             if(ret>=0) hl_decode_mb(h);
6681
6682             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6683                 s->mb_y++;
6684
6685                 if(ret>=0) ret = decode_mb_cabac(h);
6686
6687                 if(ret>=0) hl_decode_mb(h);
6688                 s->mb_y--;
6689             }
6690             eos = get_cabac_terminate( &h->cabac );
6691
6692             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6693                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6694                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6695                 return -1;
6696             }
6697
6698             if( ++s->mb_x >= s->mb_width ) {
6699                 s->mb_x = 0;
6700                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6701                 ++s->mb_y;
6702                 if(FIELD_OR_MBAFF_PICTURE) {
6703                     ++s->mb_y;
6704                 }
6705             }
6706
6707             if( eos || s->mb_y >= s->mb_height ) {
6708                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6709                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6710                 return 0;
6711             }
6712         }
6713
6714     } else {
6715         for(;;){
6716             int ret = decode_mb_cavlc(h);
6717
6718             if(ret>=0) hl_decode_mb(h);
6719
6720             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6721                 s->mb_y++;
6722                 ret = decode_mb_cavlc(h);
6723
6724                 if(ret>=0) hl_decode_mb(h);
6725                 s->mb_y--;
6726             }
6727
6728             if(ret<0){
6729                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6730                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6731
6732                 return -1;
6733             }
6734
6735             if(++s->mb_x >= s->mb_width){
6736                 s->mb_x=0;
6737                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6738                 ++s->mb_y;
6739                 if(FIELD_OR_MBAFF_PICTURE) {
6740                     ++s->mb_y;
6741                 }
6742                 if(s->mb_y >= s->mb_height){
6743                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6744
6745                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6746                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6747
6748                         return 0;
6749                     }else{
6750                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6751
6752                         return -1;
6753                     }
6754                 }
6755             }
6756
6757             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6758                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6759                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6760                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6761
6762                     return 0;
6763                 }else{
6764                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6765
6766                     return -1;
6767                 }
6768             }
6769         }
6770     }
6771
6772 #if 0
6773     for(;s->mb_y < s->mb_height; s->mb_y++){
6774         for(;s->mb_x < s->mb_width; s->mb_x++){
6775             int ret= decode_mb(h);
6776
6777             hl_decode_mb(h);
6778
6779             if(ret<0){
6780                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6781                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6782
6783                 return -1;
6784             }
6785
6786             if(++s->mb_x >= s->mb_width){
6787                 s->mb_x=0;
6788                 if(++s->mb_y >= s->mb_height){
6789                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6790                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6791
6792                         return 0;
6793                     }else{
6794                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6795
6796                         return -1;
6797                     }
6798                 }
6799             }
6800
6801             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6802                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6803                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6804
6805                     return 0;
6806                 }else{
6807                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6808
6809                     return -1;
6810                 }
6811             }
6812         }
6813         s->mb_x=0;
6814         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6815     }
6816 #endif
6817     return -1; //not reached
6818 }
6819
6820 static int decode_unregistered_user_data(H264Context *h, int size){
6821     MpegEncContext * const s = &h->s;
6822     uint8_t user_data[16+256];
6823     int e, build, i;
6824
6825     if(size<16)
6826         return -1;
6827
6828     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6829         user_data[i]= get_bits(&s->gb, 8);
6830     }
6831
6832     user_data[i]= 0;
6833     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6834     if(e==1 && build>=0)
6835         h->x264_build= build;
6836
6837     if(s->avctx->debug & FF_DEBUG_BUGS)
6838         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6839
6840     for(; i<size; i++)
6841         skip_bits(&s->gb, 8);
6842
6843     return 0;
6844 }
6845
6846 static int decode_sei(H264Context *h){
6847     MpegEncContext * const s = &h->s;
6848
6849     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6850         int size, type;
6851
6852         type=0;
6853         do{
6854             type+= show_bits(&s->gb, 8);
6855         }while(get_bits(&s->gb, 8) == 255);
6856
6857         size=0;
6858         do{
6859             size+= show_bits(&s->gb, 8);
6860         }while(get_bits(&s->gb, 8) == 255);
6861
6862         switch(type){
6863         case 5:
6864             if(decode_unregistered_user_data(h, size) < 0)
6865                 return -1;
6866             break;
6867         default:
6868             skip_bits(&s->gb, 8*size);
6869         }
6870
6871         //FIXME check bits here
6872         align_get_bits(&s->gb);
6873     }
6874
6875     return 0;
6876 }
6877
6878 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6879     MpegEncContext * const s = &h->s;
6880     int cpb_count, i;
6881     cpb_count = get_ue_golomb(&s->gb) + 1;
6882     get_bits(&s->gb, 4); /* bit_rate_scale */
6883     get_bits(&s->gb, 4); /* cpb_size_scale */
6884     for(i=0; i<cpb_count; i++){
6885         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6886         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6887         get_bits1(&s->gb);     /* cbr_flag */
6888     }
6889     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6890     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6891     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6892     get_bits(&s->gb, 5); /* time_offset_length */
6893 }
6894
6895 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6896     MpegEncContext * const s = &h->s;
6897     int aspect_ratio_info_present_flag;
6898     unsigned int aspect_ratio_idc;
6899     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6900
6901     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6902
6903     if( aspect_ratio_info_present_flag ) {
6904         aspect_ratio_idc= get_bits(&s->gb, 8);
6905         if( aspect_ratio_idc == EXTENDED_SAR ) {
6906             sps->sar.num= get_bits(&s->gb, 16);
6907             sps->sar.den= get_bits(&s->gb, 16);
6908         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6909             sps->sar=  pixel_aspect[aspect_ratio_idc];
6910         }else{
6911             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6912             return -1;
6913         }
6914     }else{
6915         sps->sar.num=
6916         sps->sar.den= 0;
6917     }
6918 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6919
6920     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6921         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6922     }
6923
6924     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6925         get_bits(&s->gb, 3);    /* video_format */
6926         get_bits1(&s->gb);      /* video_full_range_flag */
6927         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6928             get_bits(&s->gb, 8); /* colour_primaries */
6929             get_bits(&s->gb, 8); /* transfer_characteristics */
6930             get_bits(&s->gb, 8); /* matrix_coefficients */
6931         }
6932     }
6933
6934     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6935         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6936         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6937     }
6938
6939     sps->timing_info_present_flag = get_bits1(&s->gb);
6940     if(sps->timing_info_present_flag){
6941         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6942         sps->time_scale = get_bits_long(&s->gb, 32);
6943         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6944     }
6945
6946     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6947     if(nal_hrd_parameters_present_flag)
6948         decode_hrd_parameters(h, sps);
6949     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6950     if(vcl_hrd_parameters_present_flag)
6951         decode_hrd_parameters(h, sps);
6952     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6953         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6954     get_bits1(&s->gb);         /* pic_struct_present_flag */
6955
6956     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6957     if(sps->bitstream_restriction_flag){
6958         unsigned int num_reorder_frames;
6959         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6960         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6961         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6962         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6963         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6964         num_reorder_frames= get_ue_golomb(&s->gb);
6965         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6966
6967         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6968             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6969             return -1;
6970         }
6971
6972         sps->num_reorder_frames= num_reorder_frames;
6973     }
6974
6975     return 0;
6976 }
6977
6978 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6979                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6980     MpegEncContext * const s = &h->s;
6981     int i, last = 8, next = 8;
6982     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6983     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6984         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6985     else
6986     for(i=0;i<size;i++){
6987         if(next)
6988             next = (last + get_se_golomb(&s->gb)) & 0xff;
6989         if(!i && !next){ /* matrix not written, we use the preset one */
6990             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6991             break;
6992         }
6993         last = factors[scan[i]] = next ? next : last;
6994     }
6995 }
6996
6997 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6998                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6999     MpegEncContext * const s = &h->s;
7000     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7001     const uint8_t *fallback[4] = {
7002         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7003         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7004         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7005         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7006     };
7007     if(get_bits1(&s->gb)){
7008         sps->scaling_matrix_present |= is_sps;
7009         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7010         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7011         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7012         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7013         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7014         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7015         if(is_sps || pps->transform_8x8_mode){
7016             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7017             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7018         }
7019     } else if(fallback_sps) {
7020         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7021         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7022     }
7023 }
7024
7025 /**
7026  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7027  */
7028 static void *
7029 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7030                     const size_t size, const char *name)
7031 {
7032     if(id>=max) {
7033         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7034         return NULL;
7035     }
7036
7037     if(!vec[id]) {
7038         vec[id] = av_mallocz(size);
7039         if(vec[id] == NULL)
7040             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7041     }
7042     return vec[id];
7043 }
7044
7045 static inline int decode_seq_parameter_set(H264Context *h){
7046     MpegEncContext * const s = &h->s;
7047     int profile_idc, level_idc;
7048     unsigned int sps_id, tmp, mb_width, mb_height;
7049     int i;
7050     SPS *sps;
7051
7052     profile_idc= get_bits(&s->gb, 8);
7053     get_bits1(&s->gb);   //constraint_set0_flag
7054     get_bits1(&s->gb);   //constraint_set1_flag
7055     get_bits1(&s->gb);   //constraint_set2_flag
7056     get_bits1(&s->gb);   //constraint_set3_flag
7057     get_bits(&s->gb, 4); // reserved
7058     level_idc= get_bits(&s->gb, 8);
7059     sps_id= get_ue_golomb(&s->gb);
7060
7061     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7062     if(sps == NULL)
7063         return -1;
7064
7065     sps->profile_idc= profile_idc;
7066     sps->level_idc= level_idc;
7067
7068     if(sps->profile_idc >= 100){ //high profile
7069         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7070         if(sps->chroma_format_idc == 3)
7071             get_bits1(&s->gb);  //residual_color_transform_flag
7072         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7073         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7074         sps->transform_bypass = get_bits1(&s->gb);
7075         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7076     }else{
7077         sps->scaling_matrix_present = 0;
7078         sps->chroma_format_idc= 1;
7079     }
7080
7081     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7082     sps->poc_type= get_ue_golomb(&s->gb);
7083
7084     if(sps->poc_type == 0){ //FIXME #define
7085         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7086     } else if(sps->poc_type == 1){//FIXME #define
7087         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7088         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7089         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7090         tmp= get_ue_golomb(&s->gb);
7091
7092         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7093             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7094             return -1;
7095         }
7096         sps->poc_cycle_length= tmp;
7097
7098         for(i=0; i<sps->poc_cycle_length; i++)
7099             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7100     }else if(sps->poc_type != 2){
7101         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7102         return -1;
7103     }
7104
7105     tmp= get_ue_golomb(&s->gb);
7106     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7107         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7108         return -1;
7109     }
7110     sps->ref_frame_count= tmp;
7111     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7112     mb_width= get_ue_golomb(&s->gb) + 1;
7113     mb_height= get_ue_golomb(&s->gb) + 1;
7114     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7115        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7116         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7117         return -1;
7118     }
7119     sps->mb_width = mb_width;
7120     sps->mb_height= mb_height;
7121
7122     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7123     if(!sps->frame_mbs_only_flag)
7124         sps->mb_aff= get_bits1(&s->gb);
7125     else
7126         sps->mb_aff= 0;
7127
7128     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7129
7130 #ifndef ALLOW_INTERLACE
7131     if(sps->mb_aff)
7132         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7133 #endif
7134     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7135         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7136
7137     sps->crop= get_bits1(&s->gb);
7138     if(sps->crop){
7139         sps->crop_left  = get_ue_golomb(&s->gb);
7140         sps->crop_right = get_ue_golomb(&s->gb);
7141         sps->crop_top   = get_ue_golomb(&s->gb);
7142         sps->crop_bottom= get_ue_golomb(&s->gb);
7143         if(sps->crop_left || sps->crop_top){
7144             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7145         }
7146         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7147             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7148         }
7149     }else{
7150         sps->crop_left  =
7151         sps->crop_right =
7152         sps->crop_top   =
7153         sps->crop_bottom= 0;
7154     }
7155
7156     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7157     if( sps->vui_parameters_present_flag )
7158         decode_vui_parameters(h, sps);
7159
7160     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7161         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7162                sps_id, sps->profile_idc, sps->level_idc,
7163                sps->poc_type,
7164                sps->ref_frame_count,
7165                sps->mb_width, sps->mb_height,
7166                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7167                sps->direct_8x8_inference_flag ? "8B8" : "",
7168                sps->crop_left, sps->crop_right,
7169                sps->crop_top, sps->crop_bottom,
7170                sps->vui_parameters_present_flag ? "VUI" : "",
7171                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7172                );
7173     }
7174     return 0;
7175 }
7176
7177 static void
7178 build_qp_table(PPS *pps, int t, int index)
7179 {
7180     int i;
7181     for(i = 0; i < 52; i++)
7182         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7183 }
7184
7185 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7186     MpegEncContext * const s = &h->s;
7187     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7188     PPS *pps;
7189
7190     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7191     if(pps == NULL)
7192         return -1;
7193
7194     tmp= get_ue_golomb(&s->gb);
7195     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7196         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7197         return -1;
7198     }
7199     pps->sps_id= tmp;
7200
7201     pps->cabac= get_bits1(&s->gb);
7202     pps->pic_order_present= get_bits1(&s->gb);
7203     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7204     if(pps->slice_group_count > 1 ){
7205         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7206         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7207         switch(pps->mb_slice_group_map_type){
7208         case 0:
7209 #if 0
7210 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7211 |    run_length[ i ]                                |1  |ue(v)   |
7212 #endif
7213             break;
7214         case 2:
7215 #if 0
7216 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7217 |{                                                  |   |        |
7218 |    top_left_mb[ i ]                               |1  |ue(v)   |
7219 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7220 |   }                                               |   |        |
7221 #endif
7222             break;
7223         case 3:
7224         case 4:
7225         case 5:
7226 #if 0
7227 |   slice_group_change_direction_flag               |1  |u(1)    |
7228 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7229 #endif
7230             break;
7231         case 6:
7232 #if 0
7233 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7234 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7235 |)                                                  |   |        |
7236 |    slice_group_id[ i ]                            |1  |u(v)    |
7237 #endif
7238             break;
7239         }
7240     }
7241     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7242     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7243     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7244         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7245         pps->ref_count[0]= pps->ref_count[1]= 1;
7246         return -1;
7247     }
7248
7249     pps->weighted_pred= get_bits1(&s->gb);
7250     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7251     pps->init_qp= get_se_golomb(&s->gb) + 26;
7252     pps->init_qs= get_se_golomb(&s->gb) + 26;
7253     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7254     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7255     pps->constrained_intra_pred= get_bits1(&s->gb);
7256     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7257
7258     pps->transform_8x8_mode= 0;
7259     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7260     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7261     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7262
7263     if(get_bits_count(&s->gb) < bit_length){
7264         pps->transform_8x8_mode= get_bits1(&s->gb);
7265         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7266         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7267     } else {
7268         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7269     }
7270
7271     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7272     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7273     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7274         h->pps.chroma_qp_diff= 1;
7275
7276     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7277         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7278                pps_id, pps->sps_id,
7279                pps->cabac ? "CABAC" : "CAVLC",
7280                pps->slice_group_count,
7281                pps->ref_count[0], pps->ref_count[1],
7282                pps->weighted_pred ? "weighted" : "",
7283                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7284                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7285                pps->constrained_intra_pred ? "CONSTR" : "",
7286                pps->redundant_pic_cnt_present ? "REDU" : "",
7287                pps->transform_8x8_mode ? "8x8DCT" : ""
7288                );
7289     }
7290
7291     return 0;
7292 }
7293
7294 /**
7295  * Call decode_slice() for each context.
7296  *
7297  * @param h h264 master context
7298  * @param context_count number of contexts to execute
7299  */
7300 static void execute_decode_slices(H264Context *h, int context_count){
7301     MpegEncContext * const s = &h->s;
7302     AVCodecContext * const avctx= s->avctx;
7303     H264Context *hx;
7304     int i;
7305
7306     if(context_count == 1) {
7307         decode_slice(avctx, h);
7308     } else {
7309         for(i = 1; i < context_count; i++) {
7310             hx = h->thread_context[i];
7311             hx->s.error_resilience = avctx->error_resilience;
7312             hx->s.error_count = 0;
7313         }
7314
7315         avctx->execute(avctx, (void *)decode_slice,
7316                        (void **)h->thread_context, NULL, context_count);
7317
7318         /* pull back stuff from slices to master context */
7319         hx = h->thread_context[context_count - 1];
7320         s->mb_x = hx->s.mb_x;
7321         s->mb_y = hx->s.mb_y;
7322         s->dropable = hx->s.dropable;
7323         s->picture_structure = hx->s.picture_structure;
7324         for(i = 1; i < context_count; i++)
7325             h->s.error_count += h->thread_context[i]->s.error_count;
7326     }
7327 }
7328
7329
7330 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7331     MpegEncContext * const s = &h->s;
7332     AVCodecContext * const avctx= s->avctx;
7333     int buf_index=0;
7334     H264Context *hx; ///< thread context
7335     int context_count = 0;
7336
7337     h->max_contexts = avctx->thread_count;
7338 #if 0
7339     int i;
7340     for(i=0; i<50; i++){
7341         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7342     }
7343 #endif
7344     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7345         h->current_slice = 0;
7346         if (!s->first_field)
7347             s->current_picture_ptr= NULL;
7348     }
7349
7350     for(;;){
7351         int consumed;
7352         int dst_length;
7353         int bit_length;
7354         const uint8_t *ptr;
7355         int i, nalsize = 0;
7356         int err;
7357
7358         if(h->is_avc) {
7359             if(buf_index >= buf_size) break;
7360             nalsize = 0;
7361             for(i = 0; i < h->nal_length_size; i++)
7362                 nalsize = (nalsize << 8) | buf[buf_index++];
7363             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7364                 if(nalsize == 1){
7365                     buf_index++;
7366                     continue;
7367                 }else{
7368                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7369                     break;
7370                 }
7371             }
7372         } else {
7373             // start code prefix search
7374             for(; buf_index + 3 < buf_size; buf_index++){
7375                 // This should always succeed in the first iteration.
7376                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7377                     break;
7378             }
7379
7380             if(buf_index+3 >= buf_size) break;
7381
7382             buf_index+=3;
7383         }
7384
7385         hx = h->thread_context[context_count];
7386
7387         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7388         if (ptr==NULL || dst_length < 0){
7389             return -1;
7390         }
7391         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7392             dst_length--;
7393         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7394
7395         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7396             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7397         }
7398
7399         if (h->is_avc && (nalsize != consumed)){
7400             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7401             consumed= nalsize;
7402         }
7403
7404         buf_index += consumed;
7405
7406         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7407            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7408             continue;
7409
7410       again:
7411         err = 0;
7412         switch(hx->nal_unit_type){
7413         case NAL_IDR_SLICE:
7414             if (h->nal_unit_type != NAL_IDR_SLICE) {
7415                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7416                 return -1;
7417             }
7418             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7419         case NAL_SLICE:
7420             init_get_bits(&hx->s.gb, ptr, bit_length);
7421             hx->intra_gb_ptr=
7422             hx->inter_gb_ptr= &hx->s.gb;
7423             hx->s.data_partitioning = 0;
7424
7425             if((err = decode_slice_header(hx, h)))
7426                break;
7427
7428             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7429             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7430                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7431                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7432                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7433                && avctx->skip_frame < AVDISCARD_ALL)
7434                 context_count++;
7435             break;
7436         case NAL_DPA:
7437             init_get_bits(&hx->s.gb, ptr, bit_length);
7438             hx->intra_gb_ptr=
7439             hx->inter_gb_ptr= NULL;
7440             hx->s.data_partitioning = 1;
7441
7442             err = decode_slice_header(hx, h);
7443             break;
7444         case NAL_DPB:
7445             init_get_bits(&hx->intra_gb, ptr, bit_length);
7446             hx->intra_gb_ptr= &hx->intra_gb;
7447             break;
7448         case NAL_DPC:
7449             init_get_bits(&hx->inter_gb, ptr, bit_length);
7450             hx->inter_gb_ptr= &hx->inter_gb;
7451
7452             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7453                && s->context_initialized
7454                && s->hurry_up < 5
7455                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7456                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7457                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7458                && avctx->skip_frame < AVDISCARD_ALL)
7459                 context_count++;
7460             break;
7461         case NAL_SEI:
7462             init_get_bits(&s->gb, ptr, bit_length);
7463             decode_sei(h);
7464             break;
7465         case NAL_SPS:
7466             init_get_bits(&s->gb, ptr, bit_length);
7467             decode_seq_parameter_set(h);
7468
7469             if(s->flags& CODEC_FLAG_LOW_DELAY)
7470                 s->low_delay=1;
7471
7472             if(avctx->has_b_frames < 2)
7473                 avctx->has_b_frames= !s->low_delay;
7474             break;
7475         case NAL_PPS:
7476             init_get_bits(&s->gb, ptr, bit_length);
7477
7478             decode_picture_parameter_set(h, bit_length);
7479
7480             break;
7481         case NAL_AUD:
7482         case NAL_END_SEQUENCE:
7483         case NAL_END_STREAM:
7484         case NAL_FILLER_DATA:
7485         case NAL_SPS_EXT:
7486         case NAL_AUXILIARY_SLICE:
7487             break;
7488         default:
7489             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7490         }
7491
7492         if(context_count == h->max_contexts) {
7493             execute_decode_slices(h, context_count);
7494             context_count = 0;
7495         }
7496
7497         if (err < 0)
7498             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7499         else if(err == 1) {
7500             /* Slice could not be decoded in parallel mode, copy down
7501              * NAL unit stuff to context 0 and restart. Note that
7502              * rbsp_buffer is not transferred, but since we no longer
7503              * run in parallel mode this should not be an issue. */
7504             h->nal_unit_type = hx->nal_unit_type;
7505             h->nal_ref_idc   = hx->nal_ref_idc;
7506             hx = h;
7507             goto again;
7508         }
7509     }
7510     if(context_count)
7511         execute_decode_slices(h, context_count);
7512     return buf_index;
7513 }
7514
/**
 * Returns the number of bytes consumed for building the current frame.
 *
 * A position of 0 is reported as 1 so that callers always make progress, and
 * a position within 10 bytes of the end of the buffer (or past it) is
 * reported as the whole buffer.
 *
 * @param s        unused
 * @param pos      position reached in the buffer
 * @param buf_size total size of the buffer
 */
static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
        const int consumed = pos ? pos : 1; //avoid infinite loops (i doubt that is needed but ...)

        return (consumed + 10 > buf_size) ? buf_size : consumed;
}
7524
7525 static int decode_frame(AVCodecContext *avctx,
7526                              void *data, int *data_size,
7527                              const uint8_t *buf, int buf_size)
7528 {
7529     H264Context *h = avctx->priv_data;
7530     MpegEncContext *s = &h->s;
7531     AVFrame *pict = data;
7532     int buf_index;
7533
7534     s->flags= avctx->flags;
7535     s->flags2= avctx->flags2;
7536
7537    /* end of stream, output what is still in the buffers */
7538     if (buf_size == 0) {
7539         Picture *out;
7540         int i, out_idx;
7541
7542 //FIXME factorize this with the output code below
7543         out = h->delayed_pic[0];
7544         out_idx = 0;
7545         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7546             if(h->delayed_pic[i]->poc < out->poc){
7547                 out = h->delayed_pic[i];
7548                 out_idx = i;
7549             }
7550
7551         for(i=out_idx; h->delayed_pic[i]; i++)
7552             h->delayed_pic[i] = h->delayed_pic[i+1];
7553
7554         if(out){
7555             *data_size = sizeof(AVFrame);
7556             *pict= *(AVFrame*)out;
7557         }
7558
7559         return 0;
7560     }
7561
7562     if(h->is_avc && !h->got_avcC) {
7563         int i, cnt, nalsize;
7564         unsigned char *p = avctx->extradata;
7565         if(avctx->extradata_size < 7) {
7566             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7567             return -1;
7568         }
7569         if(*p != 1) {
7570             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7571             return -1;
7572         }
7573         /* sps and pps in the avcC always have length coded with 2 bytes,
7574            so put a fake nal_length_size = 2 while parsing them */
7575         h->nal_length_size = 2;
7576         // Decode sps from avcC
7577         cnt = *(p+5) & 0x1f; // Number of sps
7578         p += 6;
7579         for (i = 0; i < cnt; i++) {
7580             nalsize = AV_RB16(p) + 2;
7581             if(decode_nal_units(h, p, nalsize) < 0) {
7582                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7583                 return -1;
7584             }
7585             p += nalsize;
7586         }
7587         // Decode pps from avcC
7588         cnt = *(p++); // Number of pps
7589         for (i = 0; i < cnt; i++) {
7590             nalsize = AV_RB16(p) + 2;
7591             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7592                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7593                 return -1;
7594             }
7595             p += nalsize;
7596         }
7597         // Now store right nal length size, that will be use to parse all other nals
7598         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7599         // Do not reparse avcC
7600         h->got_avcC = 1;
7601     }
7602
7603     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7604         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7605             return -1;
7606     }
7607
7608     buf_index=decode_nal_units(h, buf, buf_size);
7609     if(buf_index < 0)
7610         return -1;
7611
7612     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7613         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7614         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7615         return -1;
7616     }
7617
7618     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7619         Picture *out = s->current_picture_ptr;
7620         Picture *cur = s->current_picture_ptr;
7621         int i, pics, cross_idr, out_of_order, out_idx;
7622
7623         s->mb_y= 0;
7624
7625         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7626         s->current_picture_ptr->pict_type= s->pict_type;
7627
7628         if(!s->dropable) {
7629             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7630             h->prev_poc_msb= h->poc_msb;
7631             h->prev_poc_lsb= h->poc_lsb;
7632         }
7633         h->prev_frame_num_offset= h->frame_num_offset;
7634         h->prev_frame_num= h->frame_num;
7635
7636         /*
7637          * FIXME: Error handling code does not seem to support interlaced
7638          * when slices span multiple rows
7639          * The ff_er_add_slice calls don't work right for bottom
7640          * fields; they cause massive erroneous error concealing
7641          * Error marking covers both fields (top and bottom).
7642          * This causes a mismatched s->error_count
7643          * and a bad error table. Further, the error count goes to
7644          * INT_MAX when called for bottom field, because mb_y is
7645          * past end by one (callers fault) and resync_mb_y != 0
7646          * causes problems for the first MB line, too.
7647          */
7648         if (!FIELD_PICTURE)
7649             ff_er_frame_end(s);
7650
7651         MPV_frame_end(s);
7652
7653         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7654             /* Wait for second field. */
7655             *data_size = 0;
7656
7657         } else {
7658             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7659             /* Derive top_field_first from field pocs. */
7660             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7661
7662         //FIXME do something with unavailable reference frames
7663
7664             /* Sort B-frames into display order */
7665
7666             if(h->sps.bitstream_restriction_flag
7667                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7668                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7669                 s->low_delay = 0;
7670             }
7671
7672             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7673                && !h->sps.bitstream_restriction_flag){
7674                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7675                 s->low_delay= 0;
7676             }
7677
7678             pics = 0;
7679             while(h->delayed_pic[pics]) pics++;
7680
7681             assert(pics <= MAX_DELAYED_PIC_COUNT);
7682
7683             h->delayed_pic[pics++] = cur;
7684             if(cur->reference == 0)
7685                 cur->reference = DELAYED_PIC_REF;
7686
7687             out = h->delayed_pic[0];
7688             out_idx = 0;
7689             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7690                 if(h->delayed_pic[i]->poc < out->poc){
7691                     out = h->delayed_pic[i];
7692                     out_idx = i;
7693                 }
7694             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7695
7696             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7697
7698             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7699                 { }
7700             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7701                || (s->low_delay &&
7702                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7703                  || cur->pict_type == FF_B_TYPE)))
7704             {
7705                 s->low_delay = 0;
7706                 s->avctx->has_b_frames++;
7707             }
7708
7709             if(out_of_order || pics > s->avctx->has_b_frames){
7710                 out->reference &= ~DELAYED_PIC_REF;
7711                 for(i=out_idx; h->delayed_pic[i]; i++)
7712                     h->delayed_pic[i] = h->delayed_pic[i+1];
7713             }
7714             if(!out_of_order && pics > s->avctx->has_b_frames){
7715                 *data_size = sizeof(AVFrame);
7716
7717                 h->outputed_poc = out->poc;
7718                 *pict= *(AVFrame*)out;
7719             }else{
7720                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7721             }
7722         }
7723     }
7724
7725     assert(pict->data[0] || !*data_size);
7726     ff_print_debug_info(s, pict);
7727 //printf("out %d\n", (int)pict->data[0]);
7728 #if 0 //?
7729
7730     /* Return the Picture timestamp as the frame number */
7731     /* we subtract 1 because it is added on utils.c     */
7732     avctx->frame_number = s->picture_number - 1;
7733 #endif
7734     return get_consumed_bytes(s, buf_index, buf_size);
7735 }
#if 0
/* Disabled code: fills h->mb_avail[] with flags telling which neighbours of
 * the current macroblock lie in the current slice
 * (0..2: the row above at x-1, x, x+1; 3: the macroblock to the left). */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int above= mb_xy - s->mb_stride;

    h->mb_avail[0]=
    h->mb_avail[1]=
    h->mb_avail[2]= 0;

    if(s->mb_y){
        h->mb_avail[0]= s->mb_x                 && h->slice_table[above - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[above    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[above + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7755
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined: writes COUNT unsigned and
 * COUNT signed exp-golomb codes into a buffer, reads them back and reports
 * every value that does not round-trip. The 4x4 (I)DCT, quantizer and NAL
 * layer tests are disabled.
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written at position i was i - COUNT/2, so that is
             * what must be reported as the expected value */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7930
7931
7932 static av_cold int decode_end(AVCodecContext *avctx)
7933 {
7934     H264Context *h = avctx->priv_data;
7935     MpegEncContext *s = &h->s;
7936
7937     av_freep(&h->rbsp_buffer[0]);
7938     av_freep(&h->rbsp_buffer[1]);
7939     free_tables(h); //FIXME cleanup init stuff perhaps
7940     MPV_common_end(s);
7941
7942 //    memset(h, 0, sizeof(H264Context));
7943
7944     return 0;
7945 }
7946
7947
7948 AVCodec h264_decoder = {
7949     "h264",
7950     CODEC_TYPE_VIDEO,
7951     CODEC_ID_H264,
7952     sizeof(H264Context),
7953     decode_init,
7954     NULL,
7955     decode_end,
7956     decode_frame,
7957     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7958     .flush= flush_dpb,
7959     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7960 };
7961
7962 #include "svq3.c"