git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  /*
   * File-scope VLC tables. Nothing in this part of the file initialises or
   * reads them. NOTE(review): judging by the names they presumably hold the
   * CAVLC coeff_token, total_zeros and run_before codes (with separate
   * chroma DC variants) -- confirm against the table setup code.
   */
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  /*
   * Forward declarations of static helpers; their definitions are not in
   * this part of the file. filter_mb() and filter_mb_fast() take the same
   * argument list.
   */
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  /**
   * Packs two 16 bit values into one 32 bit word so that, when the word is
   * stored to memory, the low 16 bits of a occupy the lower address and the
   * low 16 bits of b the higher one (hence the WORDS_BIGENDIAN swap).
   *
   * Callers in this file pass negative values (reference index sentinels),
   * so the shift is done on an unsigned type: left-shifting a negative int
   * is undefined behaviour in C. The resulting bit pattern is the same one
   * the signed expression produced on two's complement targets.
   *
   * @param a value whose low 16 bits go to the lower address
   * @param b value whose low 16 bits go to the higher address
   * @return the packed 32 bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return ((uint32_t)b & 0xFFFF) + ((uint32_t)a << 16);
  #else
     return ((uint32_t)a & 0xFFFF) + ((uint32_t)b << 16);
  #endif
  }
  72
  /** Lookup table: ff_rem6[i] == i % 6 for i = 0..51. */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  76
  /** Lookup table: ff_div6[i] == i / 6 for i = 0..51. */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  80
  81
  /**
   * Loads the neighbour data needed to process the current macroblock
   * (h->mb_xy) into the per-macroblock caches of the H264Context.
   *
   * Depending on the macroblock type this fills the intra "samples
   * available" masks, intra4x4_pred_mode_cache, non_zero_count_cache, the
   * CABAC top/left cbp values, mv_cache / ref_cache / mvd_cache /
   * direct_cache and finally neighbor_transform_size. The chosen top and
   * left neighbour indices are stored in h->top_mb_xy and h->left_mb_xy[].
   *
   * @param h           decoder context; its caches are overwritten
   * @param mb_type     type flags of the current macroblock
   * @param for_deblock nonzero when called for deblocking: neighbours are
   *                    then accepted whenever their slice_table entry is
   *                    below 255 (instead of requiring the current slice),
   *                    the top-left/top-right neighbours are ignored, and
   *                    the function may return early without touching
   *                    anything
   */
  82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  83     MpegEncContext * const s = &h->s;
  84     const int mb_xy= h->mb_xy;
  85     int topleft_xy, top_xy, topright_xy, left_xy[2];
  86     int topleft_type, top_type, topright_type, left_type[2];
  87     int left_block[8];
         // -1 (all bits set) keeps the extra offsets in the "&" masks of the
         // top-left load further down; the MBAFF code may set it to 0 to drop them
  88     int topleft_partition= -1;
  89     int i;
  90
         // macroblock above the current one; the stride is shifted by FIELD_PICTURE
  91     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  92
  93     //FIXME deblocking could skip the intra and nnz parts.
         // Deblocking shortcut (non-MBAFF only): nothing is reloaded when
         // h->slice_num is 1 or the top neighbour belongs to the same slice.
  94     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  95         return;
  96
  97     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  98      * stuff, I can't imagine that these complex rules are worth it. */
  99
         // Default neighbour positions and left-neighbour block indices; the
         // MBAFF branch below may override them.
 100     topleft_xy = top_xy - 1;
 101     topright_xy= top_xy + 1;
 102     left_xy[1] = left_xy[0] = mb_xy-1;
 103     left_block[0]= 0;
 104     left_block[1]= 1;
 105     left_block[2]= 2;
 106     left_block[3]= 3;
 107     left_block[4]= 7;
 108     left_block[5]= 10;
 109     left_block[6]= 8;
 110     left_block[7]= 11;
 111     if(FRAME_MBAFF){
             // Neighbours are looked up per macroblock pair; which member of the
             // pair is used depends on the frame/field flags of the current and
             // the neighbouring pairs and on whether this is the bottom macroblock.
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
             // Left pair coded differently (frame vs. field) from the current
             // pair: start from the top macroblock of the left pair and remap
             // which of its blocks border the current macroblock.
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block[0]= 2;
 150                     left_block[1]= 2;
 151                     left_block[2]= 3;
 152                     left_block[3]= 3;
 153                     left_block[4]= 8;
 154                     left_block[5]= 11;
 155                     left_block[6]= 8;
 156                     left_block[7]= 11;
 157                 } else {
 158                     left_block[0]= 0;
 159                     left_block[1]= 0;
 160                     left_block[2]= 1;
 161                     left_block[3]= 1;
 162                     left_block[4]= 7;
 163                     left_block[5]= 10;
 164                     left_block[6]= 7;
 165                     left_block[7]= 10;
 166                 }
 167             } else {
                     // current pair is field coded: the lower half of the left
                     // edge comes from the bottom macroblock of the left pair
 168                 left_xy[1] += s->mb_stride;
 169                 //left_block[0]= 0;
 170                 left_block[1]= 2;
 171                 left_block[2]= 0;
 172                 left_block[3]= 2;
 173                 //left_block[4]= 7;
 174                 left_block[5]= 10;
 175                 left_block[6]= 7;
 176                 left_block[7]= 10;
 177             }
 178         }
 179     }
 180
         // publish the selected neighbours in the context
 181     h->top_mb_xy = top_xy;
 182     h->left_mb_xy[0] = left_xy[0];
 183     h->left_mb_xy[1] = left_xy[1];
 184     if(for_deblock){
             // Deblocking: only top and left are used, and a neighbour counts
             // as present whenever its slice_table entry is below 255.
 185         topleft_type = 0;
 186         topright_type = 0;
 187         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 188         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 189         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 190
 191         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
                 // Rebuild the current macroblock's own cache entries: the luma
                 // nnz flags are expanded from the 16 bit mask stored at
                 // non_zero_count[mb_xy][14] (written by
                 // write_back_non_zero_count() when FRAME_MBAFF), motion vectors
                 // and reference indices are reloaded from the current picture.
 192             int list;
 193             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 194             for(i=0; i<16; i++)
 195                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 196             for(list=0; list<h->list_count; list++){
 197                 if(USES_LIST(mb_type,list)){
 198                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 199                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 200                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 201                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 202                         dst[0] = src[0];
 203                         dst[1] = src[1];
 204                         dst[2] = src[2];
 205                         dst[3] = src[3];
 206                     }
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 209                     ref += h->b8_stride;
 210                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 211                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 212                 }else{
 213                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 214                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 215                 }
 216             }
 217         }
 218     }else{
             // Decoding: a neighbour is usable only if it belongs to the
             // current slice; otherwise its type is treated as 0 (unavailable).
 219         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 220         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 221         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 222         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 223         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 224     }
 225
 226     if(IS_INTRA(mb_type)){
             // Intra prediction availability masks: start from fixed initial
             // patterns, then clear bits for every neighbour that is missing or
             // is non-intra while constrained_intra_pred is set.
 227         h->topleft_samples_available=
 228         h->top_samples_available=
 229         h->left_samples_available= 0xFFFF;
 230         h->topright_samples_available= 0xEEEA;
 231
 232         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 233             h->topleft_samples_available= 0xB3FF;
 234             h->top_samples_available= 0x33FF;
 235             h->topright_samples_available= 0x26EA;
 236         }
 237         for(i=0; i<2; i++){
 238             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 239                 h->topleft_samples_available&= 0xDF5F;
 240                 h->left_samples_available&= 0x5F5F;
 241             }
 242         }
 243
 244         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 245             h->topleft_samples_available&= 0x7FFF;
 246
 247         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 248             h->topright_samples_available&= 0xFBFF;
 249
 250         if(IS_INTRA4x4(mb_type)){
                 // Neighbouring intra4x4 modes for mode prediction. For a
                 // neighbour that is not intra4x4 the entry is -1 when it is
                 // missing (or inter with constrained_intra_pred) and 2 otherwise.
 251             if(IS_INTRA4x4(top_type)){
 252                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 253                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 254                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 255                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 256             }else{
 257                 int pred;
 258                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 259                     pred= -1;
 260                 else{
 261                     pred= 2;
 262                 }
 263                 h->intra4x4_pred_mode_cache[4+8*0]=
 264                 h->intra4x4_pred_mode_cache[5+8*0]=
 265                 h->intra4x4_pred_mode_cache[6+8*0]=
 266                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 267             }
 268             for(i=0; i<2; i++){
 269                 if(IS_INTRA4x4(left_type[i])){
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 272                 }else{
 273                     int pred;
 274                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 275                         pred= -1;
 276                     else{
 277                         pred= 2;
 278                     }
 279                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 280                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 281                 }
 282             }
 283         }
 284     }
 285
 286
 287 /*
 288 0 . T T. T T T T
 289 1 L . .L . . . .
 290 2 L . .L . . . .
 291 3 . T TL . . . .
 292 4 L . .L . . . .
 293 5 L . .. . . . .
 294 */
 295 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // Non-zero coefficient counts of the top neighbour. A missing
         // neighbour is represented by 0 for non-intra macroblocks when CABAC
         // is used and by 64 otherwise.
 296     if(top_type){
 297         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 298         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 299         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 300         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 301
 302         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 303         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 304
 305         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 306         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 307
 308     }else{
 309         h->non_zero_count_cache[4+8*0]=
 310         h->non_zero_count_cache[5+8*0]=
 311         h->non_zero_count_cache[6+8*0]=
 312         h->non_zero_count_cache[7+8*0]=
 313
 314         h->non_zero_count_cache[1+8*0]=
 315         h->non_zero_count_cache[2+8*0]=
 316
 317         h->non_zero_count_cache[1+8*3]=
 318         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 319
 320     }
 321
         // same for the two halves of the left edge, using the left_block mapping
 322     for (i=0; i<2; i++) {
 323         if(left_type[i]){
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 326             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 328         }else{
 329             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 330             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 331             h->non_zero_count_cache[0+8*1 +   8*i]=
 332             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 333         }
 334     }
 335
         // CABAC only: coded block pattern of the top/left neighbours. A
         // missing neighbour yields 0x1C0 for intra macroblocks and 0 otherwise.
 336     if( h->pps.cabac ) {
 337         // top_cbp
 338         if(top_type) {
 339             h->top_cbp = h->cbp_table[top_xy];
 340         } else if(IS_INTRA(mb_type)) {
 341             h->top_cbp = 0x1C0;
 342         } else {
 343             h->top_cbp = 0;
 344         }
 345         // left_cbp
 346         if (left_type[0]) {
 347             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 348         } else if(IS_INTRA(mb_type)) {
 349             h->left_cbp = 0x1C0;
 350         } else {
 351             h->left_cbp = 0;
 352         }
             // bits 1 and 3 of left_cbp are taken from the cbp bit of the left
             // neighbours selected through left_block[0] and left_block[2]
 353         if (left_type[0]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 355         }
 356         if (left_type[1]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 358         }
 359     }
 360
 361 #if 1
         // Motion data of the neighbours, per reference list.
 362     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 363         int list;
 364         for(list=0; list<h->list_count; list++){
                 // a list the macroblock does not use is skipped, unless the
                 // macroblock is direct or deblocking is enabled
 365             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 366                 /*if(!h->mv_cache_clean[list]){
 367                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 368                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 369                     h->mv_cache_clean[list]= 1;
 370                 }*/
 371                 continue;
 372             }
 373             h->mv_cache_clean[list]= 0;
 374
                 // top row: bottom line of motion vectors / references of the top
                 // neighbour; otherwise zero vectors and LIST_NOT_USED (neighbour
                 // present) or PART_NOT_AVAILABLE (neighbour missing)
 375             if(USES_LIST(top_type, list)){
 376                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 377                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 382                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 383                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 384                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 385                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 386             }else{
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 391                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 392             }
 393
                 // left column: two cache rows per left neighbour, taken from the
                 // rightmost column of that neighbour at the rows given by left_block
 394             for(i=0; i<2; i++){
 395                 int cache_idx = scan8[0] - 1 + i*2*8;
 396                 if(USES_LIST(left_type[i], list)){
 397                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 398                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 399                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 400                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 401                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 402                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 403                 }else{
 404                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 405                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 406                     h->ref_cache[list][cache_idx  ]=
 407                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 408                 }
 409             }
 410
                 // outside MBAFF, deblocking and temporal direct macroblocks stop
                 // here: the remaining entries are not loaded for them
 411             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 412                 continue;
 413
                 // top-left corner; topleft_partition (-1 or 0) selects whether the
                 // extra row offsets are applied
 414             if(USES_LIST(topleft_type, list)){
 415                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 416                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 419             }else{
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 422             }
 423
                 // top-right corner
 424             if(USES_LIST(topright_type, list)){
 425                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 426                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 427                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 429             }else{
 430                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 432             }
 433
                 // outside MBAFF, skip and direct macroblocks need nothing more
 434             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 435                 continue;
 436
                 // mark five fixed cache positions (next to blocks 5, 7, 13 and at
                 // blocks 4, 12) as PART_NOT_AVAILABLE with zero vectors
 437             h->ref_cache[list][scan8[5 ]+1] =
 438             h->ref_cache[list][scan8[7 ]+1] =
 439             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 440             h->ref_cache[list][scan8[4 ]] =
 441             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 442             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 443             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 445             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 446             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 447
 448             if( h->pps.cabac ) {
                     // CABAC: motion vector differences of the top and left
                     // neighbours, laid out like mv_cache; zero when the
                     // neighbour does not use this list
 449                 /* XXX beurk, Load mvd */
 450                 if(USES_LIST(top_type, list)){
 451                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 456                 }else{
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 461                 }
 462                 if(USES_LIST(left_type[0], list)){
 463                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 466                 }else{
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 469                 }
 470                 if(USES_LIST(left_type[1], list)){
 471                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 472                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 474                 }else{
 475                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 477                 }
 478                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 481                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 483
 484                 if(h->slice_type_nos == FF_B_TYPE){
                         // B slices: per-block "is direct" flags. The current
                         // macroblock's 4x4 area is cleared, then the top and
                         // left neighbours are filled in: 1 for a direct
                         // macroblock, the direct_table entries for an 8x8
                         // partitioned one, 0 otherwise.
 485                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 486
 487                     if(IS_DIRECT(top_type)){
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 489                     }else if(IS_8X8(top_type)){
 490                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 491                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 492                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 493                     }else{
 494                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 495                     }
 496
 497                     if(IS_DIRECT(left_type[0]))
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 499                     else if(IS_8X8(left_type[0]))
 500                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 501                     else
 502                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 503
 504                     if(IS_DIRECT(left_type[1]))
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 506                     else if(IS_8X8(left_type[1]))
 507                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 508                     else
 509                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 510                 }
 511             }
 512
                 // MBAFF: bring neighbour data into the current macroblock's
                 // frame/field representation. MAP_MVS applies MAP_F2F to every
                 // neighbour cache position loaded above. For a field macroblock,
                 // frame neighbours get their reference index doubled and the
                 // vertical mv/mvd halved; for a frame macroblock, field
                 // neighbours get the reference index halved and the vertical
                 // mv/mvd doubled. Entries with a negative reference are left alone.
 513             if(FRAME_MBAFF){
 514 #define MAP_MVS\
 515                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 516                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 521                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 522                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 524                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 525                 if(MB_FIELD){
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] <<= 1;\
 529                         h->mv_cache[list][idx][1] /= 2;\
 530                         h->mvd_cache[list][idx][1] /= 2;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }else{
 535 #define MAP_F2F(idx, mb_type)\
 536                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 537                         h->ref_cache[list][idx] >>= 1;\
 538                         h->mv_cache[list][idx][1] <<= 1;\
 539                         h->mvd_cache[list][idx][1] <<= 1;\
 540                     }
 541                     MAP_MVS
 542 #undef MAP_F2F
 543                 }
 544             }
 545         }
 546     }
 547 #endif
 548
         // number (0..2) of top / upper-left neighbours that use the 8x8 transform
 549     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 550 }
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non-zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
/**
 * Fetches the diagonal neighbour used as candidate "C" in MV prediction.
 *
 * Outside of the MBAFF special cases the top-right cache entry
 * (i - 8 + part_width) is used when its reference is not PART_NOT_AVAILABLE,
 * otherwise the top-left entry (i - 8 - 1) is used.
 *
 * In MBAFF frames some neighbours are read directly from the current
 * picture's motion_val / ref_index arrays; the (rescaled) vector is then
 * placed in the scratch cache slot scan8[0]-2 and *C points at that slot.
 *
 * @param C          receives a pointer to the selected motion vector
 * @param i          cache index (a scan8[] value) of the current block
 * @param list       reference list
 * @param part_width partition width in cache columns
 * @return the reference value belonging to *C; may be LIST_NOT_USED or
 *         PART_NOT_AVAILABLE
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* clear the scratch slot and make it the default candidate */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the MV at 4x4 position (X4,Y4) of the current picture,
 * copies it into the scratch slot with MV_OP applied to the vertical
 * component, and RETURNS from fetch_diagonal_mv with the matching ref_index
 * modified by REF_OP (or with LIST_NOT_USED if that macroblock does not use
 * the list). Every expansion therefore ends the function. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right is unavailable, the block sits in cache column 4 and the
         * cache entry left of block 0 is available: take the candidate from
         * the left macroblock when its field/frame type differs from ours */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: top-right neighbour if available, top-left otherwise */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
/**
 * Derives motion vectors and reference indices for direct prediction and
 * stores them in h->mv_cache[] / h->ref_cache[].
 *
 * Either the whole macroblock or, when *mb_type is an 8x8 type, only the
 * sub-blocks flagged as direct are processed. The colocated data is read
 * from h->ref_list[1][0]. Depending on h->direct_spatial_mv_pred the spatial
 * or the temporal derivation is used.
 *
 * @param mb_type in: type of the current macroblock; out: updated with the
 *                partitioning / list usage chosen here. h->sub_mb_type[] is
 *                updated for every processed 8x8 block as well.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   h->mb_xy;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    /* colocated macroblock type, MVs and reference indices (both lists) */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

    /* pick the partitioning of the direct macroblock from the colocated one */
#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            /* NOTE(review): -2 is presumably PART_NOT_AVAILABLE - confirm;
             * the top-left entry then replaces the top-right one */
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            /* the unsigned casts make negative entries compare as large
             * values, so a non-negative reference wins whenever one exists */
            ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
            if(ref[list] < 0)
                ref[list] = -1;
        }

        /* no neighbour offers a reference in either list: use reference 0
         * of both lists with zero vectors */
        if(ref[0] < 0 && ref[1] < 0){
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the list without a reference from the (sub) macroblock type */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;
        }

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
            /* current and colocated macroblock differ in field/frame coding:
             * the colocated pointers and strides are adjusted accordingly */
            int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
            int mb_types_col[2];
            int b8_stride = h->b8_stride;
            int b4_stride = h->b_stride;

            *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;

            if(IS_INTERLACED(*mb_type)){
                mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                if(s->mb_y&1){
                    l1ref0 -= 2*b8_stride;
                    l1ref1 -= 2*b8_stride;
                    l1mv0 -= 4*b4_stride;
                    l1mv1 -= 4*b4_stride;
                }
                b8_stride *= 3;
                b4_stride *= 6;
            }else{
                int cur_poc = s->current_picture_ptr->poc;
                int *col_poc = h->ref_list[1]->field_poc;
                /* 1 when field_poc[1] is at least as close to the current POC as field_poc[0] */
                int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
                int dy = 2*col_parity - (s->mb_y&1);
                mb_types_col[0] =
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
                l1ref0 += dy*b8_stride;
                l1ref1 += dy*b8_stride;
                l1mv0 += 2*dy*b4_stride;
                l1mv1 += 2*dy*b4_stride;
                b8_stride = 0;
            }

            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* colocated block is non-intra, uses reference 0 and has a
                 * vector within +-1: a list keeps its predicted vector only
                 * if its reference is > 0, otherwise the vector stays zero */
                if(!IS_INTRA(mb_types_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                }
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
            }
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* same colocated-zero test as above, done once for the whole
             * macroblock; the list 1 variant additionally depends on
             * h->x264_build */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                /* start from the spatial prediction, then zero the vectors
                 * of blocks whose colocated block passes the zero test */
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        /* one colocated vector decides for the whole 8x8 block */
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        /* each 4x4 block is tested against its own colocated vector */
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            /* field macroblocks use the per-field mapping and scale tables */
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    if(s->mb_y&1){
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    /* intra colocated block: reference 0 and zero vectors in both lists */
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    /* use the colocated list 0 reference if there is one,
                     * otherwise the list 1 reference and list 1 vectors */
                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        /* vertical component halved (y_shift 0) or doubled (y_shift 2) */
                        int my_col = (mv_col[1]<<y_shift)/2;
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                /* list 0 vector = scaled colocated vector;
                 * list 1 vector = list 0 vector - colocated vector */
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    /* one scaled vector for the whole 8x8 block */
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    /* every 4x4 block scales its own colocated vector */
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the stride macro defined by the function above.
 * Every result is halved (shifted right by 1).
 * @param qp quantization parameter ??? FIXME (currently commented out)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first butterfly pass: block -> temp */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        out[0]= sum04+sum15;
        out[1]= dif04+dif15;
        out[2]= dif04-dif15;
        out[3]= sum04-sum15;
    }

    /* second butterfly pass: temp -> block */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int *in= temp + i;
        const int sum02= in[4*0] + in[4*2];
        const int dif02= in[4*0] - in[4*2];
        const int dif13= in[4*1] - in[4*3];
        const int sum13= in[4*1] + in[4*3];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values located at
 * block[0], block[16], block[32] and block[48]. Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    const int c00= block[stride*0 + xStride*0];
    const int c01= block[stride*0 + xStride*1];
    const int c10= block[stride*1 + xStride*0];
    const int c11= block[stride*1 + xStride*1];
    const int top_sum= c00 + c01;
    const int top_dif= c00 - c01;
    const int bot_sum= c10 + c11;
    const int bot_dif= c10 - c11;

    block[stride*0 + xStride*0]= (top_sum + bot_sum);
    block[stride*0 + xStride*1]= (top_dif + bot_dif);
    block[stride*1 + xStride*0]= (top_sum - bot_sum);
    block[stride*1 + xStride*1]= (top_dif - bot_dif);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593     int i;
1594     const int * const quant_table= quant_coeff[qscale];
1595     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597     const unsigned int threshold2= (threshold1<<1);
1598     int last_non_zero;
1599
1600     if(separate_dc){
1601         if(qscale<=18){
1602             //avoid overflows
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_coeff[qscale+18][0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }else{
1621             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623             const unsigned int dc_threshold2= (dc_threshold1<<1);
1624
1625             int level= block[0]*quant_table[0];
1626             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627                 if(level>0){
1628                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629                     block[0]= level;
1630                 }else{
1631                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632                     block[0]= -level;
1633                 }
1634 //                last_non_zero = i;
1635             }else{
1636                 block[0]=0;
1637             }
1638         }
1639         last_non_zero= 0;
1640         i=1;
1641     }else{
1642         last_non_zero= -1;
1643         i=0;
1644     }
1645
1646     for(; i<16; i++){
1647         const int j= scantable[i];
1648         int level= block[j]*quant_table[j];
1649
1650 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1651 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652         if(((unsigned)(level+threshold1))>threshold2){
1653             if(level>0){
1654                 level= (bias + level)>>QUANT_SHIFT;
1655                 block[j]= level;
1656             }else{
1657                 level= (bias - level)>>QUANT_SHIFT;
1658                 block[j]= -level;
1659             }
1660             last_non_zero = i;
1661         }else{
1662             block[j]=0;
1663         }
1664     }
1665
1666     return last_non_zero;
1667 }
1668
1669 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1670                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1671                            int src_x_offset, int src_y_offset,
1672                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1673     MpegEncContext * const s = &h->s;
1674     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1675     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1676     const int luma_xy= (mx&3) + ((my&3)<<2);
1677     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1678     uint8_t * src_cb, * src_cr;
1679     int extra_width= h->emu_edge_width;
1680     int extra_height= h->emu_edge_height;
1681     int emu=0;
1682     const int full_mx= mx>>2;
1683     const int full_my= my>>2;
1684     const int pic_width  = 16*s->mb_width;
1685     const int pic_height = 16*s->mb_height >> MB_FIELD;
1686
1687     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1688         return;
1689
1690     if(mx&7) extra_width -= 3;
1691     if(my&7) extra_height -= 3;
1692
1693     if(   full_mx < 0-extra_width
1694        || full_my < 0-extra_height
1695        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1696        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1697         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1698             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1699         emu=1;
1700     }
1701
1702     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1703     if(!square){
1704         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1705     }
1706
1707     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1708
1709     if(MB_FIELD){
1710         // chroma offset when predicting from a field of opposite parity
1711         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1712         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1713     }
1714     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1715     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1716
1717     if(emu){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1719             src_cb= s->edge_emu_buffer;
1720     }
1721     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1722
1723     if(emu){
1724         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1725             src_cr= s->edge_emu_buffer;
1726     }
1727     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1728 }
1729
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732                            int x_offset, int y_offset,
1733                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735                            int list0, int list1){
1736     MpegEncContext * const s = &h->s;
1737     qpel_mc_func *qpix_op=  qpix_put;
1738     h264_chroma_mc_func chroma_op= chroma_put;
1739
1740     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1741     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1742     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     x_offset += 8*s->mb_x;
1744     y_offset += 8*(s->mb_y >> MB_FIELD);
1745
1746     if(list0){
1747         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750                            qpix_op, chroma_op);
1751
1752         qpix_op=  qpix_avg;
1753         chroma_op= chroma_avg;
1754     }
1755
1756     if(list1){
1757         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760                            qpix_op, chroma_op);
1761     }
1762 }
1763
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766                            int x_offset, int y_offset,
1767                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770                            int list0, int list1){
1771     MpegEncContext * const s = &h->s;
1772
1773     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1774     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1775     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1776     x_offset += 8*s->mb_x;
1777     y_offset += 8*(s->mb_y >> MB_FIELD);
1778
1779     if(list0 && list1){
1780         /* don't optimize for luma-only case, since B-frames usually
1781          * use implicit weights => chroma too. */
1782         uint8_t *tmp_cb = s->obmc_scratchpad;
1783         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785         int refn0 = h->ref_cache[0][ scan8[n] ];
1786         int refn1 = h->ref_cache[1][ scan8[n] ];
1787
1788         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789                     dest_y, dest_cb, dest_cr,
1790                     x_offset, y_offset, qpix_put, chroma_put);
1791         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792                     tmp_y, tmp_cb, tmp_cr,
1793                     x_offset, y_offset, qpix_put, chroma_put);
1794
1795         if(h->use_weight == 2){
1796             int weight0 = h->implicit_weight[refn0][refn1];
1797             int weight1 = 64 - weight0;
1798             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1799             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801         }else{
1802             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1811         }
1812     }else{
1813         int list = list1 ? 1 : 0;
1814         int refn = h->ref_cache[list][ scan8[n] ];
1815         Picture *ref= &h->ref_list[list][refn];
1816         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818                     qpix_put, chroma_put);
1819
1820         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822         if(h->use_weight_chroma){
1823             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1827         }
1828     }
1829 }
1830
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833                            int x_offset, int y_offset,
1834                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837                            int list0, int list1){
1838     if((h->use_weight==2 && list0 && list1
1839         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840        || h->use_weight==1)
1841         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842                          x_offset, y_offset, qpix_put, chroma_put,
1843                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844     else
1845         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1847 }
1848
1849 static inline void prefetch_motion(H264Context *h, int list){
1850     /* fetch pixels for estimated mv 4 macroblocks ahead
1851      * optimized for 64byte cache lines */
1852     MpegEncContext * const s = &h->s;
1853     const int refn = h->ref_cache[list][scan8[0]];
1854     if(refn >= 0){
1855         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857         uint8_t **src= h->ref_list[list][refn].data;
1858         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1862     }
1863 }
1864
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869     MpegEncContext * const s = &h->s;
1870     const int mb_xy= h->mb_xy;
1871     const int mb_type= s->current_picture.mb_type[mb_xy];
1872
1873     assert(IS_INTER(mb_type));
1874
1875     prefetch_motion(h, 0);
1876
1877     if(IS_16X16(mb_type)){
1878         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880                 &weight_op[0], &weight_avg[0],
1881                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882     }else if(IS_16X8(mb_type)){
1883         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885                 &weight_op[1], &weight_avg[1],
1886                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889                 &weight_op[1], &weight_avg[1],
1890                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891     }else if(IS_8X16(mb_type)){
1892         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894                 &weight_op[2], &weight_avg[2],
1895                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898                 &weight_op[2], &weight_avg[2],
1899                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900     }else{
1901         int i;
1902
1903         assert(IS_8X8(mb_type));
1904
1905         for(i=0; i<4; i++){
1906             const int sub_mb_type= h->sub_mb_type[i];
1907             const int n= 4*i;
1908             int x_offset= (i&1)<<2;
1909             int y_offset= (i&2)<<1;
1910
1911             if(IS_SUB_8X8(sub_mb_type)){
1912                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914                     &weight_op[3], &weight_avg[3],
1915                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916             }else if(IS_SUB_8X4(sub_mb_type)){
1917                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919                     &weight_op[4], &weight_avg[4],
1920                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923                     &weight_op[4], &weight_avg[4],
1924                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925             }else if(IS_SUB_4X8(sub_mb_type)){
1926                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928                     &weight_op[5], &weight_avg[5],
1929                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932                     &weight_op[5], &weight_avg[5],
1933                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934             }else{
1935                 int j;
1936                 assert(IS_SUB_4X4(sub_mb_type));
1937                 for(j=0; j<4; j++){
1938                     int sub_x_offset= x_offset + 2*(j&1);
1939                     int sub_y_offset= y_offset +   (j&2);
1940                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942                         &weight_op[6], &weight_avg[6],
1943                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1944                 }
1945             }
1946         }
1947     }
1948
1949     prefetch_motion(h, 1);
1950 }
1951
1952 static av_cold void decode_init_vlc(void){
1953     static int done = 0;
1954
1955     if (!done) {
1956         int i;
1957         done = 1;
1958
1959         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1960                  &chroma_dc_coeff_token_len [0], 1, 1,
1961                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1962
1963         for(i=0; i<4; i++){
1964             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1965                      &coeff_token_len [i][0], 1, 1,
1966                      &coeff_token_bits[i][0], 1, 1, 1);
1967         }
1968
1969         for(i=0; i<3; i++){
1970             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1971                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1972                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1973         }
1974         for(i=0; i<15; i++){
1975             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1976                      &total_zeros_len [i][0], 1, 1,
1977                      &total_zeros_bits[i][0], 1, 1, 1);
1978         }
1979
1980         for(i=0; i<6; i++){
1981             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1982                      &run_len [i][0], 1, 1,
1983                      &run_bits[i][0], 1, 1, 1);
1984         }
1985         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1986                  &run_len [6][0], 1, 1,
1987                  &run_bits[6][0], 1, 1, 1);
1988     }
1989 }
1990
1991 static void free_tables(H264Context *h){
1992     int i;
1993     H264Context *hx;
1994     av_freep(&h->intra4x4_pred_mode);
1995     av_freep(&h->chroma_pred_mode_table);
1996     av_freep(&h->cbp_table);
1997     av_freep(&h->mvd_table[0]);
1998     av_freep(&h->mvd_table[1]);
1999     av_freep(&h->direct_table);
2000     av_freep(&h->non_zero_count);
2001     av_freep(&h->slice_table_base);
2002     h->slice_table= NULL;
2003
2004     av_freep(&h->mb2b_xy);
2005     av_freep(&h->mb2b8_xy);
2006
2007     for(i = 0; i < MAX_SPS_COUNT; i++)
2008         av_freep(h->sps_buffers + i);
2009
2010     for(i = 0; i < MAX_PPS_COUNT; i++)
2011         av_freep(h->pps_buffers + i);
2012
2013     for(i = 0; i < h->s.avctx->thread_count; i++) {
2014         hx = h->thread_context[i];
2015         if(!hx) continue;
2016         av_freep(&hx->top_borders[1]);
2017         av_freep(&hx->top_borders[0]);
2018         av_freep(&hx->s.obmc_scratchpad);
2019     }
2020 }
2021
2022 static void init_dequant8_coeff_table(H264Context *h){
2023     int i,q,x;
2024     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2025     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2026     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2027
2028     for(i=0; i<2; i++ ){
2029         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2030             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2031             break;
2032         }
2033
2034         for(q=0; q<52; q++){
2035             int shift = ff_div6[q];
2036             int idx = ff_rem6[q];
2037             for(x=0; x<64; x++)
2038                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2039                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2040                     h->pps.scaling_matrix8[i][x]) << shift;
2041         }
2042     }
2043 }
2044
2045 static void init_dequant4_coeff_table(H264Context *h){
2046     int i,j,q,x;
2047     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2048     for(i=0; i<6; i++ ){
2049         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2050         for(j=0; j<i; j++){
2051             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2052                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2053                 break;
2054             }
2055         }
2056         if(j<i)
2057             continue;
2058
2059         for(q=0; q<52; q++){
2060             int shift = ff_div6[q] + 2;
2061             int idx = ff_rem6[q];
2062             for(x=0; x<16; x++)
2063                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2064                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2065                     h->pps.scaling_matrix4[i][x]) << shift;
2066         }
2067     }
2068 }
2069
2070 static void init_dequant_tables(H264Context *h){
2071     int i,x;
2072     init_dequant4_coeff_table(h);
2073     if(h->pps.transform_8x8_mode)
2074         init_dequant8_coeff_table(h);
2075     if(h->sps.transform_bypass){
2076         for(i=0; i<6; i++)
2077             for(x=0; x<16; x++)
2078                 h->dequant4_coeff[i][0][x] = 1<<6;
2079         if(h->pps.transform_8x8_mode)
2080             for(i=0; i<2; i++)
2081                 for(x=0; x<64; x++)
2082                     h->dequant8_coeff[i][0][x] = 1<<6;
2083     }
2084 }
2085
2086
2087 /**
2088  * allocates tables.
2089  * needs width/height
2090  */
2091 static int alloc_tables(H264Context *h){
2092     MpegEncContext * const s = &h->s;
2093     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2094     int x,y;
2095
2096     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2097
2098     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2099     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2101
2102     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2104     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2106
2107     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2108     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2109
2110     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2111     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2112     for(y=0; y<s->mb_height; y++){
2113         for(x=0; x<s->mb_width; x++){
2114             const int mb_xy= x + y*s->mb_stride;
2115             const int b_xy = 4*x + 4*y*h->b_stride;
2116             const int b8_xy= 2*x + 2*y*h->b8_stride;
2117
2118             h->mb2b_xy [mb_xy]= b_xy;
2119             h->mb2b8_xy[mb_xy]= b8_xy;
2120         }
2121     }
2122
2123     s->obmc_scratchpad = NULL;
2124
2125     if(!h->dequant4_coeff[0])
2126         init_dequant_tables(h);
2127
2128     return 0;
2129 fail:
2130     free_tables(h);
2131     return -1;
2132 }
2133
2134 /**
2135  * Mimic alloc_tables(), but for every context thread.
2136  */
2137 static void clone_tables(H264Context *dst, H264Context *src){
2138     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2139     dst->non_zero_count           = src->non_zero_count;
2140     dst->slice_table              = src->slice_table;
2141     dst->cbp_table                = src->cbp_table;
2142     dst->mb2b_xy                  = src->mb2b_xy;
2143     dst->mb2b8_xy                 = src->mb2b8_xy;
2144     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2145     dst->mvd_table[0]             = src->mvd_table[0];
2146     dst->mvd_table[1]             = src->mvd_table[1];
2147     dst->direct_table             = src->direct_table;
2148
2149     dst->s.obmc_scratchpad = NULL;
2150     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2151 }
2152
2153 /**
2154  * Init context
2155  * Allocate buffers which are not shared amongst multiple threads.
2156  */
2157 static int context_init(H264Context *h){
2158     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2159     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160
2161     return 0;
2162 fail:
2163     return -1; // free_tables will clean up for us
2164 }
2165
2166 static av_cold void common_init(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168
2169     s->width = s->avctx->width;
2170     s->height = s->avctx->height;
2171     s->codec_id= s->avctx->codec->id;
2172
2173     ff_h264_pred_init(&h->hpc, s->codec_id);
2174
2175     h->dequant_coeff_pps= -1;
2176     s->unrestricted_mv=1;
2177     s->decode=1; //FIXME
2178
2179     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2180     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2181 }
2182
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184     H264Context *h= avctx->priv_data;
2185     MpegEncContext * const s = &h->s;
2186
2187     MPV_decode_defaults(s);
2188
2189     s->avctx = avctx;
2190     common_init(h);
2191
2192     s->out_format = FMT_H264;
2193     s->workaround_bugs= avctx->workaround_bugs;
2194
2195     // set defaults
2196 //    s->decode_mb= ff_h263_decode_mb;
2197     s->quarter_sample = 1;
2198     s->low_delay= 1;
2199
2200     if(avctx->codec_id == CODEC_ID_SVQ3)
2201         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2202     else
2203         avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258
2259     // We mark the current picture as non-reference after allocating it, so
2260     // that if we break out due to an error it can be released automatically
2261     // in the next MPV_frame_start().
2262     // SVQ3 as well as most other codecs have only last/next/current and thus
2263     // get released even with set reference, besides SVQ3 and others do not
2264     // mark frames as reference later "naturally".
2265     if(s->codec_id != CODEC_ID_SVQ3)
2266         s->current_picture_ptr->reference= 0;
2267     return 0;
2268 }
2269
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273
2274     src_y  -=   linesize;
2275     src_cb -= uvlinesize;
2276     src_cr -= uvlinesize;
2277
2278     // There are two lines saved, the line above the the top macroblock of a pair,
2279     // and the line above the bottom macroblock
2280     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2281     for(i=1; i<17; i++){
2282         h->left_border[i]= src_y[15+i*  linesize];
2283     }
2284
2285     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2286     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2287
2288     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2289         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2290         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2291         for(i=1; i<9; i++){
2292             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2293             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2294         }
2295         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2296         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2297     }
2298 }
2299
2300 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2301     MpegEncContext * const s = &h->s;
2302     int temp8, i;
2303     uint64_t temp64;
2304     int deblock_left;
2305     int deblock_top;
2306     int mb_xy;
2307
2308     if(h->deblocking_filter == 2) {
2309         mb_xy = h->mb_xy;
2310         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2311         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2312     } else {
2313         deblock_left = (s->mb_x > 0);
2314         deblock_top =  (s->mb_y > 0);
2315     }
2316
2317     src_y  -=   linesize + 1;
2318     src_cb -= uvlinesize + 1;
2319     src_cr -= uvlinesize + 1;
2320
2321 #define XCHG(a,b,t,xchg)\
2322 t= a;\
2323 if(xchg)\
2324     a= b;\
2325 b= t;
2326
2327     if(deblock_left){
2328         for(i = !deblock_top; i<17; i++){
2329             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2330         }
2331     }
2332
2333     if(deblock_top){
2334         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2335         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2336         if(s->mb_x+1 < s->mb_width){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2338         }
2339     }
2340
2341     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2342         if(deblock_left){
2343             for(i = !deblock_top; i<9; i++){
2344                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2345                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2346             }
2347         }
2348         if(deblock_top){
2349             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2351         }
2352     }
2353 }
2354
2355 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2356     MpegEncContext * const s = &h->s;
2357     int i;
2358
2359     src_y  -= 2 *   linesize;
2360     src_cb -= 2 * uvlinesize;
2361     src_cr -= 2 * uvlinesize;
2362
2363     // There are two lines saved, the line above the the top macroblock of a pair,
2364     // and the line above the bottom macroblock
2365     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2366     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2367     for(i=2; i<34; i++){
2368         h->left_border[i]= src_y[15+i*  linesize];
2369     }
2370
2371     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2372     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2373     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2374     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2375
2376     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2377         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2378         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2379         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2380         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2381         for(i=2; i<18; i++){
2382             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2383             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2384         }
2385         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2386         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2387         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2388         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2389     }
2390 }
2391
2392 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2393     MpegEncContext * const s = &h->s;
2394     int temp8, i;
2395     uint64_t temp64;
2396     int deblock_left = (s->mb_x > 0);
2397     int deblock_top  = (s->mb_y > 1);
2398
2399     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2400
2401     src_y  -= 2 *   linesize + 1;
2402     src_cb -= 2 * uvlinesize + 1;
2403     src_cr -= 2 * uvlinesize + 1;
2404
2405 #define XCHG(a,b,t,xchg)\
2406 t= a;\
2407 if(xchg)\
2408     a= b;\
2409 b= t;
2410
2411     if(deblock_left){
2412         for(i = (!deblock_top)<<1; i<34; i++){
2413             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2414         }
2415     }
2416
2417     if(deblock_top){
2418         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2419         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2420         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2421         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2422         if(s->mb_x+1 < s->mb_width){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2425         }
2426     }
2427
2428     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2429         if(deblock_left){
2430             for(i = (!deblock_top) << 1; i<18; i++){
2431                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2432                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2433             }
2434         }
2435         if(deblock_top){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2439             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2440         }
2441     }
2442 }
2443
2444 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2445     MpegEncContext * const s = &h->s;
2446     const int mb_x= s->mb_x;
2447     const int mb_y= s->mb_y;
2448     const int mb_xy= h->mb_xy;
2449     const int mb_type= s->current_picture.mb_type[mb_xy];
2450     uint8_t  *dest_y, *dest_cb, *dest_cr;
2451     int linesize, uvlinesize /*dct_offset*/;
2452     int i;
2453     int *block_offset = &h->block_offset[0];
2454     const unsigned int bottom = mb_y & 1;
2455     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2456     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2457     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2458
2459     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2460     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2461     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2462
2463     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2464     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2465
2466     if (!simple && MB_FIELD) {
2467         linesize   = h->mb_linesize   = s->linesize * 2;
2468         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2469         block_offset = &h->block_offset[24];
2470         if(mb_y&1){ //FIXME move out of this function?
2471             dest_y -= s->linesize*15;
2472             dest_cb-= s->uvlinesize*7;
2473             dest_cr-= s->uvlinesize*7;
2474         }
2475         if(FRAME_MBAFF) {
2476             int list;
2477             for(list=0; list<h->list_count; list++){
2478                 if(!USES_LIST(mb_type, list))
2479                     continue;
2480                 if(IS_16X16(mb_type)){
2481                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2482                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2483                 }else{
2484                     for(i=0; i<16; i+=4){
2485                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2486                         int ref = h->ref_cache[list][scan8[i]];
2487                         if(ref >= 0)
2488                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2489                     }
2490                 }
2491             }
2492         }
2493     } else {
2494         linesize   = h->mb_linesize   = s->linesize;
2495         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2496 //        dct_offset = s->linesize * 16;
2497     }
2498
2499     if(transform_bypass){
2500         idct_dc_add =
2501         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2502     }else if(IS_8x8DCT(mb_type)){
2503         idct_dc_add = s->dsp.h264_idct8_dc_add;
2504         idct_add = s->dsp.h264_idct8_add;
2505     }else{
2506         idct_dc_add = s->dsp.h264_idct_dc_add;
2507         idct_add = s->dsp.h264_idct_add;
2508     }
2509
2510     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2511        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2512         int mbt_y = mb_y&~1;
2513         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2514         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2515         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2516         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2517     }
2518
2519     if (!simple && IS_INTRA_PCM(mb_type)) {
2520         unsigned int x, y;
2521
2522         // The pixels are stored in h->mb array in the same order as levels,
2523         // copy them in output in the correct order.
2524         for(i=0; i<16; i++) {
2525             for (y=0; y<4; y++) {
2526                 for (x=0; x<4; x++) {
2527                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2528                 }
2529             }
2530         }
2531         for(i=16; i<16+4; i++) {
2532             for (y=0; y<4; y++) {
2533                 for (x=0; x<4; x++) {
2534                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2535                 }
2536             }
2537         }
2538         for(i=20; i<20+4; i++) {
2539             for (y=0; y<4; y++) {
2540                 for (x=0; x<4; x++) {
2541                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2542                 }
2543             }
2544         }
2545     } else {
2546         if(IS_INTRA(mb_type)){
2547             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2548                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2549
2550             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2551                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2552                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2553             }
2554
2555             if(IS_INTRA4x4(mb_type)){
2556                 if(simple || !s->encoding){
2557                     if(IS_8x8DCT(mb_type)){
2558                         for(i=0; i<16; i+=4){
2559                             uint8_t * const ptr= dest_y + block_offset[i];
2560                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2561                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2562                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2563                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2564                             if(nnz){
2565                                 if(nnz == 1 && h->mb[i*16])
2566                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2567                                 else
2568                                     idct_add(ptr, h->mb + i*16, linesize);
2569                             }
2570                         }
2571                     }else
2572                     for(i=0; i<16; i++){
2573                         uint8_t * const ptr= dest_y + block_offset[i];
2574                         uint8_t *topright;
2575                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2576                         int nnz, tr;
2577
2578                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2579                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2580                             assert(mb_y || linesize <= block_offset[i]);
2581                             if(!topright_avail){
2582                                 tr= ptr[3 - linesize]*0x01010101;
2583                                 topright= (uint8_t*) &tr;
2584                             }else
2585                                 topright= ptr + 4 - linesize;
2586                         }else
2587                             topright= NULL;
2588
2589                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2590                         nnz = h->non_zero_count_cache[ scan8[i] ];
2591                         if(nnz){
2592                             if(is_h264){
2593                                 if(nnz == 1 && h->mb[i*16])
2594                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2595                                 else
2596                                     idct_add(ptr, h->mb + i*16, linesize);
2597                             }else
2598                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2599                         }
2600                     }
2601                 }
2602             }else{
2603                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2604                 if(is_h264){
2605                     if(!transform_bypass)
2606                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2607                 }else
2608                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2609             }
2610             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2611                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2612         }else if(is_h264){
2613             hl_motion(h, dest_y, dest_cb, dest_cr,
2614                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2615                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2616                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2617         }
2618
2619
2620         if(!IS_INTRA4x4(mb_type)){
2621             if(is_h264){
2622                 if(IS_INTRA16x16(mb_type)){
2623                     for(i=0; i<16; i++){
2624                         if(h->non_zero_count_cache[ scan8[i] ])
2625                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2626                         else if(h->mb[i*16])
2627                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2628                     }
2629                 }else{
2630                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2631                     for(i=0; i<16; i+=di){
2632                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2633                         if(nnz){
2634                             if(nnz==1 && h->mb[i*16])
2635                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2636                             else
2637                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2638                         }
2639                     }
2640                 }
2641             }else{
2642                 for(i=0; i<16; i++){
2643                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2644                         uint8_t * const ptr= dest_y + block_offset[i];
2645                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2646                     }
2647                 }
2648             }
2649         }
2650
2651         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2652             uint8_t *dest[2] = {dest_cb, dest_cr};
2653             if(transform_bypass){
2654                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2655             }else{
2656                 idct_add = s->dsp.h264_idct_add;
2657                 idct_dc_add = s->dsp.h264_idct_dc_add;
2658                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2659                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2660             }
2661             if(is_h264){
2662                 for(i=16; i<16+8; i++){
2663                     if(h->non_zero_count_cache[ scan8[i] ])
2664                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2665                     else if(h->mb[i*16])
2666                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2667                 }
2668             }else{
2669                 for(i=16; i<16+8; i++){
2670                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2671                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2672                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2673                     }
2674                 }
2675             }
2676         }
2677     }
2678     if(h->deblocking_filter) {
2679         if (!simple && FRAME_MBAFF) {
2680             //FIXME try deblocking one mb at a time?
2681             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2682             const int mb_y = s->mb_y - 1;
2683             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2684             const int mb_xy= mb_x + mb_y*s->mb_stride;
2685             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2686             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2687             if (!bottom) return;
2688             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2689             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2690             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2691
2692             if(IS_INTRA(mb_type_top | mb_type_bottom))
2693                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2694
2695             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2696             // deblock a pair
2697             // top
2698             s->mb_y--; h->mb_xy -= s->mb_stride;
2699             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2700             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2701             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2702             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2703             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2704             // bottom
2705             s->mb_y++; h->mb_xy += s->mb_stride;
2706             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2707             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2708             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2709             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2710             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2711         } else {
2712             tprintf(h->s.avctx, "call filter_mb\n");
2713             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2714             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2715             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2716             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2717             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2718         }
2719     }
2720 }
2721
2722 /**
2723  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2724  */
2725 static void hl_decode_mb_simple(H264Context *h){
2726     hl_decode_mb_internal(h, 1);
2727 }
2728
2729 /**
2730  * Process a macroblock; this handles edge cases, such as interlacing.
2731  */
2732 static void av_noinline hl_decode_mb_complex(H264Context *h){
2733     hl_decode_mb_internal(h, 0);
2734 }
2735
2736 static void hl_decode_mb(H264Context *h){
2737     MpegEncContext * const s = &h->s;
2738     const int mb_xy= h->mb_xy;
2739     const int mb_type= s->current_picture.mb_type[mb_xy];
2740     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2741                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2742
2743     if(ENABLE_H264_ENCODER && !s->decode)
2744         return;
2745
2746     if (is_complex)
2747         hl_decode_mb_complex(h);
2748     else hl_decode_mb_simple(h);
2749 }
2750
2751 static void pic_as_field(Picture *pic, const int parity){
2752     int i;
2753     for (i = 0; i < 4; ++i) {
2754         if (parity == PICT_BOTTOM_FIELD)
2755             pic->data[i] += pic->linesize[i];
2756         pic->reference = parity;
2757         pic->linesize[i] *= 2;
2758     }
2759 }
2760
2761 static int split_field_copy(Picture *dest, Picture *src,
2762                             int parity, int id_add){
2763     int match = !!(src->reference & parity);
2764
2765     if (match) {
2766         *dest = *src;
2767         pic_as_field(dest, parity);
2768         dest->pic_id *= 2;
2769         dest->pic_id += id_add;
2770     }
2771
2772     return match;
2773 }
2774
2775 /**
2776  * Split one reference list into field parts, interleaving by parity
2777  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2778  * set to look at the actual start of data for that field.
2779  *
2780  * @param dest output list
2781  * @param dest_len maximum number of fields to put in dest
2782  * @param src the source reference list containing fields and/or field pairs
2783  *            (aka short_ref/long_ref, or
2784  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2785  * @param src_len number of Picture's in source (pairs and unmatched fields)
2786  * @param parity the parity of the picture being decoded/needing
2787  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2788  * @return number of fields placed in dest
2789  */
2790 static int split_field_half_ref_list(Picture *dest, int dest_len,
2791                                      Picture *src,  int src_len,  int parity){
2792     int same_parity   = 1;
2793     int same_i        = 0;
2794     int opp_i         = 0;
2795     int out_i;
2796     int field_output;
2797
2798     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2799         if (same_parity && same_i < src_len) {
2800             field_output = split_field_copy(dest + out_i, src + same_i,
2801                                             parity, 1);
2802             same_parity = !field_output;
2803             same_i++;
2804
2805         } else if (opp_i < src_len) {
2806             field_output = split_field_copy(dest + out_i, src + opp_i,
2807                                             PICT_FRAME - parity, 0);
2808             same_parity = field_output;
2809             opp_i++;
2810
2811         } else {
2812             break;
2813         }
2814     }
2815
2816     return out_i;
2817 }
2818
2819 /**
2820  * Split the reference frame list into a reference field list.
2821  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2822  * The input list contains both reference field pairs and
2823  * unmatched reference fields; it is ordered as spec describes
2824  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2825  * unmatched field pairs are also present. Conceptually this is equivalent
2826  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2827  *
2828  * @param dest output reference list where ordered fields are to be placed
2829  * @param dest_len max number of fields to place at dest
2830  * @param src source reference list, as described above
2831  * @param src_len number of pictures (pairs and unmatched fields) in src
2832  * @param parity parity of field being currently decoded
2833  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2834  * @param long_i index into src array that holds first long reference picture,
2835  *        or src_len if no long refs present.
2836  */
2837 static int split_field_ref_list(Picture *dest, int dest_len,
2838                                 Picture *src,  int src_len,
2839                                 int parity,    int long_i){
2840
2841     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2842     dest += i;
2843     dest_len -= i;
2844
2845     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2846                                    src_len - long_i, parity);
2847     return i;
2848 }
2849
2850 /**
2851  * fills the default_ref_list.
2852  */
2853 static int fill_default_ref_list(H264Context *h){
2854     MpegEncContext * const s = &h->s;
2855     int i;
2856     int smallest_poc_greater_than_current = -1;
2857     int structure_sel;
2858     Picture sorted_short_ref[32];
2859     Picture field_entry_list[2][32];
2860     Picture *frame_list[2];
2861
2862     if (FIELD_PICTURE) {
2863         structure_sel = PICT_FRAME;
2864         frame_list[0] = field_entry_list[0];
2865         frame_list[1] = field_entry_list[1];
2866     } else {
2867         structure_sel = 0;
2868         frame_list[0] = h->default_ref_list[0];
2869         frame_list[1] = h->default_ref_list[1];
2870     }
2871
2872     if(h->slice_type_nos==FF_B_TYPE){
2873         int list;
2874         int len[2];
2875         int short_len[2];
2876         int out_i;
2877         int limit= INT_MIN;
2878
2879         /* sort frame according to POC in B slice */
2880         for(out_i=0; out_i<h->short_ref_count; out_i++){
2881             int best_i=INT_MIN;
2882             int best_poc=INT_MAX;
2883
2884             for(i=0; i<h->short_ref_count; i++){
2885                 const int poc= h->short_ref[i]->poc;
2886                 if(poc > limit && poc < best_poc){
2887                     best_poc= poc;
2888                     best_i= i;
2889                 }
2890             }
2891
2892             assert(best_i != INT_MIN);
2893
2894             limit= best_poc;
2895             sorted_short_ref[out_i]= *h->short_ref[best_i];
2896             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2897             if (-1 == smallest_poc_greater_than_current) {
2898                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2899                     smallest_poc_greater_than_current = out_i;
2900                 }
2901             }
2902         }
2903
2904         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2905
2906         // find the largest POC
2907         for(list=0; list<2; list++){
2908             int index = 0;
2909             int j= -99;
2910             int step= list ? -1 : 1;
2911
2912             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2913                 int sel;
2914                 while(j<0 || j>= h->short_ref_count){
2915                     if(j != -99 && step == (list ? -1 : 1))
2916                         return -1;
2917                     step = -step;
2918                     j= smallest_poc_greater_than_current + (step>>1);
2919                 }
2920                 sel = sorted_short_ref[j].reference | structure_sel;
2921                 if(sel != PICT_FRAME) continue;
2922                 frame_list[list][index  ]= sorted_short_ref[j];
2923                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2924             }
2925             short_len[list] = index;
2926
2927             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2928                 int sel;
2929                 if(h->long_ref[i] == NULL) continue;
2930                 sel = h->long_ref[i]->reference | structure_sel;
2931                 if(sel != PICT_FRAME) continue;
2932
2933                 frame_list[ list ][index  ]= *h->long_ref[i];
2934                 frame_list[ list ][index++].pic_id= i;
2935             }
2936             len[list] = index;
2937         }
2938
2939         for(list=0; list<2; list++){
2940             if (FIELD_PICTURE)
2941                 len[list] = split_field_ref_list(h->default_ref_list[list],
2942                                                  h->ref_count[list],
2943                                                  frame_list[list],
2944                                                  len[list],
2945                                                  s->picture_structure,
2946                                                  short_len[list]);
2947
2948             // swap the two first elements of L1 when L0 and L1 are identical
2949             if(list && len[0] > 1 && len[0] == len[1])
2950                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2951                     if(i == len[0]){
2952                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2953                         break;
2954                     }
2955
2956             if(len[list] < h->ref_count[ list ])
2957                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2958         }
2959
2960
2961     }else{
2962         int index=0;
2963         int short_len;
2964         for(i=0; i<h->short_ref_count; i++){
2965             int sel;
2966             sel = h->short_ref[i]->reference | structure_sel;
2967             if(sel != PICT_FRAME) continue;
2968             frame_list[0][index  ]= *h->short_ref[i];
2969             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2970         }
2971         short_len = index;
2972         for(i = 0; i < 16; i++){
2973             int sel;
2974             if(h->long_ref[i] == NULL) continue;
2975             sel = h->long_ref[i]->reference | structure_sel;
2976             if(sel != PICT_FRAME) continue;
2977             frame_list[0][index  ]= *h->long_ref[i];
2978             frame_list[0][index++].pic_id= i;
2979         }
2980
2981         if (FIELD_PICTURE)
2982             index = split_field_ref_list(h->default_ref_list[0],
2983                                          h->ref_count[0], frame_list[0],
2984                                          index, s->picture_structure,
2985                                          short_len);
2986
2987         if(index < h->ref_count[0])
2988             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2989     }
2990 #ifdef TRACE
2991     for (i=0; i<h->ref_count[0]; i++) {
2992         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2993     }
2994     if(h->slice_type_nos==FF_B_TYPE){
2995         for (i=0; i<h->ref_count[1]; i++) {
2996             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2997         }
2998     }
2999 #endif
3000     return 0;
3001 }
3002
3003 static void print_short_term(H264Context *h);
3004 static void print_long_term(H264Context *h);
3005
3006 /**
3007  * Extract structure information about the picture described by pic_num in
3008  * the current decoding context (frame or field). Note that pic_num is
3009  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3010  * @param pic_num picture number for which to extract structure information
3011  * @param structure one of PICT_XXX describing structure of picture
3012  *                      with pic_num
3013  * @return frame number (short term) or long term index of picture
3014  *         described by pic_num
3015  */
3016 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3017     MpegEncContext * const s = &h->s;
3018
3019     *structure = s->picture_structure;
3020     if(FIELD_PICTURE){
3021         if (!(pic_num & 1))
3022             /* opposite field */
3023             *structure ^= PICT_FRAME;
3024         pic_num >>= 1;
3025     }
3026
3027     return pic_num;
3028 }
3029
3030 static int decode_ref_pic_list_reordering(H264Context *h){
3031     MpegEncContext * const s = &h->s;
3032     int list, index, pic_structure;
3033
3034     print_short_term(h);
3035     print_long_term(h);
3036     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before function
3037
3038     for(list=0; list<h->list_count; list++){
3039         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3040
3041         if(get_bits1(&s->gb)){
3042             int pred= h->curr_pic_num;
3043
3044             for(index=0; ; index++){
3045                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3046                 unsigned int pic_id;
3047                 int i;
3048                 Picture *ref = NULL;
3049
3050                 if(reordering_of_pic_nums_idc==3)
3051                     break;
3052
3053                 if(index >= h->ref_count[list]){
3054                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3055                     return -1;
3056                 }
3057
3058                 if(reordering_of_pic_nums_idc<3){
3059                     if(reordering_of_pic_nums_idc<2){
3060                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3061                         int frame_num;
3062
3063                         if(abs_diff_pic_num > h->max_pic_num){
3064                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3065                             return -1;
3066                         }
3067
3068                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3069                         else                                pred+= abs_diff_pic_num;
3070                         pred &= h->max_pic_num - 1;
3071
3072                         frame_num = pic_num_extract(h, pred, &pic_structure);
3073
3074                         for(i= h->short_ref_count-1; i>=0; i--){
3075                             ref = h->short_ref[i];
3076                             assert(ref->reference);
3077                             assert(!ref->long_ref);
3078                             if(ref->data[0] != NULL &&
3079                                    ref->frame_num == frame_num &&
3080                                    (ref->reference & pic_structure) &&
3081                                    ref->long_ref == 0) // ignore non-existing pictures by testing data[0] pointer
3082                                 break;
3083                         }
3084                         if(i>=0)
3085                             ref->pic_id= pred;
3086                     }else{
3087                         int long_idx;
3088                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3089
3090                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3091
3092                         if(long_idx>31){
3093                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3094                             return -1;
3095                         }
3096                         ref = h->long_ref[long_idx];
3097                         assert(!(ref && !ref->reference));
3098                         if(ref && (ref->reference & pic_structure)){
3099                             ref->pic_id= pic_id;
3100                             assert(ref->long_ref);
3101                             i=0;
3102                         }else{
3103                             i=-1;
3104                         }
3105                     }
3106
3107                     if (i < 0) {
3108                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3109                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3110                     } else {
3111                         for(i=index; i+1<h->ref_count[list]; i++){
3112                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3113                                 break;
3114                         }
3115                         for(; i > index; i--){
3116                             h->ref_list[list][i]= h->ref_list[list][i-1];
3117                         }
3118                         h->ref_list[list][index]= *ref;
3119                         if (FIELD_PICTURE){
3120                             pic_as_field(&h->ref_list[list][index], pic_structure);
3121                         }
3122                     }
3123                 }else{
3124                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3125                     return -1;
3126                 }
3127             }
3128         }
3129     }
3130     for(list=0; list<h->list_count; list++){
3131         for(index= 0; index < h->ref_count[list]; index++){
3132             if(!h->ref_list[list][index].data[0])
3133                 h->ref_list[list][index]= s->current_picture;
3134         }
3135     }
3136
3137     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3138         direct_dist_scale_factor(h);
3139     direct_ref_list_init(h);
3140     return 0;
3141 }
3142
3143 static void fill_mbaff_ref_list(H264Context *h){
3144     int list, i, j;
3145     for(list=0; list<2; list++){ //FIXME try list_count
3146         for(i=0; i<h->ref_count[list]; i++){
3147             Picture *frame = &h->ref_list[list][i];
3148             Picture *field = &h->ref_list[list][16+2*i];
3149             field[0] = *frame;
3150             for(j=0; j<3; j++)
3151                 field[0].linesize[j] <<= 1;
3152             field[0].reference = PICT_TOP_FIELD;
3153             field[1] = field[0];
3154             for(j=0; j<3; j++)
3155                 field[1].data[j] += frame->linesize[j];
3156             field[1].reference = PICT_BOTTOM_FIELD;
3157
3158             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3159             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3160             for(j=0; j<2; j++){
3161                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3162                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3163             }
3164         }
3165     }
3166     for(j=0; j<h->ref_count[1]; j++){
3167         for(i=0; i<h->ref_count[0]; i++)
3168             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3169         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3170         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3171     }
3172 }
3173
3174 static int pred_weight_table(H264Context *h){
3175     MpegEncContext * const s = &h->s;
3176     int list, i;
3177     int luma_def, chroma_def;
3178
3179     h->use_weight= 0;
3180     h->use_weight_chroma= 0;
3181     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3182     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3183     luma_def = 1<<h->luma_log2_weight_denom;
3184     chroma_def = 1<<h->chroma_log2_weight_denom;
3185
3186     for(list=0; list<2; list++){
3187         for(i=0; i<h->ref_count[list]; i++){
3188             int luma_weight_flag, chroma_weight_flag;
3189
3190             luma_weight_flag= get_bits1(&s->gb);
3191             if(luma_weight_flag){
3192                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3193                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3194                 if(   h->luma_weight[list][i] != luma_def
3195                    || h->luma_offset[list][i] != 0)
3196                     h->use_weight= 1;
3197             }else{
3198                 h->luma_weight[list][i]= luma_def;
3199                 h->luma_offset[list][i]= 0;
3200             }
3201
3202             chroma_weight_flag= get_bits1(&s->gb);
3203             if(chroma_weight_flag){
3204                 int j;
3205                 for(j=0; j<2; j++){
3206                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3207                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3208                     if(   h->chroma_weight[list][i][j] != chroma_def
3209                        || h->chroma_offset[list][i][j] != 0)
3210                         h->use_weight_chroma= 1;
3211                 }
3212             }else{
3213                 int j;
3214                 for(j=0; j<2; j++){
3215                     h->chroma_weight[list][i][j]= chroma_def;
3216                     h->chroma_offset[list][i][j]= 0;
3217                 }
3218             }
3219         }
3220         if(h->slice_type_nos != FF_B_TYPE) break;
3221     }
3222     h->use_weight= h->use_weight || h->use_weight_chroma;
3223     return 0;
3224 }
3225
3226 static void implicit_weight_table(H264Context *h){
3227     MpegEncContext * const s = &h->s;
3228     int ref0, ref1;
3229     int cur_poc = s->current_picture_ptr->poc;
3230
3231     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3232        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3233         h->use_weight= 0;
3234         h->use_weight_chroma= 0;
3235         return;
3236     }
3237
3238     h->use_weight= 2;
3239     h->use_weight_chroma= 2;
3240     h->luma_log2_weight_denom= 5;
3241     h->chroma_log2_weight_denom= 5;
3242
3243     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3244         int poc0 = h->ref_list[0][ref0].poc;
3245         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3246             int poc1 = h->ref_list[1][ref1].poc;
3247             int td = av_clip(poc1 - poc0, -128, 127);
3248             if(td){
3249                 int tb = av_clip(cur_poc - poc0, -128, 127);
3250                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3251                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3252                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3253                     h->implicit_weight[ref0][ref1] = 32;
3254                 else
3255                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3256             }else
3257                 h->implicit_weight[ref0][ref1] = 32;
3258         }
3259     }
3260 }
3261
3262 /**
3263  * Mark a picture as no longer needed for reference. The refmask
3264  * argument allows unreferencing of individual fields or the whole frame.
3265  * If the picture becomes entirely unreferenced, but is being held for
3266  * display purposes, it is marked as such.
3267  * @param refmask mask of fields to unreference; the mask is bitwise
3268  *                anded with the reference marking of pic
3269  * @return non-zero if pic becomes entirely unreferenced (except possibly
3270  *         for display purposes) zero if one of the fields remains in
3271  *         reference
3272  */
3273 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3274     int i;
3275     if (pic->reference &= refmask) {
3276         return 0;
3277     } else {
3278         for(i = 0; h->delayed_pic[i]; i++)
3279             if(pic == h->delayed_pic[i]){
3280                 pic->reference=DELAYED_PIC_REF;
3281                 break;
3282             }
3283         return 1;
3284     }
3285 }
3286
3287 /**
3288  * instantaneous decoder refresh.
3289  */
3290 static void idr(H264Context *h){
3291     int i;
3292
3293     for(i=0; i<16; i++){
3294         if (h->long_ref[i] != NULL) {
3295             unreference_pic(h, h->long_ref[i], 0);
3296             h->long_ref[i]= NULL;
3297         }
3298     }
3299     h->long_ref_count=0;
3300
3301     for(i=0; i<h->short_ref_count; i++){
3302         unreference_pic(h, h->short_ref[i], 0);
3303         h->short_ref[i]= NULL;
3304     }
3305     h->short_ref_count=0;
3306     h->prev_frame_num= 0;
3307     h->prev_frame_num_offset= 0;
3308     h->prev_poc_msb=
3309     h->prev_poc_lsb= 0;
3310 }
3311
3312 /* forget old pics after a seek */
3313 static void flush_dpb(AVCodecContext *avctx){
3314     H264Context *h= avctx->priv_data;
3315     int i;
3316     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3317         if(h->delayed_pic[i])
3318             h->delayed_pic[i]->reference= 0;
3319         h->delayed_pic[i]= NULL;
3320     }
3321     h->outputed_poc= INT_MIN;
3322     idr(h);
3323     if(h->s.current_picture_ptr)
3324         h->s.current_picture_ptr->reference= 0;
3325     h->s.first_field= 0;
3326     ff_mpeg_flush(avctx);
3327 }
3328
3329 /**
3330  * Find a Picture in the short term reference list by frame number.
3331  * @param frame_num frame number to search for
3332  * @param idx the index into h->short_ref where returned picture is found
3333  *            undefined if no picture found.
3334  * @return pointer to the found picture, or NULL if no pic with the provided
3335  *                 frame number is found
3336  */
3337 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3338     MpegEncContext * const s = &h->s;
3339     int i;
3340
3341     for(i=0; i<h->short_ref_count; i++){
3342         Picture *pic= h->short_ref[i];
3343         if(s->avctx->debug&FF_DEBUG_MMCO)
3344             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3345         if(pic->frame_num == frame_num) {
3346             *idx = i;
3347             return pic;
3348         }
3349     }
3350     return NULL;
3351 }
3352
3353 /**
3354  * Remove a picture from the short term reference list by its index in
3355  * that list.  This does no checking on the provided index; it is assumed
3356  * to be valid. Other list entries are shifted down.
3357  * @param i index into h->short_ref of picture to remove.
3358  */
3359 static void remove_short_at_index(H264Context *h, int i){
3360     assert(i >= 0 && i < h->short_ref_count);
3361     h->short_ref[i]= NULL;
3362     if (--h->short_ref_count)
3363         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3364 }
3365
3366 /**
3367  *
3368  * @return the removed picture or NULL if an error occurs
3369  */
3370 static Picture * remove_short(H264Context *h, int frame_num){
3371     MpegEncContext * const s = &h->s;
3372     Picture *pic;
3373     int i;
3374
3375     if(s->avctx->debug&FF_DEBUG_MMCO)
3376         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3377
3378     pic = find_short(h, frame_num, &i);
3379     if (pic)
3380         remove_short_at_index(h, i);
3381
3382     return pic;
3383 }
3384
3385 /**
3386  * Remove a picture from the long term reference list by its index in
3387  * that list.  This does no checking on the provided index; it is assumed
3388  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3389  * @param i index into h->long_ref of picture to remove.
3390  */
3391 static void remove_long_at_index(H264Context *h, int i){
3392     h->long_ref[i]= NULL;
3393     h->long_ref_count--;
3394 }
3395
3396 /**
3397  *
3398  * @return the removed picture or NULL if an error occurs
3399  */
3400 static Picture * remove_long(H264Context *h, int i){
3401     Picture *pic;
3402
3403     pic= h->long_ref[i];
3404     if (pic)
3405         remove_long_at_index(h, i);
3406
3407     return pic;
3408 }
3409
3410 /**
3411  * print short term list
3412  */
3413 static void print_short_term(H264Context *h) {
3414     uint32_t i;
3415     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3416         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3417         for(i=0; i<h->short_ref_count; i++){
3418             Picture *pic= h->short_ref[i];
3419             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3420         }
3421     }
3422 }
3423
3424 /**
3425  * print long term list
3426  */
3427 static void print_long_term(H264Context *h) {
3428     uint32_t i;
3429     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3430         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3431         for(i = 0; i < 16; i++){
3432             Picture *pic= h->long_ref[i];
3433             if (pic) {
3434                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3435             }
3436         }
3437     }
3438 }
3439
3440 /**
3441  * Executes the reference picture marking (memory management control operations).
3442  */
3443 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3444     MpegEncContext * const s = &h->s;
3445     int i, j;
3446     int current_ref_assigned=0;
3447     Picture *pic;
3448
3449     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3450         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3451
3452     for(i=0; i<mmco_count; i++){
3453         int structure, frame_num, unref_pic;
3454         if(s->avctx->debug&FF_DEBUG_MMCO)
3455             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3456
3457         switch(mmco[i].opcode){
3458         case MMCO_SHORT2UNUSED:
3459             if(s->avctx->debug&FF_DEBUG_MMCO)
3460                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3461             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3462             pic = find_short(h, frame_num, &j);
3463             if (pic) {
3464                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3465                     remove_short_at_index(h, j);
3466             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3467                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3468             break;
3469         case MMCO_SHORT2LONG:
3470             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3471                     h->long_ref[mmco[i].long_arg]->frame_num ==
3472                                               mmco[i].short_pic_num / 2) {
3473                 /* do nothing, we've already moved this field pair. */
3474             } else {
3475                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3476
3477                 pic= remove_long(h, mmco[i].long_arg);
3478                 if(pic) unreference_pic(h, pic, 0);
3479
3480                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3481                 if (h->long_ref[ mmco[i].long_arg ]){
3482                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3483                     h->long_ref_count++;
3484                 }
3485             }
3486             break;
3487         case MMCO_LONG2UNUSED:
3488             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3489             pic = h->long_ref[j];
3490             if (pic) {
3491                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3492                     remove_long_at_index(h, j);
3493             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3494                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3495             break;
3496         case MMCO_LONG:
3497             unref_pic = 1;
3498             if (FIELD_PICTURE && !s->first_field) {
3499                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3500                     /* Just mark second field as referenced */
3501                     unref_pic = 0;
3502                 } else if (s->current_picture_ptr->reference) {
3503                     /* First field in pair is in short term list or
3504                      * at a different long term index.
3505                      * This is not allowed; see 7.4.3, notes 2 and 3.
3506                      * Report the problem and keep the pair where it is,
3507                      * and mark this field valid.
3508                      */
3509                     av_log(h->s.avctx, AV_LOG_ERROR,
3510                         "illegal long term reference assignment for second "
3511                         "field in complementary field pair (first field is "
3512                         "short term or has non-matching long index)\n");
3513                     unref_pic = 0;
3514                 }
3515             }
3516
3517             if (unref_pic) {
3518                 pic= remove_long(h, mmco[i].long_arg);
3519                 if(pic) unreference_pic(h, pic, 0);
3520
3521                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3522                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3523                 h->long_ref_count++;
3524             }
3525
3526             s->current_picture_ptr->reference |= s->picture_structure;
3527             current_ref_assigned=1;
3528             break;
3529         case MMCO_SET_MAX_LONG:
3530             assert(mmco[i].long_arg <= 16);
3531             // just remove the long term which index is greater than new max
3532             for(j = mmco[i].long_arg; j<16; j++){
3533                 pic = remove_long(h, j);
3534                 if (pic) unreference_pic(h, pic, 0);
3535             }
3536             break;
3537         case MMCO_RESET:
3538             while(h->short_ref_count){
3539                 pic= remove_short(h, h->short_ref[0]->frame_num);
3540                 if(pic) unreference_pic(h, pic, 0);
3541             }
3542             for(j = 0; j < 16; j++) {
3543                 pic= remove_long(h, j);
3544                 if(pic) unreference_pic(h, pic, 0);
3545             }
3546             s->current_picture_ptr->poc=
3547             s->current_picture_ptr->field_poc[0]=
3548             s->current_picture_ptr->field_poc[1]=
3549             h->poc_lsb=
3550             h->poc_msb=
3551             h->frame_num=
3552             s->current_picture_ptr->frame_num= 0;
3553             break;
3554         default: assert(0);
3555         }
3556     }
3557
3558     if (!current_ref_assigned && FIELD_PICTURE &&
3559             !s->first_field && s->current_picture_ptr->reference) {
3560
3561         /* Second field of complementary field pair; the first field of
3562          * which is already referenced. If short referenced, it
3563          * should be first entry in short_ref. If not, it must exist
3564          * in long_ref; trying to put it on the short list here is an
3565          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3566          */
3567         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3568             /* Just mark the second field valid */
3569             s->current_picture_ptr->reference = PICT_FRAME;
3570         } else if (s->current_picture_ptr->long_ref) {
3571             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3572                                              "assignment for second field "
3573                                              "in complementary field pair "
3574                                              "(first field is long term)\n");
3575         } else {
3576             /*
3577              * First field in reference, but not in any sensible place on our
3578              * reference lists. This shouldn't happen unless reference
3579              * handling somewhere else is wrong.
3580              */
3581             assert(0);
3582         }
3583         current_ref_assigned = 1;
3584     }
3585
3586     if(!current_ref_assigned){
3587         pic= remove_short(h, s->current_picture_ptr->frame_num);
3588         if(pic){
3589             unreference_pic(h, pic, 0);
3590             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3591         }
3592
3593         if(h->short_ref_count)
3594             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3595
3596         h->short_ref[0]= s->current_picture_ptr;
3597         h->short_ref[0]->long_ref=0;
3598         h->short_ref_count++;
3599         s->current_picture_ptr->reference |= s->picture_structure;
3600     }
3601
3602     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3603
3604         /* We have too many reference frames, probably due to corrupted
3605          * stream. Need to discard one frame. Prevents overrun of the
3606          * short_ref and long_ref buffers.
3607          */
3608         av_log(h->s.avctx, AV_LOG_ERROR,
3609                "number of reference frames exceeds max (probably "
3610                "corrupt input), discarding one\n");
3611
3612         if (h->long_ref_count && !h->short_ref_count) {
3613             for (i = 0; i < 16; ++i)
3614                 if (h->long_ref[i])
3615                     break;
3616
3617             assert(i < 16);
3618             pic = h->long_ref[i];
3619             remove_long_at_index(h, i);
3620         } else {
3621             pic = h->short_ref[h->short_ref_count - 1];
3622             remove_short_at_index(h, h->short_ref_count - 1);
3623         }
3624         unreference_pic(h, pic, 0);
3625     }
3626
3627     print_short_term(h);
3628     print_long_term(h);
3629     return 0;
3630 }
3631
3632 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3633     MpegEncContext * const s = &h->s;
3634     int i;
3635
3636     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3637         s->broken_link= get_bits1(gb) -1;
3638         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3639         if(h->mmco[0].long_arg == -1)
3640             h->mmco_index= 0;
3641         else{
3642             h->mmco[0].opcode= MMCO_LONG;
3643             h->mmco_index= 1;
3644         }
3645     }else{
3646         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3647             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3648                 MMCOOpcode opcode= get_ue_golomb(gb);
3649
3650                 h->mmco[i].opcode= opcode;
3651                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3652                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3653 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3654                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3655                         return -1;
3656                     }*/
3657                 }
3658                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3659                     unsigned int long_arg= get_ue_golomb(gb);
3660                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3661                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3662                         return -1;
3663                     }
3664                     h->mmco[i].long_arg= long_arg;
3665                 }
3666
3667                 if(opcode > (unsigned)MMCO_LONG){
3668                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3669                     return -1;
3670                 }
3671                 if(opcode == MMCO_END)
3672                     break;
3673             }
3674             h->mmco_index= i;
3675         }else{
3676             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3677
3678             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3679                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3680                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3681                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3682                 h->mmco_index= 1;
3683                 if (FIELD_PICTURE) {
3684                     h->mmco[0].short_pic_num *= 2;
3685                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3686                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3687                     h->mmco_index= 2;
3688                 }
3689             }else
3690                 h->mmco_index= 0;
3691         }
3692     }
3693
3694     return 0;
3695 }
3696
3697 static int init_poc(H264Context *h){
3698     MpegEncContext * const s = &h->s;
3699     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3700     int field_poc[2];
3701
3702         if(h->frame_num < h->prev_frame_num)
3703             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3704         else
3705             h->frame_num_offset= h->prev_frame_num_offset;
3706
3707     if(h->sps.poc_type==0){
3708         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3709
3710         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3711             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3712         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3713             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3714         else
3715             h->poc_msb = h->prev_poc_msb;
3716 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3717         field_poc[0] =
3718         field_poc[1] = h->poc_msb + h->poc_lsb;
3719         if(s->picture_structure == PICT_FRAME)
3720             field_poc[1] += h->delta_poc_bottom;
3721     }else if(h->sps.poc_type==1){
3722         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3723         int i;
3724
3725         if(h->sps.poc_cycle_length != 0)
3726             abs_frame_num = h->frame_num_offset + h->frame_num;
3727         else
3728             abs_frame_num = 0;
3729
3730         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3731             abs_frame_num--;
3732
3733         expected_delta_per_poc_cycle = 0;
3734         for(i=0; i < h->sps.poc_cycle_length; i++)
3735             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3736
3737         if(abs_frame_num > 0){
3738             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3739             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3740
3741             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3742             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3743                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3744         } else
3745             expectedpoc = 0;
3746
3747         if(h->nal_ref_idc == 0)
3748             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3749
3750         field_poc[0] = expectedpoc + h->delta_poc[0];
3751         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3752
3753         if(s->picture_structure == PICT_FRAME)
3754             field_poc[1] += h->delta_poc[1];
3755     }else{
3756         int poc;
3757             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3758             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3759         field_poc[0]= poc;
3760         field_poc[1]= poc;
3761     }
3762
3763     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3764         s->current_picture_ptr->field_poc[0]= field_poc[0];
3765         s->current_picture_ptr->poc = field_poc[0];
3766     }
3767     if(s->picture_structure != PICT_TOP_FIELD) {
3768         s->current_picture_ptr->field_poc[1]= field_poc[1];
3769         s->current_picture_ptr->poc = field_poc[1];
3770     }
3771     if(!FIELD_PICTURE || !s->first_field) {
3772         Picture *cur = s->current_picture_ptr;
3773         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3774     }
3775
3776     return 0;
3777 }
3778
3779
3780 /**
3781  * initialize scan tables
3782  */
3783 static void init_scan_tables(H264Context *h){
3784     MpegEncContext * const s = &h->s;
3785     int i;
3786     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3787         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3788         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3789     }else{
3790         for(i=0; i<16; i++){
3791 #define T(x) (x>>2) | ((x<<2) & 0xF)
3792             h->zigzag_scan[i] = T(zigzag_scan[i]);
3793             h-> field_scan[i] = T( field_scan[i]);
3794 #undef T
3795         }
3796     }
3797     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3798         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3799         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3800         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3801         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3802     }else{
3803         for(i=0; i<64; i++){
3804 #define T(x) (x>>3) | ((x&7)<<3)
3805             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3806             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3807             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3808             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3809 #undef T
3810         }
3811     }
3812     if(h->sps.transform_bypass){ //FIXME same ugly
3813         h->zigzag_scan_q0          = zigzag_scan;
3814         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3815         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3816         h->field_scan_q0           = field_scan;
3817         h->field_scan8x8_q0        = field_scan8x8;
3818         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3819     }else{
3820         h->zigzag_scan_q0          = h->zigzag_scan;
3821         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3822         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3823         h->field_scan_q0           = h->field_scan;
3824         h->field_scan8x8_q0        = h->field_scan8x8;
3825         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3826     }
3827 }
3828
3829 /**
3830  * Replicates H264 "master" context to thread contexts.
3831  */
3832 static void clone_slice(H264Context *dst, H264Context *src)
3833 {
3834     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3835     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3836     dst->s.current_picture      = src->s.current_picture;
3837     dst->s.linesize             = src->s.linesize;
3838     dst->s.uvlinesize           = src->s.uvlinesize;
3839     dst->s.first_field          = src->s.first_field;
3840
3841     dst->prev_poc_msb           = src->prev_poc_msb;
3842     dst->prev_poc_lsb           = src->prev_poc_lsb;
3843     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3844     dst->prev_frame_num         = src->prev_frame_num;
3845     dst->short_ref_count        = src->short_ref_count;
3846
3847     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3848     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3849     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3850     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3851
3852     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3853     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3854 }
3855
3856 /**
3857  * decodes a slice header.
3858  * This will also call MPV_common_init() and frame_start() as needed.
3859  *
3860  * @param h h264context
3861  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3862  *
3863  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3864  */
3865 static int decode_slice_header(H264Context *h, H264Context *h0){
3866     MpegEncContext * const s = &h->s;
3867     MpegEncContext * const s0 = &h0->s;
3868     unsigned int first_mb_in_slice;
3869     unsigned int pps_id;
3870     int num_ref_idx_active_override_flag;
3871     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3872     unsigned int slice_type, tmp, i, j;
3873     int default_ref_list_done = 0;
3874     int last_pic_structure;
3875
3876     s->dropable= h->nal_ref_idc == 0;
3877
3878     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3879         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3880         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3881     }else{
3882         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3883         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3884     }
3885
3886     first_mb_in_slice= get_ue_golomb(&s->gb);
3887
3888     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3889         h0->current_slice = 0;
3890         if (!s0->first_field)
3891             s->current_picture_ptr= NULL;
3892     }
3893
3894     slice_type= get_ue_golomb(&s->gb);
3895     if(slice_type > 9){
3896         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3897         return -1;
3898     }
3899     if(slice_type > 4){
3900         slice_type -= 5;
3901         h->slice_type_fixed=1;
3902     }else
3903         h->slice_type_fixed=0;
3904
3905     slice_type= slice_type_map[ slice_type ];
3906     if (slice_type == FF_I_TYPE
3907         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3908         default_ref_list_done = 1;
3909     }
3910     h->slice_type= slice_type;
3911     h->slice_type_nos= slice_type & 3;
3912
3913     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3914     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3915         av_log(h->s.avctx, AV_LOG_ERROR,
3916                "B picture before any references, skipping\n");
3917         return -1;
3918     }
3919
3920     pps_id= get_ue_golomb(&s->gb);
3921     if(pps_id>=MAX_PPS_COUNT){
3922         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3923         return -1;
3924     }
3925     if(!h0->pps_buffers[pps_id]) {
3926         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3927         return -1;
3928     }
3929     h->pps= *h0->pps_buffers[pps_id];
3930
3931     if(!h0->sps_buffers[h->pps.sps_id]) {
3932         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3933         return -1;
3934     }
3935     h->sps = *h0->sps_buffers[h->pps.sps_id];
3936
3937     if(h == h0 && h->dequant_coeff_pps != pps_id){
3938         h->dequant_coeff_pps = pps_id;
3939         init_dequant_tables(h);
3940     }
3941
3942     s->mb_width= h->sps.mb_width;
3943     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3944
3945     h->b_stride=  s->mb_width*4;
3946     h->b8_stride= s->mb_width*2;
3947
3948     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3949     if(h->sps.frame_mbs_only_flag)
3950         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3951     else
3952         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3953
3954     if (s->context_initialized
3955         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3956         if(h != h0)
3957             return -1;   // width / height changed during parallelized decoding
3958         free_tables(h);
3959         MPV_common_end(s);
3960     }
3961     if (!s->context_initialized) {
3962         if(h != h0)
3963             return -1;  // we cant (re-)initialize context during parallel decoding
3964         if (MPV_common_init(s) < 0)
3965             return -1;
3966         s->first_field = 0;
3967
3968         init_scan_tables(h);
3969         alloc_tables(h);
3970
3971         for(i = 1; i < s->avctx->thread_count; i++) {
3972             H264Context *c;
3973             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3974             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3975             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3976             c->sps = h->sps;
3977             c->pps = h->pps;
3978             init_scan_tables(c);
3979             clone_tables(c, h);
3980         }
3981
3982         for(i = 0; i < s->avctx->thread_count; i++)
3983             if(context_init(h->thread_context[i]) < 0)
3984                 return -1;
3985
3986         s->avctx->width = s->width;
3987         s->avctx->height = s->height;
3988         s->avctx->sample_aspect_ratio= h->sps.sar;
3989         if(!s->avctx->sample_aspect_ratio.den)
3990             s->avctx->sample_aspect_ratio.den = 1;
3991
3992         if(h->sps.timing_info_present_flag){
3993             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3994             if(h->x264_build > 0 && h->x264_build < 44)
3995                 s->avctx->time_base.den *= 2;
3996             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3997                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3998         }
3999     }
4000
4001     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4002
4003     h->mb_mbaff = 0;
4004     h->mb_aff_frame = 0;
4005     last_pic_structure = s0->picture_structure;
4006     if(h->sps.frame_mbs_only_flag){
4007         s->picture_structure= PICT_FRAME;
4008     }else{
4009         if(get_bits1(&s->gb)) { //field_pic_flag
4010             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4011         } else {
4012             s->picture_structure= PICT_FRAME;
4013             h->mb_aff_frame = h->sps.mb_aff;
4014         }
4015     }
4016
4017     if(h0->current_slice == 0){
4018         while(h->frame_num !=  h->prev_frame_num &&
4019               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
4020             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
4021             frame_start(h);
4022             h->prev_frame_num++;
4023             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
4024             s->current_picture_ptr->frame_num= h->prev_frame_num;
4025             execute_ref_pic_marking(h, NULL, 0);
4026         }
4027
4028         /* See if we have a decoded first field looking for a pair... */
4029         if (s0->first_field) {
4030             assert(s0->current_picture_ptr);
4031             assert(s0->current_picture_ptr->data[0]);
4032             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4033
4034             /* figure out if we have a complementary field pair */
4035             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4036                 /*
4037                  * Previous field is unmatched. Don't display it, but let it
4038                  * remain for reference if marked as such.
4039                  */
4040                 s0->current_picture_ptr = NULL;
4041                 s0->first_field = FIELD_PICTURE;
4042
4043             } else {
4044                 if (h->nal_ref_idc &&
4045                         s0->current_picture_ptr->reference &&
4046                         s0->current_picture_ptr->frame_num != h->frame_num) {
4047                     /*
4048                      * This and previous field were reference, but had
4049                      * different frame_nums. Consider this field first in
4050                      * pair. Throw away previous field except for reference
4051                      * purposes.
4052                      */
4053                     s0->first_field = 1;
4054                     s0->current_picture_ptr = NULL;
4055
4056                 } else {
4057                     /* Second field in complementary pair */
4058                     s0->first_field = 0;
4059                 }
4060             }
4061
4062         } else {
4063             /* Frame or first field in a potentially complementary pair */
4064             assert(!s0->current_picture_ptr);
4065             s0->first_field = FIELD_PICTURE;
4066         }
4067
4068         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4069             s0->first_field = 0;
4070             return -1;
4071         }
4072     }
4073     if(h != h0)
4074         clone_slice(h, h0);
4075
4076     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4077
4078     assert(s->mb_num == s->mb_width * s->mb_height);
4079     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4080        first_mb_in_slice                    >= s->mb_num){
4081         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4082         return -1;
4083     }
4084     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4085     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4086     if (s->picture_structure == PICT_BOTTOM_FIELD)
4087         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4088     assert(s->mb_y < s->mb_height);
4089
4090     if(s->picture_structure==PICT_FRAME){
4091         h->curr_pic_num=   h->frame_num;
4092         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4093     }else{
4094         h->curr_pic_num= 2*h->frame_num + 1;
4095         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4096     }
4097
4098     if(h->nal_unit_type == NAL_IDR_SLICE){
4099         get_ue_golomb(&s->gb); /* idr_pic_id */
4100     }
4101
4102     if(h->sps.poc_type==0){
4103         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4104
4105         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4106             h->delta_poc_bottom= get_se_golomb(&s->gb);
4107         }
4108     }
4109
4110     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4111         h->delta_poc[0]= get_se_golomb(&s->gb);
4112
4113         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4114             h->delta_poc[1]= get_se_golomb(&s->gb);
4115     }
4116
4117     init_poc(h);
4118
4119     if(h->pps.redundant_pic_cnt_present){
4120         h->redundant_pic_count= get_ue_golomb(&s->gb);
4121     }
4122
4123     //set defaults, might be overridden a few lines later
4124     h->ref_count[0]= h->pps.ref_count[0];
4125     h->ref_count[1]= h->pps.ref_count[1];
4126
4127     if(h->slice_type_nos != FF_I_TYPE){
4128         if(h->slice_type_nos == FF_B_TYPE){
4129             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4130         }
4131         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4132
4133         if(num_ref_idx_active_override_flag){
4134             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4135             if(h->slice_type_nos==FF_B_TYPE)
4136                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4137
4138             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4139                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4140                 h->ref_count[0]= h->ref_count[1]= 1;
4141                 return -1;
4142             }
4143         }
4144         if(h->slice_type_nos == FF_B_TYPE)
4145             h->list_count= 2;
4146         else
4147             h->list_count= 1;
4148     }else
4149         h->list_count= 0;
4150
4151     if(!default_ref_list_done){
4152         fill_default_ref_list(h);
4153     }
4154
4155     if(decode_ref_pic_list_reordering(h) < 0)
4156         return -1;
4157
4158     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4159        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4160         pred_weight_table(h);
4161     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4162         implicit_weight_table(h);
4163     else
4164         h->use_weight = 0;
4165
4166     if(h->nal_ref_idc)
4167         decode_ref_pic_marking(h0, &s->gb);
4168
4169     if(FRAME_MBAFF)
4170         fill_mbaff_ref_list(h);
4171
4172     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4173         tmp = get_ue_golomb(&s->gb);
4174         if(tmp > 2){
4175             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4176             return -1;
4177         }
4178         h->cabac_init_idc= tmp;
4179     }
4180
4181     h->last_qscale_diff = 0;
4182     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4183     if(tmp>51){
4184         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4185         return -1;
4186     }
4187     s->qscale= tmp;
4188     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4189     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4190     //FIXME qscale / qp ... stuff
4191     if(h->slice_type == FF_SP_TYPE){
4192         get_bits1(&s->gb); /* sp_for_switch_flag */
4193     }
4194     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4195         get_se_golomb(&s->gb); /* slice_qs_delta */
4196     }
4197
4198     h->deblocking_filter = 1;
4199     h->slice_alpha_c0_offset = 0;
4200     h->slice_beta_offset = 0;
4201     if( h->pps.deblocking_filter_parameters_present ) {
4202         tmp= get_ue_golomb(&s->gb);
4203         if(tmp > 2){
4204             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4205             return -1;
4206         }
4207         h->deblocking_filter= tmp;
4208         if(h->deblocking_filter < 2)
4209             h->deblocking_filter^= 1; // 1<->0
4210
4211         if( h->deblocking_filter ) {
4212             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4213             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4214         }
4215     }
4216
4217     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4218        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4219        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4220        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4221         h->deblocking_filter= 0;
4222
4223     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4224         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4225             /* Cheat slightly for speed:
4226                Do not bother to deblock across slices. */
4227             h->deblocking_filter = 2;
4228         } else {
4229             h0->max_contexts = 1;
4230             if(!h0->single_decode_warning) {
4231                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4232                 h0->single_decode_warning = 1;
4233             }
4234             if(h != h0)
4235                 return 1; // deblocking switched inside frame
4236         }
4237     }
4238
4239 #if 0 //FMO
4240     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4241         slice_group_change_cycle= get_bits(&s->gb, ?);
4242 #endif
4243
4244     h0->last_slice_type = slice_type;
4245     h->slice_num = ++h0->current_slice;
4246
4247     for(j=0; j<2; j++){
4248         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4249         ref2frm[0]=
4250         ref2frm[1]= -1;
4251         for(i=0; i<48; i++)
4252             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4253                           +(h->ref_list[j][i].reference&3);
4254     }
4255
4256     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4257     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4258
4259     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4260         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4261                h->slice_num,
4262                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4263                first_mb_in_slice,
4264                av_get_pict_type_char(h->slice_type),
4265                pps_id, h->frame_num,
4266                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4267                h->ref_count[0], h->ref_count[1],
4268                s->qscale,
4269                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4270                h->use_weight,
4271                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4272                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4273                );
4274     }
4275
4276     return 0;
4277 }
4278
4279 /**
4280  *
4281  */
4282 static inline int get_level_prefix(GetBitContext *gb){
4283     unsigned int buf;
4284     int log;
4285
4286     OPEN_READER(re, gb);
4287     UPDATE_CACHE(re, gb);
4288     buf=GET_CACHE(re, gb);
4289
4290     log= 32 - av_log2(buf);
4291 #ifdef TRACE
4292     print_bin(buf>>(32-log), log);
4293     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4294 #endif
4295
4296     LAST_SKIP_BITS(re, gb, log);
4297     CLOSE_READER(re, gb);
4298
4299     return log-1;
4300 }
4301
4302 static inline int get_dct8x8_allowed(H264Context *h){
4303     int i;
4304     for(i=0; i<4; i++){
4305         if(!IS_SUB_8X8(h->sub_mb_type[i])
4306            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4307             return 0;
4308     }
4309     return 1;
4310 }
4311
4312 /**
4313  * decodes a residual block.
4314  * @param n block index
4315  * @param scantable scantable
4316  * @param max_coeff number of coefficients in the block
4317  * @return <0 if an error occurred
4318  */
4319 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4320     MpegEncContext * const s = &h->s;
4321     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4322     int level[16];
4323     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4324
4325     //FIXME put trailing_onex into the context
4326
4327     if(n == CHROMA_DC_BLOCK_INDEX){
4328         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4329         total_coeff= coeff_token>>2;
4330     }else{
4331         if(n == LUMA_DC_BLOCK_INDEX){
4332             total_coeff= pred_non_zero_count(h, 0);
4333             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4334             total_coeff= coeff_token>>2;
4335         }else{
4336             total_coeff= pred_non_zero_count(h, n);
4337             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4338             total_coeff= coeff_token>>2;
4339             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4340         }
4341     }
4342
4343     //FIXME set last_non_zero?
4344
4345     if(total_coeff==0)
4346         return 0;
4347     if(total_coeff > (unsigned)max_coeff) {
4348         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4349         return -1;
4350     }
4351
4352     trailing_ones= coeff_token&3;
4353     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4354     assert(total_coeff<=16);
4355
4356     for(i=0; i<trailing_ones; i++){
4357         level[i]= 1 - 2*get_bits1(gb);
4358     }
4359
4360     if(i<total_coeff) {
4361         int level_code, mask;
4362         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4363         int prefix= get_level_prefix(gb);
4364
4365         //first coefficient has suffix_length equal to 0 or 1
4366         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4367             if(suffix_length)
4368                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4369             else
4370                 level_code= (prefix<<suffix_length); //part
4371         }else if(prefix==14){
4372             if(suffix_length)
4373                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4374             else
4375                 level_code= prefix + get_bits(gb, 4); //part
4376         }else{
4377             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4378             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4379             if(prefix>=16)
4380                 level_code += (1<<(prefix-3))-4096;
4381         }
4382
4383         if(trailing_ones < 3) level_code += 2;
4384
4385         suffix_length = 1;
4386         if(level_code > 5)
4387             suffix_length++;
4388         mask= -(level_code&1);
4389         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4390         i++;
4391
4392         //remaining coefficients have suffix_length > 0
4393         for(;i<total_coeff;i++) {
4394             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4395             prefix = get_level_prefix(gb);
4396             if(prefix<15){
4397                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4398             }else{
4399                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4400                 if(prefix>=16)
4401                     level_code += (1<<(prefix-3))-4096;
4402             }
4403             mask= -(level_code&1);
4404             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4405             if(level_code > suffix_limit[suffix_length])
4406                 suffix_length++;
4407         }
4408     }
4409
4410     if(total_coeff == max_coeff)
4411         zeros_left=0;
4412     else{
4413         if(n == CHROMA_DC_BLOCK_INDEX)
4414             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4415         else
4416             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4417     }
4418
4419     coeff_num = zeros_left + total_coeff - 1;
4420     j = scantable[coeff_num];
4421     if(n > 24){
4422         block[j] = level[0];
4423         for(i=1;i<total_coeff;i++) {
4424             if(zeros_left <= 0)
4425                 run_before = 0;
4426             else if(zeros_left < 7){
4427                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4428             }else{
4429                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4430             }
4431             zeros_left -= run_before;
4432             coeff_num -= 1 + run_before;
4433             j= scantable[ coeff_num ];
4434
4435             block[j]= level[i];
4436         }
4437     }else{
4438         block[j] = (level[0] * qmul[j] + 32)>>6;
4439         for(i=1;i<total_coeff;i++) {
4440             if(zeros_left <= 0)
4441                 run_before = 0;
4442             else if(zeros_left < 7){
4443                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4444             }else{
4445                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4446             }
4447             zeros_left -= run_before;
4448             coeff_num -= 1 + run_before;
4449             j= scantable[ coeff_num ];
4450
4451             block[j]= (level[i] * qmul[j] + 32)>>6;
4452         }
4453     }
4454
4455     if(zeros_left<0){
4456         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4457         return -1;
4458     }
4459
4460     return 0;
4461 }
4462
4463 static void predict_field_decoding_flag(H264Context *h){
4464     MpegEncContext * const s = &h->s;
4465     const int mb_xy= h->mb_xy;
4466     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4467                 ? s->current_picture.mb_type[mb_xy-1]
4468                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4469                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4470                 : 0;
4471     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4472 }
4473
4474 /**
4475  * decodes a P_SKIP or B_SKIP macroblock
4476  */
4477 static void decode_mb_skip(H264Context *h){
4478     MpegEncContext * const s = &h->s;
4479     const int mb_xy= h->mb_xy;
4480     int mb_type=0;
4481
4482     memset(h->non_zero_count[mb_xy], 0, 16);
4483     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4484
4485     if(MB_FIELD)
4486         mb_type|= MB_TYPE_INTERLACED;
4487
4488     if( h->slice_type_nos == FF_B_TYPE )
4489     {
4490         // just for fill_caches. pred_direct_motion will set the real mb_type
4491         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4492
4493         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4494         pred_direct_motion(h, &mb_type);
4495         mb_type|= MB_TYPE_SKIP;
4496     }
4497     else
4498     {
4499         int mx, my;
4500         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4501
4502         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4503         pred_pskip_motion(h, &mx, &my);
4504         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4505         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4506     }
4507
4508     write_back_motion(h, mb_type);
4509     s->current_picture.mb_type[mb_xy]= mb_type;
4510     s->current_picture.qscale_table[mb_xy]= s->qscale;
4511     h->slice_table[ mb_xy ]= h->slice_num;
4512     h->prev_mb_skipped= 1;
4513 }
4514
4515 /**
4516  * decodes a macroblock (CAVLC entropy coding)
      * Reads the skip run / macroblock type, the prediction information
      * (intra prediction modes, or reference indexes and motion vectors),
      * the coded block pattern and the residual from the bitstream, and
      * stores the results in the context and in the current picture tables.
4517  * @returns 0 if OK, -1 if an error is noticed
4518  */
4519 static int decode_mb_cavlc(H264Context *h){
4520     MpegEncContext * const s = &h->s;
4521     int mb_xy;
4522     int partition_count;
4523     unsigned int mb_type, cbp;
4524     int dct8x8_allowed= h->pps.transform_8x8_mode;
4525
4526     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4527
4528     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4529
4530     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4531     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4532                 down the code */
         // skipped macroblocks only exist outside of I slices
4533     if(h->slice_type_nos != FF_I_TYPE){
             // mb_skip_run == -1: a new skip run has to be read from the bitstream
4534         if(s->mb_skip_run==-1)
4535             s->mb_skip_run= get_ue_golomb(&s->gb);
4536
             // run was nonzero before the decrement: this macroblock is skipped
4537         if (s->mb_skip_run--) {
4538             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4539                 if(s->mb_skip_run==0)
4540                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4541                 else
4542                     predict_field_decoding_flag(h);
4543             }
4544             decode_mb_skip(h);
4545             return 0;
4546         }
4547     }
         // in MBAFF frames the field decoding flag is read on even macroblock rows only
4548     if(FRAME_MBAFF){
4549         if( (s->mb_y&1) == 0 )
4550             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4551     }else
4552         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4553
4554     h->prev_mb_skipped= 0;
4555
         // raw mb_type code; codes past the B (23) / P (5) tables are intra
         // types and are re-based to index i_mb_type_info
4556     mb_type= get_ue_golomb(&s->gb);
4557     if(h->slice_type_nos == FF_B_TYPE){
4558         if(mb_type < 23){
4559             partition_count= b_mb_type_info[mb_type].partition_count;
4560             mb_type=         b_mb_type_info[mb_type].type;
4561         }else{
4562             mb_type -= 23;
4563             goto decode_intra_mb;
4564         }
4565     }else if(h->slice_type_nos == FF_P_TYPE){
4566         if(mb_type < 5){
4567             partition_count= p_mb_type_info[mb_type].partition_count;
4568             mb_type=         p_mb_type_info[mb_type].type;
4569         }else{
4570             mb_type -= 5;
4571             goto decode_intra_mb;
4572         }
4573     }else{
4574        assert(h->slice_type_nos == FF_I_TYPE);
4575         if(h->slice_type == FF_SI_TYPE && mb_type)
4576             mb_type--;
4577 decode_intra_mb:
4578         if(mb_type > 25){
4579             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4580             return -1;
4581         }
4582         partition_count=0;
             // for intra macroblocks cbp and the 16x16 prediction mode come from the type table
4583         cbp= i_mb_type_info[mb_type].cbp;
4584         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4585         mb_type= i_mb_type_info[mb_type].type;
4586     }
4587
4588     if(MB_FIELD)
4589         mb_type |= MB_TYPE_INTERLACED;
4590
4591     h->slice_table[ mb_xy ]= h->slice_num;
4592
4593     if(IS_INTRA_PCM(mb_type)){
4594         unsigned int x, y;
4595
4596         // We assume these blocks are very rare so we do not optimize it.
4597         align_get_bits(&s->gb);
4598
4599         // The pixels are stored in the same order as levels in h->mb array.
             // 16x16 luma samples
4600         for(y=0; y<16; y++){
4601             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4602             for(x=0; x<16; x++){
4603                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4604                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4605             }
4606         }
             // 8x8 samples of the first chroma plane, stored from h->mb[256] on
4607         for(y=0; y<8; y++){
4608             const int index= 256 + 4*(y&3) + 32*(y>>2);
4609             for(x=0; x<8; x++){
4610                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4611                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4612             }
4613         }
             // 8x8 samples of the second chroma plane, stored from h->mb[256+64] on
4614         for(y=0; y<8; y++){
4615             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4616             for(x=0; x<8; x++){
4617                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4618                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4619             }
4620         }
4621
4622         // In deblocking, the quantizer is 0
4623         s->current_picture.qscale_table[mb_xy]= 0;
4624         // All coeffs are present
4625         memset(h->non_zero_count[mb_xy], 16, 16);
4626
4627         s->current_picture.mb_type[mb_xy]= mb_type;
4628         return 0;
4629     }
4630
         // ref_count is doubled while an MBAFF macroblock is parsed and halved
         // again at the end of this function.
         // NOTE(review): the "return -1" error paths in between leave it doubled.
4631     if(MB_MBAFF){
4632         h->ref_count[0] <<= 1;
4633         h->ref_count[1] <<= 1;
4634     }
4635
4636     fill_caches(h, mb_type, 0);
4637
4638     //mb_pred
4639     if(IS_INTRA(mb_type)){
4640             int pred_mode;
4641 //            init_top_left_availability(h);
4642             if(IS_INTRA4x4(mb_type)){
4643                 int i;
4644                 int di = 1;
                     // with the 8x8 transform one prediction mode is coded per
                     // 8x8 block, i.e. for every 4th block index
4645                 if(dct8x8_allowed && get_bits1(&s->gb)){
4646                     mb_type |= MB_TYPE_8x8DCT;
4647                     di = 4;
4648                 }
4649
4650 //                fill_intra4x4_pred_table(h);
4651                 for(i=0; i<16; i+=di){
4652                     int mode= pred_intra_mode(h, i);
4653
                         // a 0 bit means the predicted mode is not used: read the
                         // 3 bit remainder, which skips over the predicted mode
4654                     if(!get_bits1(&s->gb)){
4655                         const int rem_mode= get_bits(&s->gb, 3);
4656                         mode = rem_mode + (rem_mode >= mode);
4657                     }
4658
4659                     if(di==4)
4660                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4661                     else
4662                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4663                 }
4664                 write_back_intra_pred_mode(h);
4665                 if( check_intra4x4_pred_mode(h) < 0)
4666                     return -1;
4667             }else{
4668                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4669                 if(h->intra16x16_pred_mode < 0)
4670                     return -1;
4671             }
4672
                 // chroma prediction mode, coded for every intra macroblock that reaches this point
4673             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4674             if(pred_mode < 0)
4675                 return -1;
4676             h->chroma_pred_mode= pred_mode;
4677     }else if(partition_count==4){
             // four 8x8 sub macroblocks: all sub types first, then all reference
             // indexes, then the motion vectors
4678         int i, j, sub_partition_count[4], list, ref[2][4];
4679
4680         if(h->slice_type_nos == FF_B_TYPE){
4681             for(i=0; i<4; i++){
4682                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4683                 if(h->sub_mb_type[i] >=13){
4684                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4685                     return -1;
4686                 }
4687                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4688                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4689             }
4690             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4691                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4692                 pred_direct_motion(h, &mb_type);
4693                 h->ref_cache[0][scan8[4]] =
4694                 h->ref_cache[1][scan8[4]] =
4695                 h->ref_cache[0][scan8[12]] =
4696                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4697             }
4698         }else{
4699             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4700             for(i=0; i<4; i++){
4701                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4702                 if(h->sub_mb_type[i] >=4){
4703                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4704                     return -1;
4705                 }
4706                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4707                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4708             }
4709         }
4710
             // reference indexes of the non direct sub macroblocks
4711         for(list=0; list<h->list_count; list++){
4712             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4713             for(i=0; i<4; i++){
4714                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4715                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4716                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4717                     if(tmp>=ref_count){
4718                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4719                         return -1;
4720                     }
4721                     ref[list][i]= tmp;
4722                 }else{
4723                  //FIXME
4724                     ref[list][i] = -1;
4725                 }
4726             }
4727         }
4728
4729         if(dct8x8_allowed)
4730             dct8x8_allowed = get_dct8x8_allowed(h);
4731
             // motion vectors: prediction plus the coded difference per sub partition
4732         for(list=0; list<h->list_count; list++){
4733             for(i=0; i<4; i++){
4734                 if(IS_DIRECT(h->sub_mb_type[i])) {
4735                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4736                     continue;
4737                 }
4738                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4739                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4740
4741                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4742                     const int sub_mb_type= h->sub_mb_type[i];
4743                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4744                     for(j=0; j<sub_partition_count[i]; j++){
4745                         int mx, my;
4746                         const int index= 4*i + block_width*j;
4747                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4748                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4749                         mx += get_se_golomb(&s->gb);
4750                         my += get_se_golomb(&s->gb);
4751                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4752
                             // replicate the vector over the cache entries covered by the sub partition
4753                         if(IS_SUB_8X8(sub_mb_type)){
4754                             mv_cache[ 1 ][0]=
4755                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4756                             mv_cache[ 1 ][1]=
4757                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4758                         }else if(IS_SUB_8X4(sub_mb_type)){
4759                             mv_cache[ 1 ][0]= mx;
4760                             mv_cache[ 1 ][1]= my;
4761                         }else if(IS_SUB_4X8(sub_mb_type)){
4762                             mv_cache[ 8 ][0]= mx;
4763                             mv_cache[ 8 ][1]= my;
4764                         }
4765                         mv_cache[ 0 ][0]= mx;
4766                         mv_cache[ 0 ][1]= my;
4767                     }
4768                 }else{
                         // list not used by this sub macroblock: zero its four vectors
4769                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4770                     p[0] = p[1]=
4771                     p[8] = p[9]= 0;
4772                 }
4773             }
4774         }
4775     }else if(IS_DIRECT(mb_type)){
4776         pred_direct_motion(h, &mb_type);
4777         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4778     }else{
             // 16x16, 16x8 or 8x16 partitions: the reference indexes of all lists
             // are read first, then the motion vector differences
4779         int list, mx, my, i;
4780          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4781         if(IS_16X16(mb_type)){
4782             for(list=0; list<h->list_count; list++){
4783                     unsigned int val;
4784                     if(IS_DIR(mb_type, 0, list)){
4785                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4786                         if(val >= h->ref_count[list]){
4787                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4788                             return -1;
4789                         }
4790                     }else
4791                         val= LIST_NOT_USED&0xFF;
4792                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4793             }
4794             for(list=0; list<h->list_count; list++){
4795                 unsigned int val;
4796                 if(IS_DIR(mb_type, 0, list)){
4797                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4798                     mx += get_se_golomb(&s->gb);
4799                     my += get_se_golomb(&s->gb);
4800                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4801
4802                     val= pack16to32(mx,my);
4803                 }else
4804                     val=0;
4805                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4806             }
4807         }
4808         else if(IS_16X8(mb_type)){
4809             for(list=0; list<h->list_count; list++){
4810                     for(i=0; i<2; i++){
4811                         unsigned int val;
4812                         if(IS_DIR(mb_type, i, list)){
4813                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4814                             if(val >= h->ref_count[list]){
4815                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4816                                 return -1;
4817                             }
4818                         }else
4819                             val= LIST_NOT_USED&0xFF;
4820                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4821                     }
4822             }
4823             for(list=0; list<h->list_count; list++){
4824                 for(i=0; i<2; i++){
4825                     unsigned int val;
4826                     if(IS_DIR(mb_type, i, list)){
4827                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4828                         mx += get_se_golomb(&s->gb);
4829                         my += get_se_golomb(&s->gb);
4830                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4831
4832                         val= pack16to32(mx,my);
4833                     }else
4834                         val=0;
4835                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4836                 }
4837             }
4838         }else{
4839             assert(IS_8X16(mb_type));
4840             for(list=0; list<h->list_count; list++){
4841                     for(i=0; i<2; i++){
4842                         unsigned int val;
4843                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4844                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4845                             if(val >= h->ref_count[list]){
4846                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4847                                 return -1;
4848                             }
4849                         }else
4850                             val= LIST_NOT_USED&0xFF;
4851                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4852                     }
4853             }
4854             for(list=0; list<h->list_count; list++){
4855                 for(i=0; i<2; i++){
4856                     unsigned int val;
4857                     if(IS_DIR(mb_type, i, list)){
4858                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4859                         mx += get_se_golomb(&s->gb);
4860                         my += get_se_golomb(&s->gb);
4861                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4862
4863                         val= pack16to32(mx,my);
4864                     }else
4865                         val=0;
4866                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4867                 }
4868             }
4869         }
4870     }
4871
4872     if(IS_INTER(mb_type))
4873         write_back_motion(h, mb_type);
4874
         // coded block pattern: read from the bitstream for everything except
         // intra 16x16, which keeps the cbp taken from i_mb_type_info above
4875     if(!IS_INTRA16x16(mb_type)){
4876         cbp= get_ue_golomb(&s->gb);
4877         if(cbp > 47){
4878             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4879             return -1;
4880         }
4881
4882         if(IS_INTRA4x4(mb_type))
4883             cbp= golomb_to_intra4x4_cbp[cbp];
4884         else
4885             cbp= golomb_to_inter_cbp[cbp];
4886     }
4887     h->cbp = cbp;
4888
         // non intra macroblocks with coded luma signal the 8x8 transform here
4889     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4890         if(get_bits1(&s->gb)){
4891             mb_type |= MB_TYPE_8x8DCT;
4892             h->cbp_table[mb_xy]= cbp;
4893         }
4894     }
4895     s->current_picture.mb_type[mb_xy]= mb_type;
4896
         // residual: present if any cbp bit is set, and always for intra 16x16
4897     if(cbp || IS_INTRA16x16(mb_type)){
4898         int i8x8, i4x4, chroma_idx;
4899         int dquant;
4900         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4901         const uint8_t *scan, *scan8x8, *dc_scan;
4902
4903 //        fill_non_zero_count_cache(h);
4904
             // scan tables depend on field/frame coding; the *_q0 variants are
             // used while qscale is 0 (before dquant below is applied)
4905         if(IS_INTERLACED(mb_type)){
4906             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4907             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4908             dc_scan= luma_dc_field_scan;
4909         }else{
4910             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4911             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4912             dc_scan= luma_dc_zigzag_scan;
4913         }
4914
4915         dquant= get_se_golomb(&s->gb);
4916
4917         if( dquant > 25 || dquant < -26 ){
4918             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4919             return -1;
4920         }
4921
             // apply the delta and wrap qscale back into 0..51
4922         s->qscale += dquant;
4923         if(((unsigned)s->qscale) > 51){
4924             if(s->qscale<0) s->qscale+= 52;
4925             else            s->qscale-= 52;
4926         }
4927
4928         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4929         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4930         if(IS_INTRA16x16(mb_type)){
                 // intra 16x16: one luma DC block, then 15 coefficient AC blocks if cbp&15
4931             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4932                 return -1; //FIXME continue if partitioned and other return -1 too
4933             }
4934
4935             assert((cbp&15) == 0 || (cbp&15) == 15);
4936
4937             if(cbp&15){
4938                 for(i8x8=0; i8x8<4; i8x8++){
4939                     for(i4x4=0; i4x4<4; i4x4++){
4940                         const int index= i4x4 + 4*i8x8;
4941                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4942                             return -1;
4943                         }
4944                     }
4945                 }
4946             }else{
4947                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4948             }
4949         }else{
                 // other types: bit i8x8 of cbp tells whether luma 8x8 block i8x8 is coded
4950             for(i8x8=0; i8x8<4; i8x8++){
4951                 if(cbp & (1<<i8x8)){
4952                     if(IS_8x8DCT(mb_type)){
                             // an 8x8 block is read as four 16 coefficient parts into
                             // the same buffer, each with its own part of scan8x8
4953                         DCTELEM *buf = &h->mb[64*i8x8];
4954                         uint8_t *nnz;
4955                         for(i4x4=0; i4x4<4; i4x4++){
4956                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4957                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4958                                 return -1;
4959                         }
                             // the first cache entry of the 8x8 block gets the sum of all four counts
4960                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4961                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4962                     }else{
4963                         for(i4x4=0; i4x4<4; i4x4++){
4964                             const int index= i4x4 + 4*i8x8;
4965
4966                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4967                                 return -1;
4968                             }
4969                         }
4970                     }
4971                 }else{
4972                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4973                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4974                 }
4975             }
4976         }
4977
             // cbp&0x30: chroma DC blocks are coded (4 coefficients per plane, stored unscaled)
4978         if(cbp&0x30){
4979             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4980                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4981                     return -1;
4982                 }
4983         }
4984
             // cbp&0x20: chroma AC blocks are coded as well (15 coefficients each)
4985         if(cbp&0x20){
4986             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4987                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4988                 for(i4x4=0; i4x4<4; i4x4++){
4989                     const int index= 16 + 4*chroma_idx + i4x4;
4990                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4991                         return -1;
4992                     }
4993                 }
4994             }
4995         }else{
4996             uint8_t * const nnz= &h->non_zero_count_cache[0];
4997             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4998             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4999         }
5000     }else{
             // no residual at all: clear the luma and chroma non zero counts
5001         uint8_t * const nnz= &h->non_zero_count_cache[0];
5002         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5003         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5004         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5005     }
5006     s->current_picture.qscale_table[mb_xy]= s->qscale;
5007     write_back_non_zero_count(h);
5008
         // undo the MBAFF doubling of ref_count done before fill_caches()
5009     if(MB_MBAFF){
5010         h->ref_count[0] >>= 1;
5011         h->ref_count[1] >>= 1;
5012     }
5013
5014     return 0;
5015 }
5016
5017 static int decode_cabac_field_decoding_flag(H264Context *h) {
5018     MpegEncContext * const s = &h->s;
5019     const int mb_x = s->mb_x;
5020     const int mb_y = s->mb_y & ~1;
5021     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5022     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5023
5024     unsigned int ctx = 0;
5025
5026     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5027         ctx += 1;
5028     }
5029     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5030         ctx += 1;
5031     }
5032
5033     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5034 }
5035
5036 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5037     uint8_t *state= &h->cabac_state[ctx_base];
5038     int mb_type;
5039
5040     if(intra_slice){
5041         MpegEncContext * const s = &h->s;
5042         const int mba_xy = h->left_mb_xy[0];
5043         const int mbb_xy = h->top_mb_xy;
5044         int ctx=0;
5045         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5046             ctx++;
5047         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5048             ctx++;
5049         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5050             return 0;   /* I4x4 */
5051         state += 2;
5052     }else{
5053         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5054             return 0;   /* I4x4 */
5055     }
5056
5057     if( get_cabac_terminate( &h->cabac ) )
5058         return 25;  /* PCM */
5059
5060     mb_type = 1; /* I16x16 */
5061     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5062     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5063         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5064     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5065     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5066     return mb_type;
5067 }
5068
5069 static int decode_cabac_mb_type( H264Context *h ) {
5070     MpegEncContext * const s = &h->s;
5071
5072     if( h->slice_type_nos == FF_I_TYPE ) {
5073         return decode_cabac_intra_mb_type(h, 3, 1);
5074     } else if( h->slice_type_nos == FF_P_TYPE ) {
5075         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5076             /* P-type */
5077             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5078                 /* P_L0_D16x16, P_8x8 */
5079                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5080             } else {
5081                 /* P_L0_D8x16, P_L0_D16x8 */
5082                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5083             }
5084         } else {
5085             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5086         }
5087     } else if( h->slice_type_nos == FF_B_TYPE ) {
5088         const int mba_xy = h->left_mb_xy[0];
5089         const int mbb_xy = h->top_mb_xy;
5090         int ctx = 0;
5091         int bits;
5092
5093         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5094             ctx++;
5095         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5096             ctx++;
5097
5098         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5099             return 0; /* B_Direct_16x16 */
5100
5101         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5102             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5103         }
5104
5105         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5106         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5107         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5108         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5109         if( bits < 8 )
5110             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5111         else if( bits == 13 ) {
5112             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5113         } else if( bits == 14 )
5114             return 11; /* B_L1_L0_8x16 */
5115         else if( bits == 15 )
5116             return 22; /* B_8x8 */
5117
5118         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5119         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5120     } else {
5121         /* TODO SI/SP frames? */
5122         return -1;
5123     }
5124 }
5125
5126 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5127     MpegEncContext * const s = &h->s;
5128     int mba_xy, mbb_xy;
5129     int ctx = 0;
5130
5131     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5132         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5133         mba_xy = mb_xy - 1;
5134         if( (mb_y&1)
5135             && h->slice_table[mba_xy] == h->slice_num
5136             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5137             mba_xy += s->mb_stride;
5138         if( MB_FIELD ){
5139             mbb_xy = mb_xy - s->mb_stride;
5140             if( !(mb_y&1)
5141                 && h->slice_table[mbb_xy] == h->slice_num
5142                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5143                 mbb_xy -= s->mb_stride;
5144         }else
5145             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5146     }else{
5147         int mb_xy = h->mb_xy;
5148         mba_xy = mb_xy - 1;
5149         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5150     }
5151
5152     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5153         ctx++;
5154     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5155         ctx++;
5156
5157     if( h->slice_type_nos == FF_B_TYPE )
5158         ctx += 13;
5159     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5160 }
5161
5162 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5163     int mode = 0;
5164
5165     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5166         return pred_mode;
5167
5168     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5169     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5170     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5171
5172     if( mode >= pred_mode )
5173         return mode + 1;
5174     else
5175         return mode;
5176 }
5177
5178 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5179     const int mba_xy = h->left_mb_xy[0];
5180     const int mbb_xy = h->top_mb_xy;
5181
5182     int ctx = 0;
5183
5184     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5185     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5186         ctx++;
5187
5188     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5189         ctx++;
5190
5191     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5192         return 0;
5193
5194     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5195         return 1;
5196     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5197         return 2;
5198     else
5199         return 3;
5200 }
5201
5202 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5203     int cbp_b, cbp_a, ctx, cbp = 0;
5204
5205     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5206     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5207
5208     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5209     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5210     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5211     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5212     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5213     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5214     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5215     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5216     return cbp;
5217 }
5218 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5219     int ctx;
5220     int cbp_a, cbp_b;
5221
5222     cbp_a = (h->left_cbp>>4)&0x03;
5223     cbp_b = (h-> top_cbp>>4)&0x03;
5224
5225     ctx = 0;
5226     if( cbp_a > 0 ) ctx++;
5227     if( cbp_b > 0 ) ctx += 2;
5228     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5229         return 0;
5230
5231     ctx = 4;
5232     if( cbp_a == 2 ) ctx++;
5233     if( cbp_b == 2 ) ctx += 2;
5234     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5235 }
5236 static int decode_cabac_mb_dqp( H264Context *h) {
5237     int   ctx = 0;
5238     int   val = 0;
5239
5240     if( h->last_qscale_diff != 0 )
5241         ctx++;
5242
5243     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5244         if( ctx < 2 )
5245             ctx = 2;
5246         else
5247             ctx = 3;
5248         val++;
5249         if(val > 102) //prevent infinite loop
5250             return INT_MIN;
5251     }
5252
5253     if( val&0x01 )
5254         return (val + 1)/2;
5255     else
5256         return -(val + 1)/2;
5257 }
5258 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5259     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5260         return 0;   /* 8x8 */
5261     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5262         return 1;   /* 8x4 */
5263     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5264         return 2;   /* 4x8 */
5265     return 3;       /* 4x4 */
5266 }
5267 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5268     int type;
5269     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5270         return 0;   /* B_Direct_8x8 */
5271     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5272         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5273     type = 3;
5274     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5275         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5276             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5277         type += 4;
5278     }
5279     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5280     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5281     return type;
5282 }
5283
5284 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5285     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5286 }
5287
5288 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5289     int refa = h->ref_cache[list][scan8[n] - 1];
5290     int refb = h->ref_cache[list][scan8[n] - 8];
5291     int ref  = 0;
5292     int ctx  = 0;
5293
5294     if( h->slice_type_nos == FF_B_TYPE) {
5295         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5296             ctx++;
5297         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5298             ctx += 2;
5299     } else {
5300         if( refa > 0 )
5301             ctx++;
5302         if( refb > 0 )
5303             ctx += 2;
5304     }
5305
5306     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5307         ref++;
5308         if( ctx < 4 )
5309             ctx = 4;
5310         else
5311             ctx = 5;
5312         if(ref >= 32 /*h->ref_list[list]*/){
5313             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5314             return 0; //FIXME we should return -1 and check the return everywhere
5315         }
5316     }
5317     return ref;
5318 }
5319
5320 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5321     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5322                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5323     int ctxbase = (l == 0) ? 40 : 47;
5324     int ctx, mvd;
5325
5326     if( amvd < 3 )
5327         ctx = 0;
5328     else if( amvd > 32 )
5329         ctx = 2;
5330     else
5331         ctx = 1;
5332
5333     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5334         return 0;
5335
5336     mvd= 1;
5337     ctx= 3;
5338     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5339         mvd++;
5340         if( ctx < 6 )
5341             ctx++;
5342     }
5343
5344     if( mvd >= 9 ) {
5345         int k = 3;
5346         while( get_cabac_bypass( &h->cabac ) ) {
5347             mvd += 1 << k;
5348             k++;
5349             if(k>24){
5350                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5351                 return INT_MIN;
5352             }
5353         }
5354         while( k-- ) {
5355             if( get_cabac_bypass( &h->cabac ) )
5356                 mvd += 1 << k;
5357         }
5358     }
5359     return get_cabac_bypass_sign( &h->cabac, -mvd );
5360 }
5361
5362 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5363     int nza, nzb;
5364     int ctx = 0;
5365
5366     if( is_dc ) {
5367         if( cat == 0 ) {
5368             nza = h->left_cbp&0x100;
5369             nzb = h-> top_cbp&0x100;
5370         } else {
5371             nza = (h->left_cbp>>(6+idx))&0x01;
5372             nzb = (h-> top_cbp>>(6+idx))&0x01;
5373         }
5374     } else {
5375         if( cat == 4 ) {
5376             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5377             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5378         } else {
5379             assert(cat == 1 || cat == 2);
5380             nza = h->non_zero_count_cache[scan8[idx] - 1];
5381             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5382         }
5383     }
5384
5385     if( nza > 0 )
5386         ctx++;
5387
5388     if( nzb > 0 )
5389         ctx += 2;
5390
5391     return ctx + 4 * cat;
5392 }
5393
5394 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5395     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5396     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5397     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5398     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5399 };
5400
5401 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5402     static const int significant_coeff_flag_offset[2][6] = {
5403       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5404       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5405     };
5406     static const int last_coeff_flag_offset[2][6] = {
5407       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5408       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5409     };
5410     static const int coeff_abs_level_m1_offset[6] = {
5411         227+0, 227+10, 227+20, 227+30, 227+39, 426
5412     };
5413     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5414       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5415         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5416         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5417        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5418       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5419         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5420         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5421         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5422     };
5423     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5424      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5425      * map node ctx => cabac ctx for level=1 */
5426     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5427     /* map node ctx => cabac ctx for level>1 */
5428     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5429     static const uint8_t coeff_abs_level_transition[2][8] = {
5430     /* update node ctx after decoding a level=1 */
5431         { 1, 2, 3, 3, 4, 5, 6, 7 },
5432     /* update node ctx after decoding a level>1 */
5433         { 4, 4, 4, 4, 5, 6, 7, 7 }
5434     };
5435
5436     int index[64];
5437
5438     int av_unused last;
5439     int coeff_count = 0;
5440     int node_ctx = 0;
5441
5442     uint8_t *significant_coeff_ctx_base;
5443     uint8_t *last_coeff_ctx_base;
5444     uint8_t *abs_level_m1_ctx_base;
5445
5446 #ifndef ARCH_X86
5447 #define CABAC_ON_STACK
5448 #endif
5449 #ifdef CABAC_ON_STACK
5450 #define CC &cc
5451     CABACContext cc;
5452     cc.range     = h->cabac.range;
5453     cc.low       = h->cabac.low;
5454     cc.bytestream= h->cabac.bytestream;
5455 #else
5456 #define CC &h->cabac
5457 #endif
5458
5459
5460     /* cat: 0-> DC 16x16  n = 0
5461      *      1-> AC 16x16  n = luma4x4idx
5462      *      2-> Luma4x4   n = luma4x4idx
5463      *      3-> DC Chroma n = iCbCr
5464      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5465      *      5-> Luma8x8   n = 4 * luma8x8idx
5466      */
5467
5468     /* read coded block flag */
5469     if( is_dc || cat != 5 ) {
5470         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5471             if( !is_dc ) {
5472                 if( cat == 4 )
5473                     h->non_zero_count_cache[scan8[16+n]] = 0;
5474                 else
5475                     h->non_zero_count_cache[scan8[n]] = 0;
5476             }
5477
5478 #ifdef CABAC_ON_STACK
5479             h->cabac.range     = cc.range     ;
5480             h->cabac.low       = cc.low       ;
5481             h->cabac.bytestream= cc.bytestream;
5482 #endif
5483             return;
5484         }
5485     }
5486
5487     significant_coeff_ctx_base = h->cabac_state
5488         + significant_coeff_flag_offset[MB_FIELD][cat];
5489     last_coeff_ctx_base = h->cabac_state
5490         + last_coeff_flag_offset[MB_FIELD][cat];
5491     abs_level_m1_ctx_base = h->cabac_state
5492         + coeff_abs_level_m1_offset[cat];
5493
5494     if( !is_dc && cat == 5 ) {
5495 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5496         for(last= 0; last < coefs; last++) { \
5497             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5498             if( get_cabac( CC, sig_ctx )) { \
5499                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5500                 index[coeff_count++] = last; \
5501                 if( get_cabac( CC, last_ctx ) ) { \
5502                     last= max_coeff; \
5503                     break; \
5504                 } \
5505             } \
5506         }\
5507         if( last == max_coeff -1 ) {\
5508             index[coeff_count++] = last;\
5509         }
5510         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5511 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5512         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5513     } else {
5514         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5515 #else
5516         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5517     } else {
5518         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5519 #endif
5520     }
5521     assert(coeff_count > 0);
5522
5523     if( is_dc ) {
5524         if( cat == 0 )
5525             h->cbp_table[h->mb_xy] |= 0x100;
5526         else
5527             h->cbp_table[h->mb_xy] |= 0x40 << n;
5528     } else {
5529         if( cat == 5 )
5530             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5531         else if( cat == 4 )
5532             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5533         else {
5534             assert( cat == 1 || cat == 2 );
5535             h->non_zero_count_cache[scan8[n]] = coeff_count;
5536         }
5537     }
5538
5539     while( coeff_count-- ) {
5540         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5541
5542         int j= scantable[index[coeff_count]];
5543
5544         if( get_cabac( CC, ctx ) == 0 ) {
5545             node_ctx = coeff_abs_level_transition[0][node_ctx];
5546             if( is_dc ) {
5547                 block[j] = get_cabac_bypass_sign( CC, -1);
5548             }else{
5549                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5550             }
5551         } else {
5552             int coeff_abs = 2;
5553             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5554             node_ctx = coeff_abs_level_transition[1][node_ctx];
5555
5556             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5557                 coeff_abs++;
5558             }
5559
5560             if( coeff_abs >= 15 ) {
5561                 int j = 0;
5562                 while( get_cabac_bypass( CC ) ) {
5563                     j++;
5564                 }
5565
5566                 coeff_abs=1;
5567                 while( j-- ) {
5568                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5569                 }
5570                 coeff_abs+= 14;
5571             }
5572
5573             if( is_dc ) {
5574                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5575             }else{
5576                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5577             }
5578         }
5579     }
5580 #ifdef CABAC_ON_STACK
5581             h->cabac.range     = cc.range     ;
5582             h->cabac.low       = cc.low       ;
5583             h->cabac.bytestream= cc.bytestream;
5584 #endif
5585
5586 }
5587
5588 #ifndef CONFIG_SMALL
5589 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5590     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5591 }
5592
5593 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5594     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5595 }
5596 #endif
5597
5598 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5599 #ifdef CONFIG_SMALL
5600     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5601 #else
5602     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5603     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5604 #endif
5605 }
5606
5607 static inline void compute_mb_neighbors(H264Context *h)
5608 {
5609     MpegEncContext * const s = &h->s;
5610     const int mb_xy  = h->mb_xy;
5611     h->top_mb_xy     = mb_xy - s->mb_stride;
5612     h->left_mb_xy[0] = mb_xy - 1;
5613     if(FRAME_MBAFF){
5614         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5615         const int top_pair_xy      = pair_xy     - s->mb_stride;
5616         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5617         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5618         const int curr_mb_frame_flag = !MB_FIELD;
5619         const int bottom = (s->mb_y & 1);
5620         if (bottom
5621                 ? !curr_mb_frame_flag // bottom macroblock
5622                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5623                 ) {
5624             h->top_mb_xy -= s->mb_stride;
5625         }
5626         if (left_mb_frame_flag != curr_mb_frame_flag) {
5627             h->left_mb_xy[0] = pair_xy - 1;
5628         }
5629     } else if (FIELD_PICTURE) {
5630         h->top_mb_xy -= s->mb_stride;
5631     }
5632     return;
5633 }
5634
5635 /**
5636  * decodes a macroblock
5637  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5638  */
5639 static int decode_mb_cabac(H264Context *h) {
5640     MpegEncContext * const s = &h->s;
5641     int mb_xy;
5642     int mb_type, partition_count, cbp = 0;
5643     int dct8x8_allowed= h->pps.transform_8x8_mode;
5644
5645     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5646
5647     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5648
5649     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5650     if( h->slice_type_nos != FF_I_TYPE ) {
5651         int skip;
5652         /* a skipped mb needs the aff flag from the following mb */
5653         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5654             predict_field_decoding_flag(h);
5655         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5656             skip = h->next_mb_skipped;
5657         else
5658             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5659         /* read skip flags */
5660         if( skip ) {
5661             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5662                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5663                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5664                 if(h->next_mb_skipped)
5665                     predict_field_decoding_flag(h);
5666                 else
5667                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5668             }
5669
5670             decode_mb_skip(h);
5671
5672             h->cbp_table[mb_xy] = 0;
5673             h->chroma_pred_mode_table[mb_xy] = 0;
5674             h->last_qscale_diff = 0;
5675
5676             return 0;
5677
5678         }
5679     }
5680     if(FRAME_MBAFF){
5681         if( (s->mb_y&1) == 0 )
5682             h->mb_mbaff =
5683             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5684     }else
5685         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5686
5687     h->prev_mb_skipped = 0;
5688
5689     compute_mb_neighbors(h);
5690     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5691         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5692         return -1;
5693     }
5694
5695     if( h->slice_type_nos == FF_B_TYPE ) {
5696         if( mb_type < 23 ){
5697             partition_count= b_mb_type_info[mb_type].partition_count;
5698             mb_type=         b_mb_type_info[mb_type].type;
5699         }else{
5700             mb_type -= 23;
5701             goto decode_intra_mb;
5702         }
5703     } else if( h->slice_type_nos == FF_P_TYPE ) {
5704         if( mb_type < 5) {
5705             partition_count= p_mb_type_info[mb_type].partition_count;
5706             mb_type=         p_mb_type_info[mb_type].type;
5707         } else {
5708             mb_type -= 5;
5709             goto decode_intra_mb;
5710         }
5711     } else {
5712         if(h->slice_type == FF_SI_TYPE && mb_type)
5713             mb_type--;
5714         assert(h->slice_type_nos == FF_I_TYPE);
5715 decode_intra_mb:
5716         partition_count = 0;
5717         cbp= i_mb_type_info[mb_type].cbp;
5718         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5719         mb_type= i_mb_type_info[mb_type].type;
5720     }
5721     if(MB_FIELD)
5722         mb_type |= MB_TYPE_INTERLACED;
5723
5724     h->slice_table[ mb_xy ]= h->slice_num;
5725
5726     if(IS_INTRA_PCM(mb_type)) {
5727         const uint8_t *ptr;
5728         unsigned int x, y;
5729
5730         // We assume these blocks are very rare so we do not optimize it.
5731         // FIXME The two following lines get the bitstream position in the cabac
5732         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5733         ptr= h->cabac.bytestream;
5734         if(h->cabac.low&0x1) ptr--;
5735         if(CABAC_BITS==16){
5736             if(h->cabac.low&0x1FF) ptr--;
5737         }
5738
5739         // The pixels are stored in the same order as levels in h->mb array.
5740         for(y=0; y<16; y++){
5741             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5742             for(x=0; x<16; x++){
5743                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5744                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5745             }
5746         }
5747         for(y=0; y<8; y++){
5748             const int index= 256 + 4*(y&3) + 32*(y>>2);
5749             for(x=0; x<8; x++){
5750                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5751                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5752             }
5753         }
5754         for(y=0; y<8; y++){
5755             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5756             for(x=0; x<8; x++){
5757                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5758                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5759             }
5760         }
5761
5762         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5763
5764         // All blocks are present
5765         h->cbp_table[mb_xy] = 0x1ef;
5766         h->chroma_pred_mode_table[mb_xy] = 0;
5767         // In deblocking, the quantizer is 0
5768         s->current_picture.qscale_table[mb_xy]= 0;
5769         // All coeffs are present
5770         memset(h->non_zero_count[mb_xy], 16, 16);
5771         s->current_picture.mb_type[mb_xy]= mb_type;
5772         h->last_qscale_diff = 0;
5773         return 0;
5774     }
5775
5776     if(MB_MBAFF){
5777         h->ref_count[0] <<= 1;
5778         h->ref_count[1] <<= 1;
5779     }
5780
5781     fill_caches(h, mb_type, 0);
5782
5783     if( IS_INTRA( mb_type ) ) {
5784         int i, pred_mode;
5785         if( IS_INTRA4x4( mb_type ) ) {
5786             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5787                 mb_type |= MB_TYPE_8x8DCT;
5788                 for( i = 0; i < 16; i+=4 ) {
5789                     int pred = pred_intra_mode( h, i );
5790                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5791                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5792                 }
5793             } else {
5794                 for( i = 0; i < 16; i++ ) {
5795                     int pred = pred_intra_mode( h, i );
5796                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5797
5798                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5799                 }
5800             }
5801             write_back_intra_pred_mode(h);
5802             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5803         } else {
5804             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5805             if( h->intra16x16_pred_mode < 0 ) return -1;
5806         }
5807         h->chroma_pred_mode_table[mb_xy] =
5808         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5809
5810         pred_mode= check_intra_pred_mode( h, pred_mode );
5811         if( pred_mode < 0 ) return -1;
5812         h->chroma_pred_mode= pred_mode;
5813     } else if( partition_count == 4 ) {
5814         int i, j, sub_partition_count[4], list, ref[2][4];
5815
5816         if( h->slice_type_nos == FF_B_TYPE ) {
5817             for( i = 0; i < 4; i++ ) {
5818                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5819                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5820                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5821             }
5822             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5823                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5824                 pred_direct_motion(h, &mb_type);
5825                 h->ref_cache[0][scan8[4]] =
5826                 h->ref_cache[1][scan8[4]] =
5827                 h->ref_cache[0][scan8[12]] =
5828                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5829                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5830                     for( i = 0; i < 4; i++ )
5831                         if( IS_DIRECT(h->sub_mb_type[i]) )
5832                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5833                 }
5834             }
5835         } else {
5836             for( i = 0; i < 4; i++ ) {
5837                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5838                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5839                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5840             }
5841         }
5842
5843         for( list = 0; list < h->list_count; list++ ) {
5844                 for( i = 0; i < 4; i++ ) {
5845                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5846                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5847                         if( h->ref_count[list] > 1 )
5848                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5849                         else
5850                             ref[list][i] = 0;
5851                     } else {
5852                         ref[list][i] = -1;
5853                     }
5854                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5855                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5856                 }
5857         }
5858
5859         if(dct8x8_allowed)
5860             dct8x8_allowed = get_dct8x8_allowed(h);
5861
5862         for(list=0; list<h->list_count; list++){
5863             for(i=0; i<4; i++){
5864                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5865                 if(IS_DIRECT(h->sub_mb_type[i])){
5866                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5867                     continue;
5868                 }
5869
5870                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5871                     const int sub_mb_type= h->sub_mb_type[i];
5872                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5873                     for(j=0; j<sub_partition_count[i]; j++){
5874                         int mpx, mpy;
5875                         int mx, my;
5876                         const int index= 4*i + block_width*j;
5877                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5878                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5879                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5880
5881                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5882                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5883                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5884
5885                         if(IS_SUB_8X8(sub_mb_type)){
5886                             mv_cache[ 1 ][0]=
5887                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5888                             mv_cache[ 1 ][1]=
5889                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5890
5891                             mvd_cache[ 1 ][0]=
5892                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5893                             mvd_cache[ 1 ][1]=
5894                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5895                         }else if(IS_SUB_8X4(sub_mb_type)){
5896                             mv_cache[ 1 ][0]= mx;
5897                             mv_cache[ 1 ][1]= my;
5898
5899                             mvd_cache[ 1 ][0]= mx - mpx;
5900                             mvd_cache[ 1 ][1]= my - mpy;
5901                         }else if(IS_SUB_4X8(sub_mb_type)){
5902                             mv_cache[ 8 ][0]= mx;
5903                             mv_cache[ 8 ][1]= my;
5904
5905                             mvd_cache[ 8 ][0]= mx - mpx;
5906                             mvd_cache[ 8 ][1]= my - mpy;
5907                         }
5908                         mv_cache[ 0 ][0]= mx;
5909                         mv_cache[ 0 ][1]= my;
5910
5911                         mvd_cache[ 0 ][0]= mx - mpx;
5912                         mvd_cache[ 0 ][1]= my - mpy;
5913                     }
5914                 }else{
5915                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5916                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5917                     p[0] = p[1] = p[8] = p[9] = 0;
5918                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5919                 }
5920             }
5921         }
5922     } else if( IS_DIRECT(mb_type) ) {
5923         pred_direct_motion(h, &mb_type);
5924         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5925         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5926         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5927     } else {
5928         int list, mx, my, i, mpx, mpy;
5929         if(IS_16X16(mb_type)){
5930             for(list=0; list<h->list_count; list++){
5931                 if(IS_DIR(mb_type, 0, list)){
5932                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5933                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5934                 }else
5935                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5936             }
5937             for(list=0; list<h->list_count; list++){
5938                 if(IS_DIR(mb_type, 0, list)){
5939                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5940
5941                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5942                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5943                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5944
5945                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5946                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5947                 }else
5948                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5949             }
5950         }
5951         else if(IS_16X8(mb_type)){
5952             for(list=0; list<h->list_count; list++){
5953                     for(i=0; i<2; i++){
5954                         if(IS_DIR(mb_type, i, list)){
5955                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5956                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5957                         }else
5958                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5959                     }
5960             }
5961             for(list=0; list<h->list_count; list++){
5962                 for(i=0; i<2; i++){
5963                     if(IS_DIR(mb_type, i, list)){
5964                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5965                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5966                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5967                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5968
5969                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5970                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5971                     }else{
5972                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5973                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5974                     }
5975                 }
5976             }
5977         }else{
5978             assert(IS_8X16(mb_type));
5979             for(list=0; list<h->list_count; list++){
5980                     for(i=0; i<2; i++){
5981                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5982                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5983                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5984                         }else
5985                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5986                     }
5987             }
5988             for(list=0; list<h->list_count; list++){
5989                 for(i=0; i<2; i++){
5990                     if(IS_DIR(mb_type, i, list)){
5991                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5992                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5993                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5994
5995                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5996                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5997                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5998                     }else{
5999                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6000                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6001                     }
6002                 }
6003             }
6004         }
6005     }
6006
6007    if( IS_INTER( mb_type ) ) {
6008         h->chroma_pred_mode_table[mb_xy] = 0;
6009         write_back_motion( h, mb_type );
6010    }
6011
6012     if( !IS_INTRA16x16( mb_type ) ) {
6013         cbp  = decode_cabac_mb_cbp_luma( h );
6014         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6015     }
6016
6017     h->cbp_table[mb_xy] = h->cbp = cbp;
6018
6019     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6020         if( decode_cabac_mb_transform_size( h ) )
6021             mb_type |= MB_TYPE_8x8DCT;
6022     }
6023     s->current_picture.mb_type[mb_xy]= mb_type;
6024
6025     if( cbp || IS_INTRA16x16( mb_type ) ) {
6026         const uint8_t *scan, *scan8x8, *dc_scan;
6027         const uint32_t *qmul;
6028         int dqp;
6029
6030         if(IS_INTERLACED(mb_type)){
6031             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6032             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6033             dc_scan= luma_dc_field_scan;
6034         }else{
6035             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6036             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6037             dc_scan= luma_dc_zigzag_scan;
6038         }
6039
6040         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6041         if( dqp == INT_MIN ){
6042             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6043             return -1;
6044         }
6045         s->qscale += dqp;
6046         if(((unsigned)s->qscale) > 51){
6047             if(s->qscale<0) s->qscale+= 52;
6048             else            s->qscale-= 52;
6049         }
6050         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6051         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6052
6053         if( IS_INTRA16x16( mb_type ) ) {
6054             int i;
6055             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6056             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6057
6058             if( cbp&15 ) {
6059                 qmul = h->dequant4_coeff[0][s->qscale];
6060                 for( i = 0; i < 16; i++ ) {
6061                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6062                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6063                 }
6064             } else {
6065                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6066             }
6067         } else {
6068             int i8x8, i4x4;
6069             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6070                 if( cbp & (1<<i8x8) ) {
6071                     if( IS_8x8DCT(mb_type) ) {
6072                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6073                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6074                     } else {
6075                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6076                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6077                             const int index = 4*i8x8 + i4x4;
6078                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6079 //START_TIMER
6080                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6081 //STOP_TIMER("decode_residual")
6082                         }
6083                     }
6084                 } else {
6085                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6086                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6087                 }
6088             }
6089         }
6090
6091         if( cbp&0x30 ){
6092             int c;
6093             for( c = 0; c < 2; c++ ) {
6094                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6095                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6096             }
6097         }
6098
6099         if( cbp&0x20 ) {
6100             int c, i;
6101             for( c = 0; c < 2; c++ ) {
6102                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6103                 for( i = 0; i < 4; i++ ) {
6104                     const int index = 16 + 4 * c + i;
6105                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6106                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6107                 }
6108             }
6109         } else {
6110             uint8_t * const nnz= &h->non_zero_count_cache[0];
6111             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6112             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6113         }
6114     } else {
6115         uint8_t * const nnz= &h->non_zero_count_cache[0];
6116         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6117         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6118         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6119         h->last_qscale_diff = 0;
6120     }
6121
6122     s->current_picture.qscale_table[mb_xy]= s->qscale;
6123     write_back_non_zero_count(h);
6124
6125     if(MB_MBAFF){
6126         h->ref_count[0] >>= 1;
6127         h->ref_count[1] >>= 1;
6128     }
6129
6130     return 0;
6131 }
6132
6133
6134 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6135     int i, d;
6136     const int index_a = qp + h->slice_alpha_c0_offset;
6137     const int alpha = (alpha_table+52)[index_a];
6138     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6139
6140     if( bS[0] < 4 ) {
6141         int8_t tc[4];
6142         for(i=0; i<4; i++)
6143             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6144         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6145     } else {
6146         /* 16px edge length, because bS=4 is triggered by being at
6147          * the edge of an intra MB, so all 4 bS are the same */
6148             for( d = 0; d < 16; d++ ) {
6149                 const int p0 = pix[-1];
6150                 const int p1 = pix[-2];
6151                 const int p2 = pix[-3];
6152
6153                 const int q0 = pix[0];
6154                 const int q1 = pix[1];
6155                 const int q2 = pix[2];
6156
6157                 if( FFABS( p0 - q0 ) < alpha &&
6158                     FFABS( p1 - p0 ) < beta &&
6159                     FFABS( q1 - q0 ) < beta ) {
6160
6161                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6162                         if( FFABS( p2 - p0 ) < beta)
6163                         {
6164                             const int p3 = pix[-4];
6165                             /* p0', p1', p2' */
6166                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6167                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6168                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6169                         } else {
6170                             /* p0' */
6171                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6172                         }
6173                         if( FFABS( q2 - q0 ) < beta)
6174                         {
6175                             const int q3 = pix[3];
6176                             /* q0', q1', q2' */
6177                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6178                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6179                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6180                         } else {
6181                             /* q0' */
6182                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6183                         }
6184                     }else{
6185                         /* p0', q0' */
6186                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6187                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6188                     }
6189                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6190                 }
6191                 pix += stride;
6192             }
6193     }
6194 }
6195 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6196     int i;
6197     const int index_a = qp + h->slice_alpha_c0_offset;
6198     const int alpha = (alpha_table+52)[index_a];
6199     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6200
6201     if( bS[0] < 4 ) {
6202         int8_t tc[4];
6203         for(i=0; i<4; i++)
6204             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6205         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6206     } else {
6207         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6208     }
6209 }
6210
6211 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6212     int i;
6213     for( i = 0; i < 16; i++, pix += stride) {
6214         int index_a;
6215         int alpha;
6216         int beta;
6217
6218         int qp_index;
6219         int bS_index = (i >> 1);
6220         if (!MB_FIELD) {
6221             bS_index &= ~1;
6222             bS_index |= (i & 1);
6223         }
6224
6225         if( bS[bS_index] == 0 ) {
6226             continue;
6227         }
6228
6229         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6230         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6231         alpha = (alpha_table+52)[index_a];
6232         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6233
6234         if( bS[bS_index] < 4 ) {
6235             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6236             const int p0 = pix[-1];
6237             const int p1 = pix[-2];
6238             const int p2 = pix[-3];
6239             const int q0 = pix[0];
6240             const int q1 = pix[1];
6241             const int q2 = pix[2];
6242
6243             if( FFABS( p0 - q0 ) < alpha &&
6244                 FFABS( p1 - p0 ) < beta &&
6245                 FFABS( q1 - q0 ) < beta ) {
6246                 int tc = tc0;
6247                 int i_delta;
6248
6249                 if( FFABS( p2 - p0 ) < beta ) {
6250                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6251                     tc++;
6252                 }
6253                 if( FFABS( q2 - q0 ) < beta ) {
6254                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6255                     tc++;
6256                 }
6257
6258                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6259                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6260                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6261                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6262             }
6263         }else{
6264             const int p0 = pix[-1];
6265             const int p1 = pix[-2];
6266             const int p2 = pix[-3];
6267
6268             const int q0 = pix[0];
6269             const int q1 = pix[1];
6270             const int q2 = pix[2];
6271
6272             if( FFABS( p0 - q0 ) < alpha &&
6273                 FFABS( p1 - p0 ) < beta &&
6274                 FFABS( q1 - q0 ) < beta ) {
6275
6276                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6277                     if( FFABS( p2 - p0 ) < beta)
6278                     {
6279                         const int p3 = pix[-4];
6280                         /* p0', p1', p2' */
6281                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6282                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6283                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6284                     } else {
6285                         /* p0' */
6286                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6287                     }
6288                     if( FFABS( q2 - q0 ) < beta)
6289                     {
6290                         const int q3 = pix[3];
6291                         /* q0', q1', q2' */
6292                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6293                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6294                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6295                     } else {
6296                         /* q0' */
6297                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6298                     }
6299                 }else{
6300                     /* p0', q0' */
6301                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6302                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6303                 }
6304                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6305             }
6306         }
6307     }
6308 }
6309 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6310     int i;
6311     for( i = 0; i < 8; i++, pix += stride) {
6312         int index_a;
6313         int alpha;
6314         int beta;
6315
6316         int qp_index;
6317         int bS_index = i;
6318
6319         if( bS[bS_index] == 0 ) {
6320             continue;
6321         }
6322
6323         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6324         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6325         alpha = (alpha_table+52)[index_a];
6326         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6327
6328         if( bS[bS_index] < 4 ) {
6329             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6330             const int p0 = pix[-1];
6331             const int p1 = pix[-2];
6332             const int q0 = pix[0];
6333             const int q1 = pix[1];
6334
6335             if( FFABS( p0 - q0 ) < alpha &&
6336                 FFABS( p1 - p0 ) < beta &&
6337                 FFABS( q1 - q0 ) < beta ) {
6338                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6339
6340                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6341                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6342                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6343             }
6344         }else{
6345             const int p0 = pix[-1];
6346             const int p1 = pix[-2];
6347             const int q0 = pix[0];
6348             const int q1 = pix[1];
6349
6350             if( FFABS( p0 - q0 ) < alpha &&
6351                 FFABS( p1 - p0 ) < beta &&
6352                 FFABS( q1 - q0 ) < beta ) {
6353
6354                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6355                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6356                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6357             }
6358         }
6359     }
6360 }
6361
6362 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6363     int i, d;
6364     const int index_a = qp + h->slice_alpha_c0_offset;
6365     const int alpha = (alpha_table+52)[index_a];
6366     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6367     const int pix_next  = stride;
6368
6369     if( bS[0] < 4 ) {
6370         int8_t tc[4];
6371         for(i=0; i<4; i++)
6372             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6373         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6374     } else {
6375         /* 16px edge length, see filter_mb_edgev */
6376             for( d = 0; d < 16; d++ ) {
6377                 const int p0 = pix[-1*pix_next];
6378                 const int p1 = pix[-2*pix_next];
6379                 const int p2 = pix[-3*pix_next];
6380                 const int q0 = pix[0];
6381                 const int q1 = pix[1*pix_next];
6382                 const int q2 = pix[2*pix_next];
6383
6384                 if( FFABS( p0 - q0 ) < alpha &&
6385                     FFABS( p1 - p0 ) < beta &&
6386                     FFABS( q1 - q0 ) < beta ) {
6387
6388                     const int p3 = pix[-4*pix_next];
6389                     const int q3 = pix[ 3*pix_next];
6390
6391                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6392                         if( FFABS( p2 - p0 ) < beta) {
6393                             /* p0', p1', p2' */
6394                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6395                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6396                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6397                         } else {
6398                             /* p0' */
6399                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6400                         }
6401                         if( FFABS( q2 - q0 ) < beta) {
6402                             /* q0', q1', q2' */
6403                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6404                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6405                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6406                         } else {
6407                             /* q0' */
6408                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6409                         }
6410                     }else{
6411                         /* p0', q0' */
6412                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6413                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6414                     }
6415                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6416                 }
6417                 pix++;
6418             }
6419     }
6420 }
6421
/**
 * Filter one chroma edge with the h264_v_loop_filter_chroma* DSP routines.
 *
 * alpha and beta are looked up in alpha_table/beta_table (both addressed
 * through a +52 offset) from qp plus the slice alpha_c0 / beta offsets.
 * Only bS[0] selects the code path: a value of 4 or more uses the "intra"
 * routine, which takes no per-sample clipping values; otherwise a tc value
 * is derived for each of the four bS entries (0 when the entry is 0).
 *
 * @param h      decoder context (slice offsets and DSP function pointers)
 * @param pix    first sample of the edge to filter
 * @param stride stride handed to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     QP used for the table lookups
 */
6422 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6423     const int idx_alpha = qp + h->slice_alpha_c0_offset;
6424     const int idx_beta  = qp + h->slice_beta_offset;
6425     const int alpha = (alpha_table+52)[idx_alpha];
6426     const int beta  = (beta_table+52)[idx_beta];
6427     int8_t tc[4];
6428     int n;
     /* strongest filtering: no tc values needed */
6429     if( bS[0] >= 4 ) {
6430         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6431         return;
6432     }
     /* tc0 table entry + 1 for every non-zero strength */
6433     for( n = 0; n < 4; n++ )
6434         tc[n] = bS[n] == 0 ? 0 : (tc0_table+52)[idx_alpha][bS[n] - 1] + 1;
6435     h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6436 }
6437
/**
 * Deblock one macroblock through the shortcut path that relies on
 * s->dsp.h264_loop_filter_strength() to compute the boundary strengths.
 *
 * NOTE: the guard at the top currently contains a literal "1 ||", so its
 * condition is always true: every call is forwarded to filter_mb() and
 * nothing after that first return is executed as the code stands.
 *
 * Without that term the guard would fall back to filter_mb() when the MB is
 * in column 0 or in the first row (mb_y == mb_y_firstrow), when no
 * h264_loop_filter_strength implementation is set, when pps.chroma_qp_diff
 * is non-zero, or when deblocking_filter == 2 and the top or left MB belongs
 * to a different slice.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this MB; edges are addressed from here in
 *                   steps of 4 samples (columns) or 4*linesize (rows)
 * @param img_cb     Cb samples of this MB, addressed in steps of 2
 * @param img_cr     Cr samples of this MB, addressed in steps of 2
 * @param linesize   luma stride passed on to the edge filters
 * @param uvlinesize chroma stride passed on to the edge filters
 */
6438 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6439     MpegEncContext * const s = &h->s;
         /* 1 for bottom-field pictures, 0 otherwise */
6440     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6441     int mb_xy, mb_type;
6442     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6443
6444     mb_xy = h->mb_xy;
6445
6446     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
     /* constant-true term: forces the filter_mb() fallback for every macroblock */
6447 1 ||
6448        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6449                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6450         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6451         return;
6452     }
6453     assert(!FRAME_MBAFF);
6454
         /* QP of this MB (qp), of its left neighbour (qp0) and of its top
          * neighbour (qp1); chroma QPs all come from get_chroma_qp(h, 0, ...) */
6455     mb_type = s->current_picture.mb_type[mb_xy];
6456     qp = s->current_picture.qscale_table[mb_xy];
6457     qp0 = s->current_picture.qscale_table[mb_xy-1];
6458     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6459     qpc = get_chroma_qp( h, 0, qp );
6460     qpc0 = get_chroma_qp( h, 0, qp0 );
6461     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* the outer edges use the rounded average of both macroblocks' QPs */
6462     qp0 = (qp + qp0 + 1) >> 1;
6463     qp1 = (qp + qp1 + 1) >> 1;
6464     qpc0 = (qpc + qpc0 + 1) >> 1;
6465     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* early out when all six QPs are at or below the threshold
          * (same idea as the qp_thresh test in filter_mb()) */
6466     qp_thresh = 15 - h->slice_alpha_c0_offset;
6467     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6468        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6469         return;
6470
         /* Intra MB: fixed strengths. Left edge uses 4, top edge uses bSH
          * (3 in field pictures, otherwise 4), every inner edge uses 3.
          * With the 8x8 transform only luma edges 0 and 2 are filtered. */
6471     if( IS_INTRA(mb_type) ) {
6472         int16_t bS4[4] = {4,4,4,4};
6473         int16_t bS3[4] = {3,3,3,3};
6474         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6475         if( IS_8x8DCT(mb_type) ) {
6476             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6477             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6478             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6479             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6480         } else {
6481             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6482             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6483             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6484             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6485             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6486             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6487             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6488             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6489         }
             /* chroma: only edges 0 and 2 exist for each plane and direction */
6490         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6491         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6492         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6493         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6494         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6495         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6496         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6497         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6498         return;
6499     } else {
             /* bS[dir][edge][i]: dir 0 is used with the "v" filters, dir 1 with
              * the "h" filters. bSv views the four int16 strengths of one edge
              * as a single 64-bit word so they can be set or tested at once. */
6500         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6501         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6502         int edges;
             /* 8x8 transform with cbp bits 0..2 all set: strength 2 on edges 0
              * and 2 of both directions, no call to h264_loop_filter_strength */
6503         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6504             edges = 4;
6505             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6506         } else {
6507             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6508                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6509             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6510                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6511                              ? 3 : 0;
6512             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* a 16x16 MB with none of the low four cbp bits set only has
                  * its outer edges examined */
6513             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6514             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6515                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6516         }
             /* an intra neighbour overrides the outer-edge strengths */
6517         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6518             bSv[0][0] = 0x0004000400040004ULL;
6519         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6520             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6521
     /* FILTER(hv,dir,edge): if any strength of the edge is non-zero, filter luma
      * and, for even edges, both chroma planes. Edge 0 uses the averaged QP
      * (qp0/qpc0 for dir 0, qp1/qpc1 for dir 1), inner edges use qp/qpc. */
6522 #define FILTER(hv,dir,edge)\
6523         if(bSv[dir][edge]) {\
6524             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6525             if(!(edge&1)) {\
6526                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6527                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6528             }\
6529         }
6530         if( edges == 1 ) {
6531             FILTER(v,0,0);
6532             FILTER(h,1,0);
6533         } else if( IS_8x8DCT(mb_type) ) {
6534             FILTER(v,0,0);
6535             FILTER(v,0,2);
6536             FILTER(h,1,0);
6537             FILTER(h,1,2);
6538         } else {
6539             FILTER(v,0,0);
6540             FILTER(v,0,1);
6541             FILTER(v,0,2);
6542             FILTER(v,0,3);
6543             FILTER(h,1,0);
6544             FILTER(h,1,1);
6545             FILTER(h,1,2);
6546             FILTER(h,1,3);
6547         }
6548 #undef FILTER
6549     }
6550 }
6551
/**
 * Deblock every edge of one macroblock (general path).
 *
 * In code order the function:
 *  1. returns early, for non-MBAFF pictures, when the MB's QP and its
 *     averages with the left and top neighbours are all <= qp_thresh;
 *  2. for CAVLC streams with transform_8x8_mode, overwrites entries of
 *     h->non_zero_count_cache belonging to 8x8-DCT macroblocks (top, left,
 *     current) with bits taken from h->cbp_table;
 *  3. in MBAFF frames, filters the first vertical edge separately (8 bS
 *     values, two QPs) when the left pair has a different interlaced type;
 *  4. for dir 0 (filter_mb_edgev / edgecv) and then dir 1 (filter_mb_edgeh /
 *     edgech), derives four bS values per edge and filters luma plus, for
 *     even edges, both chroma planes.
 *
 * Side effect: step 2 modifies h->non_zero_count_cache.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column (mb_xy = mb_x + mb_y*s->mb_stride)
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this MB; edge n is at offset 4*n
 *                   (dir 0) or 4*n*linesize (dir 1)
 * @param img_cb     Cb samples of this MB; edge n is at offset 2*n or
 *                   2*n*uvlinesize
 * @param img_cr     Cr samples of this MB, addressed like img_cb
 * @param linesize   luma stride
 * @param uvlinesize chroma stride
 */
6552 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6553     MpegEncContext * const s = &h->s;
6554     const int mb_xy= mb_x + mb_y*s->mb_stride;
6555     const int mb_type = s->current_picture.mb_type[mb_xy];
         /* vertical MV difference that triggers bS 1: 2 for interlaced MBs, else 4 */
6556     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6557     int first_vertical_edge_done = 0;
6558     int dir;
6559
6560     //for sufficiently low qp, filtering wouldn't do anything
6561     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6562     if(!FRAME_MBAFF){
6563         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6564         int qp = s->current_picture.qscale_table[mb_xy];
6565         if(qp <= qp_thresh
6566            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6567            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6568             return;
6569         }
6570     }
6571
6572     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6573     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6574         int top_type, left_type[2];
6575         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6576         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6577         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6578
             /* neighbour entries: replaced by the neighbour's cbp bit for the
              * adjacent 8x8 block (non-zero means "has coefficients") */
6579         if(IS_8x8DCT(top_type)){
6580             h->non_zero_count_cache[4+8*0]=
6581             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6582             h->non_zero_count_cache[6+8*0]=
6583             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6584         }
6585         if(IS_8x8DCT(left_type[0])){
6586             h->non_zero_count_cache[3+8*1]=
6587             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6588         }
6589         if(IS_8x8DCT(left_type[1])){
6590             h->non_zero_count_cache[3+8*3]=
6591             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6592         }
6593
             /* current MB: each group of four scan8 entries gets one cbp bit */
6594         if(IS_8x8DCT(mb_type)){
6595             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6596             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6597
6598             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6599             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6600
6601             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6602             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6603
6604             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6605             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6606         }
6607     }
6608
6609     if (FRAME_MBAFF
6610             // left mb is in picture
6611             && h->slice_table[mb_xy-1] != 255
6612             // and current and left pair do not have the same interlaced type
6613             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6614             // and left mb is in the same slice if deblocking_filter == 2
6615             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6616         /* First vertical edge is different in MBAFF frames
6617          * There are 8 different bS to compute and 2 different Qp
6618          */
6619         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6620         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6621         int16_t bS[8];
6622         int qp[2];
6623         int bqp[2];
6624         int rqp[2];
6625         int mb_qp, mbn0_qp, mbn1_qp;
6626         int i;
             /* makes the dir 0 pass below start at edge 1 */
6627         first_vertical_edge_done = 1;
6628
6629         if( IS_INTRA(mb_type) )
6630             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6631         else {
6632             for( i = 0; i < 8; i++ ) {
6633                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6634
6635                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6636                     bS[i] = 4;
6637                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6638                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6639                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6640                     bS[i] = 2;
6641                 else
6642                     bS[i] = 1;
6643             }
6644         }
6645
             /* one averaged QP per left macroblock of the pair, for luma (qp),
              * Cb (bqp) and Cr (rqp) */
6646         mb_qp = s->current_picture.qscale_table[mb_xy];
6647         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6648         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6649         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6650         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6651                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6652         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6653                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6654         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6655         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6656                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6657         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6658                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6659
6660         /* Filter edge */
6661         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6662         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6663         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6664         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6665         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6666     }
6667     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6668     for( dir = 0; dir < 2; dir++ )
6669     {
6670         int edge;
             /* mbm_xy: macroblock on the other side of edge 0 (left or top) */
6671         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6672         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6673         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6674         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
             /* start == 1 skips edge 0; 255 in slice_table marks a neighbour
              * that is not available */
6675         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6676
             /* macroblocks flagged both 16x16 and SKIP only get edge 0 examined */
6677         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6678                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6679         // how often to recheck mv-based bS when iterating between edges
6680         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6681                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6682         // how often to recheck mv-based bS when iterating along each edge
6683         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6684
             /* the MBAFF block above already handled the first vertical edge */
6685         if (first_vertical_edge_done) {
6686             start = 1;
6687             first_vertical_edge_done = 0;
6688         }
6689
             /* deblocking_filter == 2: do not filter across slice boundaries */
6690         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6691             start = 1;
6692
6693         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6694             && !IS_INTERLACED(mb_type)
6695             && IS_INTERLACED(mbm_type)
6696             ) {
6697             // This is a special case in the norm where the filtering must
6698             // be done twice (one each of the field) even if we are in a
6699             // frame macroblock.
6700             //
6701             static const int nnz_idx[4] = {4,5,6,3};
6702             unsigned int tmp_linesize   = 2 *   linesize;
6703             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6704             int mbn_xy = mb_xy - 2 * s->mb_stride;
6705             int qp;
6706             int i, j;
6707             int16_t bS[4];
6708
                 /* j selects the row offset (0 or 1 line) and, through mbn_xy,
                  * one of the two macroblocks of the pair above */
6709             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6710                 if( IS_INTRA(mb_type) ||
6711                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6712                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6713                 } else {
6714                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6715                     for( i = 0; i < 4; i++ ) {
6716                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6717                             mbn_nnz[nnz_idx[i]] != 0 )
6718                             bS[i] = 2;
6719                         else
6720                             bS[i] = 1;
6721                     }
6722                 }
6723                 // Do not use s->qscale as luma quantizer because it has not the same
6724                 // value in IPCM macroblocks.
6725                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
                     /* "edge" has not been assigned yet at this point (the loop
                      * that sets it comes later); this block filters the MB's
                      * top edge, so report edge 0 explicitly instead of reading
                      * an indeterminate value when tprintf is compiled in */
6726                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, 0, qp, tmp_linesize, tmp_uvlinesize);
6727                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6728                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6729                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6730                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6731                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6732                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6733             }
6734
                 /* edge 0 is done; the regular loop continues with edge 1 */
6735             start = 1;
6736         }
6737
6738         /* Calculate bS */
6739         for( edge = start; edge < edges; edge++ ) {
6740             /* mbn_xy: neighbor macroblock */
6741             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6742             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6743             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6744             int16_t bS[4];
6745             int qp;
6746
                 /* with the 8x8 transform, odd edges are not filtered */
6747             if( (edge&1) && IS_8x8DCT(mb_type) )
6748                 continue;
6749
6750             if( IS_INTRA(mb_type) ||
6751                 IS_INTRA(mbn_type) ) {
                     /* intra on either side: 4 on qualifying outer edges,
                      * 3 everywhere else */
6752                 int value;
6753                 if (edge == 0) {
6754                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6755                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6756                     ) {
6757                         value = 4;
6758                     } else {
6759                         value = 3;
6760                     }
6761                 } else {
6762                     value = 3;
6763                 }
6764                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6765             } else {
6766                 int i, l;
6767                 int mv_done;
6768
                     /* first try to settle the motion-based strength for the
                      * whole edge at once; mv_done == 0 defers it to the
                      * per-position loop below */
6769                 if( edge & mask_edge ) {
6770                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6771                     mv_done = 1;
6772                 }
6773                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6774                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6775                     mv_done = 1;
6776                 }
6777                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6778                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6779                     int bn_idx= b_idx - (dir ? 8:1);
6780                     int v = 0;
6781
6782                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6783                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6784                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6785                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6786                     }
6787
                         /* B slices: on a mismatch, compare again with the
                          * neighbour's two lists swapped (l against 1-l) */
6788                     if(h->slice_type_nos == FF_B_TYPE && v){
6789                         v=0;
6790                         for( l = 0; !v && l < 2; l++ ) {
6791                             int ln= 1-l;
6792                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6793                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6794                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6795                         }
6796                     }
6797
6798                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6799                     mv_done = 1;
6800                 }
6801                 else
6802                     mv_done = 0;
6803
                     /* per position: coefficients on either side give 2;
                      * otherwise keep the edge-wide value or compute the
                      * motion-based value here */
6804                 for( i = 0; i < 4; i++ ) {
6805                     int x = dir == 0 ? edge : i;
6806                     int y = dir == 0 ? i    : edge;
6807                     int b_idx= 8 + 4 + x + 8*y;
6808                     int bn_idx= b_idx - (dir ? 8:1);
6809
6810                     if( h->non_zero_count_cache[b_idx] != 0 ||
6811                         h->non_zero_count_cache[bn_idx] != 0 ) {
6812                         bS[i] = 2;
6813                     }
6814                     else if(!mv_done)
6815                     {
6816                         bS[i] = 0;
6817                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6818                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6819                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6820                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6821                                 bS[i] = 1;
6822                                 break;
6823                             }
6824                         }
6825
6826                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6827                             bS[i] = 0;
6828                             for( l = 0; l < 2; l++ ) {
6829                                 int ln= 1-l;
6830                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6831                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6832                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6833                                     bS[i] = 1;
6834                                     break;
6835                                 }
6836                             }
6837                         }
6838                     }
6839                 }
6840
                     /* all four strengths zero: nothing to filter on this edge */
6841                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6842                     continue;
6843             }
6844
6845             /* Filter edge */
6846             // Do not use s->qscale as luma quantizer because it has not the same
6847             // value in IPCM macroblocks.
6848             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6849             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6850             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6851             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                 /* chroma is filtered on even edges only, with the rounded
                  * average of this MB's and the neighbour's chroma QP */
6852             if( dir == 0 ) {
6853                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6854                 if( (edge&1) == 0 ) {
6855                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6856                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6857                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6858                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6859                 }
6860             } else {
6861                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6862                 if( (edge&1) == 0 ) {
6863                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6864                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6865                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6866                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6867                 }
6868             }
6869         }
6870     }
6871 }
6872
/**
 * Decode the macroblocks of one slice, starting at the current
 * s->mb_x / s->mb_y position.
 *
 * decode_mb_cabac() is used when h->pps.cabac is set, decode_mb_cavlc()
 * otherwise; every macroblock that parses successfully is handed to
 * hl_decode_mb(). With FRAME_MBAFF two macroblocks (rows mb_y and mb_y+1)
 * are decoded per iteration. The covered macroblock range is reported
 * through ff_er_add_slice(), with status flags masked by part_mask.
 *
 * @param avctx codec context (not referenced in the body; logging goes
 *              through h->s.avctx)
 * @param h     H.264 decoder context
 * @return 0 when the slice ended cleanly, -1 on a macroblock decoding error
 *         or when the read position does not match the end of the data
 */
6873 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6874     MpegEncContext * const s = &h->s;
         /* restricts the flags passed to ff_er_add_slice() for partitioned frames */
6875     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6876
6877     s->mb_skip_run= -1;
6878
6879     if( h->pps.cabac ) {
6880         int i;
6881
6882         /* realign */
6883         align_get_bits( &s->gb );
6884
6885         /* init cabac */
6886         ff_init_cabac_states( &h->cabac);
             /* the CABAC decoder reads from the byte position reached by the
              * bit reader, over the remaining bytes (rounded up) */
6887         ff_init_cabac_decoder( &h->cabac,
6888                                s->gb.buffer + get_bits_count(&s->gb)/8,
6889                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6890         /* calculate pre-state */
6891         for( i= 0; i < 460; i++ ) {
6892             int pre;
                 /* I slices use their own init table; other slice types pick
                  * one of the PB tables through cabac_init_idc */
6893             if( h->slice_type_nos == FF_I_TYPE )
6894                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6895             else
6896                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6897
                 /* fold pre (1..126) into the state value: the low bit is 0
                  * for pre <= 63 and 1 otherwise */
6898             if( pre <= 63 )
6899                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6900             else
6901                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6902         }
6903
6904         for(;;){
6905 //START_TIMER
6906             int ret = decode_mb_cabac(h);
6907             int eos;
6908 //STOP_TIMER("decode_mb_cabac")
6909
6910             if(ret>=0) hl_decode_mb(h);
6911
                 /* MBAFF: decode the second macroblock of the pair, one row down */
6912             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6913                 s->mb_y++;
6914
6915                 if(ret>=0) ret = decode_mb_cabac(h);
6916
6917                 if(ret>=0) hl_decode_mb(h);
6918                 s->mb_y--;
6919             }
6920             eos = get_cabac_terminate( &h->cabac );
6921
                 /* error: MB decoding failed, or the CABAC reader ran more
                  * than 2 bytes past its end pointer */
6922             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6923                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6924                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6925                 return -1;
6926             }
6927
                 /* advance; field and MBAFF pictures step two MB rows at a time */
6928             if( ++s->mb_x >= s->mb_width ) {
6929                 s->mb_x = 0;
6930                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6931                 ++s->mb_y;
6932                 if(FIELD_OR_MBAFF_PICTURE) {
6933                     ++s->mb_y;
6934                 }
6935             }
6936
                 /* normal end: terminate flag decoded or last row passed */
6937             if( eos || s->mb_y >= s->mb_height ) {
6938                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6939                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6940                 return 0;
6941             }
6942         }
6943
6944     } else {
6945         for(;;){
6946             int ret = decode_mb_cavlc(h);
6947
6948             if(ret>=0) hl_decode_mb(h);
6949
                 /* MBAFF: decode the second macroblock of the pair, one row down */
6950             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6951                 s->mb_y++;
6952                 ret = decode_mb_cavlc(h);
6953
6954                 if(ret>=0) hl_decode_mb(h);
6955                 s->mb_y--;
6956             }
6957
6958             if(ret<0){
6959                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6960                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6961
6962                 return -1;
6963             }
6964
6965             if(++s->mb_x >= s->mb_width){
6966                 s->mb_x=0;
6967                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6968                 ++s->mb_y;
6969                 if(FIELD_OR_MBAFF_PICTURE) {
6970                     ++s->mb_y;
6971                 }
                     /* last row passed: success only if every bit was consumed.
                      * Note that the failure branch still reports END flags
                      * while returning -1. */
6972                 if(s->mb_y >= s->mb_height){
6973                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6974
6975                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6976                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6977
6978                         return 0;
6979                     }else{
6980                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6981
6982                         return -1;
6983                     }
6984                 }
6985             }
6986
                 /* data exhausted and no skip run pending: clean end when the
                  * position is exactly at the end, error when it overshot */
6987             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6988                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6989                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6990                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6991
6992                     return 0;
6993                 }else{
6994                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6995
6996                     return -1;
6997                 }
6998             }
6999         }
7000     }
7001
     /* Everything up to the matching #endif is excluded from compilation; both
      * branches above only leave their loops through return statements. */
7002 #if 0
7003     for(;s->mb_y < s->mb_height; s->mb_y++){
7004         for(;s->mb_x < s->mb_width; s->mb_x++){
7005             int ret= decode_mb(h);
7006
7007             hl_decode_mb(h);
7008
7009             if(ret<0){
7010                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7011                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7012
7013                 return -1;
7014             }
7015
7016             if(++s->mb_x >= s->mb_width){
7017                 s->mb_x=0;
7018                 if(++s->mb_y >= s->mb_height){
7019                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7020                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7021
7022                         return 0;
7023                     }else{
7024                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7025
7026                         return -1;
7027                     }
7028                 }
7029             }
7030
7031             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7032                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7033                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7034
7035                     return 0;
7036                 }else{
7037                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7038
7039                     return -1;
7040                 }
7041             }
7042         }
7043         s->mb_x=0;
7044         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7045     }
7046 #endif
7047     return -1; //not reached
7048 }
7049
/**
 * Read an unregistered user data SEI payload and look for the x264 version
 * banner in it.
 *
 * Up to sizeof(user_data)-1 payload bytes are copied into a local,
 * zero-terminated buffer; any further bytes are skipped. The text starting
 * at byte 16 (presumably right after the payload's 16-byte UUID -- confirm
 * against the SEI syntax) is matched against "x264 - core %d"; the banner
 * normally continues with "%s - H.264/MPEG-4 AVC codec - Copyleft 2005 -
 * http://www.videolan.org/x264.html". A non-negative build number is stored
 * in h->x264_build.
 *
 * @param h    decoder context
 * @param size payload size in bytes
 * @return 0 on success, -1 if size is below 16 (nothing is read then)
 */
7050 static int decode_unregistered_user_data(H264Context *h, int size){
7051     MpegEncContext * const s = &h->s;
7052     uint8_t user_data[16+256];
7053     int build;
7054     int pos = 0;
7055
7056     if(size<16)
7057         return -1;
7058
7059     while(pos<sizeof(user_data)-1 && pos<size){
7060         user_data[pos]= get_bits(&s->gb, 8);
7061         pos++;
7062     }
7063     user_data[pos]= 0;
7064     if(sscanf(user_data+16, "x264 - core %d", &build)==1 && build>=0)
7065         h->x264_build= build;
7066
7067     if(s->avctx->debug & FF_DEBUG_BUGS)
7068         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7069
         /* consume whatever did not fit into the buffer */
7070     for(; pos<size; pos++)
7071         skip_bits(&s->gb, 8);
7072
7073     return 0;
7074 }
7075
/**
 * Parse SEI messages for as long as more than 16 bits remain in s->gb.
 *
 * Payload type and payload size are each coded as a run of bytes that are
 * summed up; a byte other than 255 ends the run. Type 5 is handed to
 * decode_unregistered_user_data(); the payload of every other type is
 * skipped. The reader is byte-aligned after each message.
 *
 * @param h decoder context
 * @return 0 on success, -1 if decode_unregistered_user_data() failed
 */
7076 static int decode_sei(H264Context *h){
7077     MpegEncContext * const s = &h->s;
7078
7079     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7080         int payload_type = 0;
7081         int payload_size = 0;
7082
7083         for(;;){
7084             payload_type += show_bits(&s->gb, 8);
7085             if(get_bits(&s->gb, 8) != 255)
7086                 break;
7087         }
7088
7089         for(;;){
7090             payload_size += show_bits(&s->gb, 8);
7091             if(get_bits(&s->gb, 8) != 255)
7092                 break;
7093         }
7094
7095         if(payload_type == 5){
7096             if(decode_unregistered_user_data(h, payload_size) < 0)
7097                 return -1;
7098         }else{
7099             skip_bits(&s->gb, 8*payload_size);
7100         }
7101
7102         //FIXME check bits here
7103         align_get_bits(&s->gb);
7104     }
7105     return 0;
7106 }
7107
7108 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7109     MpegEncContext * const s = &h->s;
7110     int cpb_count, i;
7111     cpb_count = get_ue_golomb(&s->gb) + 1;
7112     get_bits(&s->gb, 4); /* bit_rate_scale */
7113     get_bits(&s->gb, 4); /* cpb_size_scale */
7114     for(i=0; i<cpb_count; i++){
7115         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7116         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7117         get_bits1(&s->gb);     /* cbr_flag */
7118     }
7119     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7120     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7121     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7122     get_bits(&s->gb, 5); /* time_offset_length */
7123 }
7124
7125 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7126     MpegEncContext * const s = &h->s;
7127     int aspect_ratio_info_present_flag;
7128     unsigned int aspect_ratio_idc;
7129     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7130
7131     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7132
7133     if( aspect_ratio_info_present_flag ) {
7134         aspect_ratio_idc= get_bits(&s->gb, 8);
7135         if( aspect_ratio_idc == EXTENDED_SAR ) {
7136             sps->sar.num= get_bits(&s->gb, 16);
7137             sps->sar.den= get_bits(&s->gb, 16);
7138         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7139             sps->sar=  pixel_aspect[aspect_ratio_idc];
7140         }else{
7141             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7142             return -1;
7143         }
7144     }else{
7145         sps->sar.num=
7146         sps->sar.den= 0;
7147     }
7148 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7149
7150     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7151         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7152     }
7153
7154     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7155         get_bits(&s->gb, 3);    /* video_format */
7156         get_bits1(&s->gb);      /* video_full_range_flag */
7157         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7158             get_bits(&s->gb, 8); /* colour_primaries */
7159             get_bits(&s->gb, 8); /* transfer_characteristics */
7160             get_bits(&s->gb, 8); /* matrix_coefficients */
7161         }
7162     }
7163
7164     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7165         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7166         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7167     }
7168
7169     sps->timing_info_present_flag = get_bits1(&s->gb);
7170     if(sps->timing_info_present_flag){
7171         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7172         sps->time_scale = get_bits_long(&s->gb, 32);
7173         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7174     }
7175
7176     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7177     if(nal_hrd_parameters_present_flag)
7178         decode_hrd_parameters(h, sps);
7179     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7180     if(vcl_hrd_parameters_present_flag)
7181         decode_hrd_parameters(h, sps);
7182     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7183         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7184     get_bits1(&s->gb);         /* pic_struct_present_flag */
7185
7186     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7187     if(sps->bitstream_restriction_flag){
7188         unsigned int num_reorder_frames;
7189         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7190         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7191         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7192         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7193         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7194         num_reorder_frames= get_ue_golomb(&s->gb);
7195         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7196
7197         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7198             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7199             return -1;
7200         }
7201
7202         sps->num_reorder_frames= num_reorder_frames;
7203     }
7204
7205     return 0;
7206 }
7207
7208 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7209                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7210     MpegEncContext * const s = &h->s;
7211     int i, last = 8, next = 8;
7212     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7213     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7214         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7215     else
7216     for(i=0;i<size;i++){
7217         if(next)
7218             next = (last + get_se_golomb(&s->gb)) & 0xff;
7219         if(!i && !next){ /* matrix not written, we use the preset one */
7220             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7221             break;
7222         }
7223         last = factors[scan[i]] = next ? next : last;
7224     }
7225 }
7226
7227 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7228                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7229     MpegEncContext * const s = &h->s;
7230     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7231     const uint8_t *fallback[4] = {
7232         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7233         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7234         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7235         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7236     };
7237     if(get_bits1(&s->gb)){
7238         sps->scaling_matrix_present |= is_sps;
7239         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7240         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7241         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7242         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7243         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7244         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7245         if(is_sps || pps->transform_8x8_mode){
7246             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7247             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7248         }
7249     } else if(fallback_sps) {
7250         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7251         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7252     }
7253 }
7254
7255 /**
7256  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7257  */
7258 static void *
7259 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7260                     const size_t size, const char *name)
7261 {
7262     if(id>=max) {
7263         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7264         return NULL;
7265     }
7266
7267     if(!vec[id]) {
7268         vec[id] = av_mallocz(size);
7269         if(vec[id] == NULL)
7270             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7271     }
7272     return vec[id];
7273 }
7274
7275 static inline int decode_seq_parameter_set(H264Context *h){
7276     MpegEncContext * const s = &h->s;
7277     int profile_idc, level_idc;
7278     unsigned int sps_id, tmp, mb_width, mb_height;
7279     int i;
7280     SPS *sps;
7281
7282     profile_idc= get_bits(&s->gb, 8);
7283     get_bits1(&s->gb);   //constraint_set0_flag
7284     get_bits1(&s->gb);   //constraint_set1_flag
7285     get_bits1(&s->gb);   //constraint_set2_flag
7286     get_bits1(&s->gb);   //constraint_set3_flag
7287     get_bits(&s->gb, 4); // reserved
7288     level_idc= get_bits(&s->gb, 8);
7289     sps_id= get_ue_golomb(&s->gb);
7290
7291     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7292     if(sps == NULL)
7293         return -1;
7294
7295     sps->profile_idc= profile_idc;
7296     sps->level_idc= level_idc;
7297
7298     if(sps->profile_idc >= 100){ //high profile
7299         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7300             get_bits1(&s->gb);  //residual_color_transform_flag
7301         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7302         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7303         sps->transform_bypass = get_bits1(&s->gb);
7304         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7305     }else
7306         sps->scaling_matrix_present = 0;
7307
7308     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7309     sps->poc_type= get_ue_golomb(&s->gb);
7310
7311     if(sps->poc_type == 0){ //FIXME #define
7312         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7313     } else if(sps->poc_type == 1){//FIXME #define
7314         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7315         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7316         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7317         tmp= get_ue_golomb(&s->gb);
7318
7319         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7320             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7321             return -1;
7322         }
7323         sps->poc_cycle_length= tmp;
7324
7325         for(i=0; i<sps->poc_cycle_length; i++)
7326             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7327     }else if(sps->poc_type != 2){
7328         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7329         return -1;
7330     }
7331
7332     tmp= get_ue_golomb(&s->gb);
7333     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7334         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7335         return -1;
7336     }
7337     sps->ref_frame_count= tmp;
7338     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7339     mb_width= get_ue_golomb(&s->gb) + 1;
7340     mb_height= get_ue_golomb(&s->gb) + 1;
7341     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7342        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7343         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7344         return -1;
7345     }
7346     sps->mb_width = mb_width;
7347     sps->mb_height= mb_height;
7348
7349     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7350     if(!sps->frame_mbs_only_flag)
7351         sps->mb_aff= get_bits1(&s->gb);
7352     else
7353         sps->mb_aff= 0;
7354
7355     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7356
7357 #ifndef ALLOW_INTERLACE
7358     if(sps->mb_aff)
7359         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7360 #endif
7361     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7362         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7363
7364     sps->crop= get_bits1(&s->gb);
7365     if(sps->crop){
7366         sps->crop_left  = get_ue_golomb(&s->gb);
7367         sps->crop_right = get_ue_golomb(&s->gb);
7368         sps->crop_top   = get_ue_golomb(&s->gb);
7369         sps->crop_bottom= get_ue_golomb(&s->gb);
7370         if(sps->crop_left || sps->crop_top){
7371             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7372         }
7373         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7374             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7375         }
7376     }else{
7377         sps->crop_left  =
7378         sps->crop_right =
7379         sps->crop_top   =
7380         sps->crop_bottom= 0;
7381     }
7382
7383     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7384     if( sps->vui_parameters_present_flag )
7385         decode_vui_parameters(h, sps);
7386
7387     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7388         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7389                sps_id, sps->profile_idc, sps->level_idc,
7390                sps->poc_type,
7391                sps->ref_frame_count,
7392                sps->mb_width, sps->mb_height,
7393                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7394                sps->direct_8x8_inference_flag ? "8B8" : "",
7395                sps->crop_left, sps->crop_right,
7396                sps->crop_top, sps->crop_bottom,
7397                sps->vui_parameters_present_flag ? "VUI" : ""
7398                );
7399     }
7400     return 0;
7401 }
7402
7403 static void
7404 build_qp_table(PPS *pps, int t, int index)
7405 {
7406     int i;
7407     for(i = 0; i < 52; i++)
7408         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7409 }
7410
7411 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7412     MpegEncContext * const s = &h->s;
7413     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7414     PPS *pps;
7415
7416     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7417     if(pps == NULL)
7418         return -1;
7419
7420     tmp= get_ue_golomb(&s->gb);
7421     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7422         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7423         return -1;
7424     }
7425     pps->sps_id= tmp;
7426
7427     pps->cabac= get_bits1(&s->gb);
7428     pps->pic_order_present= get_bits1(&s->gb);
7429     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7430     if(pps->slice_group_count > 1 ){
7431         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7432         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7433         switch(pps->mb_slice_group_map_type){
7434         case 0:
7435 #if 0
7436 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7437 |    run_length[ i ]                                |1  |ue(v)   |
7438 #endif
7439             break;
7440         case 2:
7441 #if 0
7442 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7443 |{                                                  |   |        |
7444 |    top_left_mb[ i ]                               |1  |ue(v)   |
7445 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7446 |   }                                               |   |        |
7447 #endif
7448             break;
7449         case 3:
7450         case 4:
7451         case 5:
7452 #if 0
7453 |   slice_group_change_direction_flag               |1  |u(1)    |
7454 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7455 #endif
7456             break;
7457         case 6:
7458 #if 0
7459 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7460 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7461 |)                                                  |   |        |
7462 |    slice_group_id[ i ]                            |1  |u(v)    |
7463 #endif
7464             break;
7465         }
7466     }
7467     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7468     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7469     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7470         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7471         pps->ref_count[0]= pps->ref_count[1]= 1;
7472         return -1;
7473     }
7474
7475     pps->weighted_pred= get_bits1(&s->gb);
7476     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7477     pps->init_qp= get_se_golomb(&s->gb) + 26;
7478     pps->init_qs= get_se_golomb(&s->gb) + 26;
7479     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7480     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7481     pps->constrained_intra_pred= get_bits1(&s->gb);
7482     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7483
7484     pps->transform_8x8_mode= 0;
7485     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7486     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7487     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7488
7489     if(get_bits_count(&s->gb) < bit_length){
7490         pps->transform_8x8_mode= get_bits1(&s->gb);
7491         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7492         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7493     } else {
7494         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7495     }
7496
7497     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7498     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7499     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7500         h->pps.chroma_qp_diff= 1;
7501
7502     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7503         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7504                pps_id, pps->sps_id,
7505                pps->cabac ? "CABAC" : "CAVLC",
7506                pps->slice_group_count,
7507                pps->ref_count[0], pps->ref_count[1],
7508                pps->weighted_pred ? "weighted" : "",
7509                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7510                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7511                pps->constrained_intra_pred ? "CONSTR" : "",
7512                pps->redundant_pic_cnt_present ? "REDU" : "",
7513                pps->transform_8x8_mode ? "8x8DCT" : ""
7514                );
7515     }
7516
7517     return 0;
7518 }
7519
7520 /**
7521  * Call decode_slice() for each context.
7522  *
7523  * @param h h264 master context
7524  * @param context_count number of contexts to execute
7525  */
7526 static void execute_decode_slices(H264Context *h, int context_count){
7527     MpegEncContext * const s = &h->s;
7528     AVCodecContext * const avctx= s->avctx;
7529     H264Context *hx;
7530     int i;
7531
7532     if(context_count == 1) {
7533         decode_slice(avctx, h);
7534     } else {
7535         for(i = 1; i < context_count; i++) {
7536             hx = h->thread_context[i];
7537             hx->s.error_resilience = avctx->error_resilience;
7538             hx->s.error_count = 0;
7539         }
7540
7541         avctx->execute(avctx, (void *)decode_slice,
7542                        (void **)h->thread_context, NULL, context_count);
7543
7544         /* pull back stuff from slices to master context */
7545         hx = h->thread_context[context_count - 1];
7546         s->mb_x = hx->s.mb_x;
7547         s->mb_y = hx->s.mb_y;
7548         s->dropable = hx->s.dropable;
7549         s->picture_structure = hx->s.picture_structure;
7550         for(i = 1; i < context_count; i++)
7551             h->s.error_count += h->thread_context[i]->s.error_count;
7552     }
7553 }
7554
7555
7556 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7557     MpegEncContext * const s = &h->s;
7558     AVCodecContext * const avctx= s->avctx;
7559     int buf_index=0;
7560     H264Context *hx; ///< thread context
7561     int context_count = 0;
7562
7563     h->max_contexts = avctx->thread_count;
7564 #if 0
7565     int i;
7566     for(i=0; i<50; i++){
7567         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7568     }
7569 #endif
7570     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7571         h->current_slice = 0;
7572         if (!s->first_field)
7573             s->current_picture_ptr= NULL;
7574     }
7575
7576     for(;;){
7577         int consumed;
7578         int dst_length;
7579         int bit_length;
7580         const uint8_t *ptr;
7581         int i, nalsize = 0;
7582         int err;
7583
7584         if(h->is_avc) {
7585             if(buf_index >= buf_size) break;
7586             nalsize = 0;
7587             for(i = 0; i < h->nal_length_size; i++)
7588                 nalsize = (nalsize << 8) | buf[buf_index++];
7589             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7590                 if(nalsize == 1){
7591                     buf_index++;
7592                     continue;
7593                 }else{
7594                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7595                     break;
7596                 }
7597             }
7598         } else {
7599             // start code prefix search
7600             for(; buf_index + 3 < buf_size; buf_index++){
7601                 // This should always succeed in the first iteration.
7602                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7603                     break;
7604             }
7605
7606             if(buf_index+3 >= buf_size) break;
7607
7608             buf_index+=3;
7609         }
7610
7611         hx = h->thread_context[context_count];
7612
7613         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7614         if (ptr==NULL || dst_length < 0){
7615             return -1;
7616         }
7617         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7618             dst_length--;
7619         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7620
7621         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7622             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7623         }
7624
7625         if (h->is_avc && (nalsize != consumed)){
7626             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7627             consumed= nalsize;
7628         }
7629
7630         buf_index += consumed;
7631
7632         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7633            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7634             continue;
7635
7636       again:
7637         err = 0;
7638         switch(hx->nal_unit_type){
7639         case NAL_IDR_SLICE:
7640             if (h->nal_unit_type != NAL_IDR_SLICE) {
7641                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7642                 return -1;
7643             }
7644             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7645         case NAL_SLICE:
7646             init_get_bits(&hx->s.gb, ptr, bit_length);
7647             hx->intra_gb_ptr=
7648             hx->inter_gb_ptr= &hx->s.gb;
7649             hx->s.data_partitioning = 0;
7650
7651             if((err = decode_slice_header(hx, h)))
7652                break;
7653
7654             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7655             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7656                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7657                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7658                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7659                && avctx->skip_frame < AVDISCARD_ALL)
7660                 context_count++;
7661             break;
7662         case NAL_DPA:
7663             init_get_bits(&hx->s.gb, ptr, bit_length);
7664             hx->intra_gb_ptr=
7665             hx->inter_gb_ptr= NULL;
7666             hx->s.data_partitioning = 1;
7667
7668             err = decode_slice_header(hx, h);
7669             break;
7670         case NAL_DPB:
7671             init_get_bits(&hx->intra_gb, ptr, bit_length);
7672             hx->intra_gb_ptr= &hx->intra_gb;
7673             break;
7674         case NAL_DPC:
7675             init_get_bits(&hx->inter_gb, ptr, bit_length);
7676             hx->inter_gb_ptr= &hx->inter_gb;
7677
7678             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7679                && s->context_initialized
7680                && s->hurry_up < 5
7681                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7682                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7683                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7684                && avctx->skip_frame < AVDISCARD_ALL)
7685                 context_count++;
7686             break;
7687         case NAL_SEI:
7688             init_get_bits(&s->gb, ptr, bit_length);
7689             decode_sei(h);
7690             break;
7691         case NAL_SPS:
7692             init_get_bits(&s->gb, ptr, bit_length);
7693             decode_seq_parameter_set(h);
7694
7695             if(s->flags& CODEC_FLAG_LOW_DELAY)
7696                 s->low_delay=1;
7697
7698             if(avctx->has_b_frames < 2)
7699                 avctx->has_b_frames= !s->low_delay;
7700             break;
7701         case NAL_PPS:
7702             init_get_bits(&s->gb, ptr, bit_length);
7703
7704             decode_picture_parameter_set(h, bit_length);
7705
7706             break;
7707         case NAL_AUD:
7708         case NAL_END_SEQUENCE:
7709         case NAL_END_STREAM:
7710         case NAL_FILLER_DATA:
7711         case NAL_SPS_EXT:
7712         case NAL_AUXILIARY_SLICE:
7713             break;
7714         default:
7715             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7716         }
7717
7718         if(context_count == h->max_contexts) {
7719             execute_decode_slices(h, context_count);
7720             context_count = 0;
7721         }
7722
7723         if (err < 0)
7724             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7725         else if(err == 1) {
7726             /* Slice could not be decoded in parallel mode, copy down
7727              * NAL unit stuff to context 0 and restart. Note that
7728              * rbsp_buffer is not transferred, but since we no longer
7729              * run in parallel mode this should not be an issue. */
7730             h->nal_unit_type = hx->nal_unit_type;
7731             h->nal_ref_idc   = hx->nal_ref_idc;
7732             hx = h;
7733             goto again;
7734         }
7735     }
7736     if(context_count)
7737         execute_decode_slices(h, context_count);
7738     return buf_index;
7739 }
7740
7741 /**
7742  * returns the number of bytes consumed for building the current frame
7743  */
7744 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7745         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7746         if(pos+10>buf_size) pos=buf_size; // oops ;)
7747
7748         return pos;
7749 }
7750
7751 static int decode_frame(AVCodecContext *avctx,
7752                              void *data, int *data_size,
7753                              const uint8_t *buf, int buf_size)
7754 {
7755     H264Context *h = avctx->priv_data;
7756     MpegEncContext *s = &h->s;
7757     AVFrame *pict = data;
7758     int buf_index;
7759
7760     s->flags= avctx->flags;
7761     s->flags2= avctx->flags2;
7762
7763    /* end of stream, output what is still in the buffers */
7764     if (buf_size == 0) {
7765         Picture *out;
7766         int i, out_idx;
7767
7768 //FIXME factorize this with the output code below
7769         out = h->delayed_pic[0];
7770         out_idx = 0;
7771         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7772             if(h->delayed_pic[i]->poc < out->poc){
7773                 out = h->delayed_pic[i];
7774                 out_idx = i;
7775             }
7776
7777         for(i=out_idx; h->delayed_pic[i]; i++)
7778             h->delayed_pic[i] = h->delayed_pic[i+1];
7779
7780         if(out){
7781             *data_size = sizeof(AVFrame);
7782             *pict= *(AVFrame*)out;
7783         }
7784
7785         return 0;
7786     }
7787
7788     if(h->is_avc && !h->got_avcC) {
7789         int i, cnt, nalsize;
7790         unsigned char *p = avctx->extradata;
7791         if(avctx->extradata_size < 7) {
7792             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7793             return -1;
7794         }
7795         if(*p != 1) {
7796             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7797             return -1;
7798         }
7799         /* sps and pps in the avcC always have length coded with 2 bytes,
7800            so put a fake nal_length_size = 2 while parsing them */
7801         h->nal_length_size = 2;
7802         // Decode sps from avcC
7803         cnt = *(p+5) & 0x1f; // Number of sps
7804         p += 6;
7805         for (i = 0; i < cnt; i++) {
7806             nalsize = AV_RB16(p) + 2;
7807             if(decode_nal_units(h, p, nalsize) < 0) {
7808                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7809                 return -1;
7810             }
7811             p += nalsize;
7812         }
7813         // Decode pps from avcC
7814         cnt = *(p++); // Number of pps
7815         for (i = 0; i < cnt; i++) {
7816             nalsize = AV_RB16(p) + 2;
7817             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7818                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7819                 return -1;
7820             }
7821             p += nalsize;
7822         }
7823         // Now store right nal length size, that will be use to parse all other nals
7824         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7825         // Do not reparse avcC
7826         h->got_avcC = 1;
7827     }
7828
7829     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7830         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7831             return -1;
7832     }
7833
7834     buf_index=decode_nal_units(h, buf, buf_size);
7835     if(buf_index < 0)
7836         return -1;
7837
7838     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7839         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7840         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7841         return -1;
7842     }
7843
7844     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7845         Picture *out = s->current_picture_ptr;
7846         Picture *cur = s->current_picture_ptr;
7847         int i, pics, cross_idr, out_of_order, out_idx;
7848
7849         s->mb_y= 0;
7850
7851         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7852         s->current_picture_ptr->pict_type= s->pict_type;
7853
7854         if(!s->dropable) {
7855             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7856             h->prev_poc_msb= h->poc_msb;
7857             h->prev_poc_lsb= h->poc_lsb;
7858         }
7859         h->prev_frame_num_offset= h->frame_num_offset;
7860         h->prev_frame_num= h->frame_num;
7861
7862         /*
7863          * FIXME: Error handling code does not seem to support interlaced
7864          * when slices span multiple rows
7865          * The ff_er_add_slice calls don't work right for bottom
7866          * fields; they cause massive erroneous error concealing
7867          * Error marking covers both fields (top and bottom).
7868          * This causes a mismatched s->error_count
7869          * and a bad error table. Further, the error count goes to
7870          * INT_MAX when called for bottom field, because mb_y is
7871          * past end by one (callers fault) and resync_mb_y != 0
7872          * causes problems for the first MB line, too.
7873          */
7874         if (!FIELD_PICTURE)
7875             ff_er_frame_end(s);
7876
7877         MPV_frame_end(s);
7878
7879         if (s->first_field) {
7880             /* Wait for second field. */
7881             *data_size = 0;
7882
7883         } else {
7884             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7885             /* Derive top_field_first from field pocs. */
7886             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7887
7888         //FIXME do something with unavailable reference frames
7889
7890             /* Sort B-frames into display order */
7891
7892             if(h->sps.bitstream_restriction_flag
7893                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7894                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7895                 s->low_delay = 0;
7896             }
7897
7898             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7899                && !h->sps.bitstream_restriction_flag){
7900                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7901                 s->low_delay= 0;
7902             }
7903
7904             pics = 0;
7905             while(h->delayed_pic[pics]) pics++;
7906
7907             assert(pics <= MAX_DELAYED_PIC_COUNT);
7908
7909             h->delayed_pic[pics++] = cur;
7910             if(cur->reference == 0)
7911                 cur->reference = DELAYED_PIC_REF;
7912
7913             out = h->delayed_pic[0];
7914             out_idx = 0;
7915             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7916                 if(h->delayed_pic[i]->poc < out->poc){
7917                     out = h->delayed_pic[i];
7918                     out_idx = i;
7919                 }
7920             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7921
7922             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7923
7924             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7925                 { }
7926             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7927                || (s->low_delay &&
7928                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7929                  || cur->pict_type == FF_B_TYPE)))
7930             {
7931                 s->low_delay = 0;
7932                 s->avctx->has_b_frames++;
7933             }
7934
7935             if(out_of_order || pics > s->avctx->has_b_frames){
7936                 out->reference &= ~DELAYED_PIC_REF;
7937                 for(i=out_idx; h->delayed_pic[i]; i++)
7938                     h->delayed_pic[i] = h->delayed_pic[i+1];
7939             }
7940             if(!out_of_order && pics > s->avctx->has_b_frames){
7941                 *data_size = sizeof(AVFrame);
7942
7943                 h->outputed_poc = out->poc;
7944                 *pict= *(AVFrame*)out;
7945             }else{
7946                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7947             }
7948         }
7949     }
7950
7951     assert(pict->data[0] || !*data_size);
7952     ff_print_debug_info(s, pict);
7953 //printf("out %d\n", (int)pict->data[0]);
7954 #if 0 //?
7955
7956     /* Return the Picture timestamp as the frame number */
7957     /* we subtract 1 because it is added on utils.c     */
7958     avctx->frame_number = s->picture_number - 1;
7959 #endif
7960     return get_consumed_bytes(s, buf_index, buf_size);
7961 }
7962 #if 0
/**
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 * A neighbour is flagged as available when its slice_table entry equals
 * h->slice_num, i.e. it was decoded as part of the current slice.
 * Entries 0..2 refer to the row above (left-above, above, right-above),
 * entry 3 to the left neighbour; entries 4 and 5 are constants.
 * This function sits inside an "#if 0" region and is not compiled.
 */
7963 static inline void fill_mb_avail(H264Context *h){
7964     MpegEncContext * const s = &h->s;
         /* index of the current macroblock in the mb_stride-wide tables */
7965     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7966
7967     if(s->mb_y){
             /* row above exists: check x bounds, then slice membership */
7968         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7969         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7970         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7971     }else{
             /* first macroblock row: nothing above is available */
7972         h->mb_avail[0]=
7973         h->mb_avail[1]=
7974         h->mb_avail[2]= 0;
7975     }
         /* left neighbour: needs mb_x > 0 and the same slice */
7976     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7977     h->mb_avail[4]= 1; //FIXME move out
7978     h->mb_avail[5]= 0; //FIXME move out
7979 }
7980 #endif
7981
7982 #ifdef TEST
7983 #undef printf
7984 #undef random
7985 #define COUNT 8000
7986 #define SIZE (COUNT*40)
7987 int main(void){
7988     int i;
7989     uint8_t temp[SIZE];
7990     PutBitContext pb;
7991     GetBitContext gb;
7992 //    int int_temp[10000];
7993     DSPContext dsp;
7994     AVCodecContext avctx;
7995
7996     dsputil_init(&dsp, &avctx);
7997
7998     init_put_bits(&pb, temp, SIZE);
7999     printf("testing unsigned exp golomb\n");
8000     for(i=0; i<COUNT; i++){
8001         START_TIMER
8002         set_ue_golomb(&pb, i);
8003         STOP_TIMER("set_ue_golomb");
8004     }
8005     flush_put_bits(&pb);
8006
8007     init_get_bits(&gb, temp, 8*SIZE);
8008     for(i=0; i<COUNT; i++){
8009         int j, s;
8010
8011         s= show_bits(&gb, 24);
8012
8013         START_TIMER
8014         j= get_ue_golomb(&gb);
8015         if(j != i){
8016             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8017 //            return -1;
8018         }
8019         STOP_TIMER("get_ue_golomb");
8020     }
8021
8022
8023     init_put_bits(&pb, temp, SIZE);
8024     printf("testing signed exp golomb\n");
8025     for(i=0; i<COUNT; i++){
8026         START_TIMER
8027         set_se_golomb(&pb, i - COUNT/2);
8028         STOP_TIMER("set_se_golomb");
8029     }
8030     flush_put_bits(&pb);
8031
8032     init_get_bits(&gb, temp, 8*SIZE);
8033     for(i=0; i<COUNT; i++){
8034         int j, s;
8035
8036         s= show_bits(&gb, 24);
8037
8038         START_TIMER
8039         j= get_se_golomb(&gb);
8040         if(j != i - COUNT/2){
8041             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8042 //            return -1;
8043         }
8044         STOP_TIMER("get_se_golomb");
8045     }
8046
8047 #if 0
8048     printf("testing 4x4 (I)DCT\n");
8049
8050     DCTELEM block[16];
8051     uint8_t src[16], ref[16];
8052     uint64_t error= 0, max_error=0;
8053
8054     for(i=0; i<COUNT; i++){
8055         int j;
8056 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8057         for(j=0; j<16; j++){
8058             ref[j]= random()%255;
8059             src[j]= random()%255;
8060         }
8061
8062         h264_diff_dct_c(block, src, ref, 4);
8063
8064         //normalize
8065         for(j=0; j<16; j++){
8066 //            printf("%d ", block[j]);
8067             block[j]= block[j]*4;
8068             if(j&1) block[j]= (block[j]*4 + 2)/5;
8069             if(j&4) block[j]= (block[j]*4 + 2)/5;
8070         }
8071 //        printf("\n");
8072
8073         s->dsp.h264_idct_add(ref, block, 4);
8074 /*        for(j=0; j<16; j++){
8075             printf("%d ", ref[j]);
8076         }
8077         printf("\n");*/
8078
8079         for(j=0; j<16; j++){
8080             int diff= FFABS(src[j] - ref[j]);
8081
8082             error+= diff*diff;
8083             max_error= FFMAX(max_error, diff);
8084         }
8085     }
8086     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8087     printf("testing quantizer\n");
8088     for(qp=0; qp<52; qp++){
8089         for(i=0; i<16; i++)
8090             src1_block[i]= src2_block[i]= random()%255;
8091
8092     }
8093     printf("Testing NAL layer\n");
8094
8095     uint8_t bitstream[COUNT];
8096     uint8_t nal[COUNT*2];
8097     H264Context h;
8098     memset(&h, 0, sizeof(H264Context));
8099
8100     for(i=0; i<COUNT; i++){
8101         int zeros= i;
8102         int nal_length;
8103         int consumed;
8104         int out_length;
8105         uint8_t *out;
8106         int j;
8107
8108         for(j=0; j<COUNT; j++){
8109             bitstream[j]= (random() % 255) + 1;
8110         }
8111
8112         for(j=0; j<zeros; j++){
8113             int pos= random() % COUNT;
8114             while(bitstream[pos] == 0){
8115                 pos++;
8116                 pos %= COUNT;
8117             }
8118             bitstream[pos]=0;
8119         }
8120
8121         START_TIMER
8122
8123         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8124         if(nal_length<0){
8125             printf("encoding failed\n");
8126             return -1;
8127         }
8128
8129         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8130
8131         STOP_TIMER("NAL")
8132
8133         if(out_length != COUNT){
8134             printf("incorrect length %d %d\n", out_length, COUNT);
8135             return -1;
8136         }
8137
8138         if(consumed != nal_length){
8139             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8140             return -1;
8141         }
8142
8143         if(memcmp(bitstream, out, COUNT)){
8144             printf("mismatch\n");
8145             return -1;
8146         }
8147     }
8148 #endif
8149
8150     printf("Testing RBSP\n");
8151
8152
8153     return 0;
8154 }
8155 #endif /* TEST */
8156
8157
/**
 * Releases the decoder state attached to avctx; referenced from the
 * h264_decoder initializer further down in this file.
 * Frees both rbsp_buffer entries, then calls free_tables() and
 * MPV_common_end() on the embedded MpegEncContext.
 * @param avctx codec context whose priv_data is the H264Context
 * @return always 0
 */
8158 static av_cold int decode_end(AVCodecContext *avctx)
8159 {
8160     H264Context *h = avctx->priv_data;
8161     MpegEncContext *s = &h->s;
8162
         /* av_freep() is handed the address of each pointer;
            presumably it also resets it to NULL - confirm in libavutil */
8163     av_freep(&h->rbsp_buffer[0]);
8164     av_freep(&h->rbsp_buffer[1]);
8165     free_tables(h); //FIXME cleanup init stuff perhaps
8166     MPV_common_end(s);
8167
8168 //    memset(h, 0, sizeof(H264Context));
8169
8170     return 0;
8171 }
8172
8173
/**
 * Codec descriptor for the H.264 decoder.
 * The leading initializers are positional; NOTE(review): they presumably
 * map to AVCodec's name, type, id, priv_data_size, init, encode, close,
 * decode and capabilities fields in that order - confirm against the
 * AVCodec declaration in avcodec.h.
 * CODEC_CAP_DRAW_HORIZ_BAND is deliberately left commented out of the
 * capability flags.
 */
8174 AVCodec h264_decoder = {
8175     "h264",
8176     CODEC_TYPE_VIDEO,
8177     CODEC_ID_H264,
8178     sizeof(H264Context),
8179     decode_init,
8180     NULL,
8181     decode_end,
8182     decode_frame,
8183     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8184     .flush= flush_dpb,
8185     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8186 };
8187
8188 #include "svq3.c"