git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /* File-local VLC tables. Going by their names they hold the coeff_token
      * codes (four tables plus a separate chroma DC one) -- NOTE(review): how
      * the tables are built and selected is not visible in this chunk. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
     /* Presumably the total_zeros codes (15 tables, plus 3 for chroma DC);
      * confirm against the initialisation code. */
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
     /* Presumably the run codes (6 tables plus a separate "run7" table);
      * confirm against the initialisation code. */
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
     /* Forward declarations; the definitions are not in the visible part of
      * this file. */
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  /**
   * Packs the low 16 bits of two values into one 32-bit word.
   *
   * The halves are ordered by target endianness so that, once the word is
   * stored to memory, a occupies the two lower-addressed bytes and b the two
   * higher-addressed ones.
   *
   * The shift is done on uint32_t: callers may pass negative values, and
   * left-shifting a negative int is undefined behaviour in C.
   *
   * @param a value placed first in memory (only its low 16 bits are used)
   * @param b value placed second in memory (only its low 16 bits are used)
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return ((uint32_t)a << 16) | (uint32_t)(b & 0xFFFF);
  #else
     return ((uint32_t)b << 16) | (uint32_t)(a & 0xFFFF);
  #endif
  }
  72
  /* Remainder lookup: ff_rem6[i] == i % 6 for i = 0..51 (one period per row). */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  76
  /* Quotient lookup: ff_div6[i] == i / 6 for i = 0..51 (one quotient per row). */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  80
  81
  82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
         /*
          * Loads the neighbour-dependent caches for the macroblock at h->mb_xy.
          *
          * It works out the addresses of the top, top-left, top-right and the
          * two left neighbours (adjusted for MBAFF), records them in
          * h->top_mb_xy / h->left_mb_xy[], and then fills, as applicable:
          * the *_samples_available masks and intra4x4_pred_mode_cache (intra
          * MBs), non_zero_count_cache, top_cbp / left_cbp (CABAC), mv_cache,
          * ref_cache, mvd_cache, direct_cache and neighbor_transform_size.
          *
          * mb_type     type flags of the current macroblock
          * for_deblock non-zero when called for deblocking: a neighbour is
          *             then used whenever its slice_table[] entry is < 255
          *             (instead of having to match h->slice_num), and the
          *             top-left / top-right types are forced to 0.
          */
  83     MpegEncContext * const s = &h->s;
  84     const int mb_xy= h->mb_xy;
  85     int topleft_xy, top_xy, topright_xy, left_xy[2];
  86     int topleft_type, top_type, topright_type, left_type[2];
  87     int left_block[8];
  88     int topleft_partition= -1;
  89     int i;
  90
  91     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  92
  93     //FIXME deblocking could skip the intra and nnz parts.
         /* Deblocking shortcut: outside MBAFF, when slice_num is 1 or the MB
          * above belongs to the same slice, return without touching any cache
          * (h->top_mb_xy / h->left_mb_xy[] are not updated either). */
  94     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  95         return;
  96
  97     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  98      * stuff, I can't imagine that these complex rules are worth it. */
  99
 100     topleft_xy = top_xy - 1;
 101     topright_xy= top_xy + 1;
 102     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[0..3] are used below both as indices into the left
          * neighbour's per-MB tables (intra4x4_pred_mode[], non_zero_count[])
          * and as 4x4 row numbers when addressing its motion data;
          * left_block[4..7] are only used to index non_zero_count[]. */
 103     left_block[0]= 0;
 104     left_block[1]= 1;
 105     left_block[2]= 2;
 106     left_block[3]= 3;
 107     left_block[4]= 7;
 108     left_block[5]= 10;
 109     left_block[6]= 8;
 110     left_block[7]= 11;
         /* MBAFF: neighbour addresses and the left_block mapping depend on
          * whether the current MB and each neighbouring MB pair are frame
          * (non-interlaced) or field (interlaced) coded. */
 111     if(FRAME_MBAFF){
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block[0]= 2;
 150                     left_block[1]= 2;
 151                     left_block[2]= 3;
 152                     left_block[3]= 3;
 153                     left_block[4]= 8;
 154                     left_block[5]= 11;
 155                     left_block[6]= 8;
 156                     left_block[7]= 11;
 157                 } else {
 158                     left_block[0]= 0;
 159                     left_block[1]= 0;
 160                     left_block[2]= 1;
 161                     left_block[3]= 1;
 162                     left_block[4]= 7;
 163                     left_block[5]= 10;
 164                     left_block[6]= 7;
 165                     left_block[7]= 10;
 166                 }
 167             } else {
 168                 left_xy[1] += s->mb_stride;
 169                 //left_block[0]= 0;
 170                 left_block[1]= 2;
 171                 left_block[2]= 0;
 172                 left_block[3]= 2;
 173                 //left_block[4]= 7;
 174                 left_block[5]= 10;
 175                 left_block[6]= 7;
 176                 left_block[7]= 10;
 177             }
 178         }
 179     }
 180
 181     h->top_mb_xy = top_xy;
 182     h->left_mb_xy[0] = left_xy[0];
 183     h->left_mb_xy[1] = left_xy[1];
         /* Neighbour types: 0 means "not usable". For deblocking a neighbour
          * counts when its slice_table[] entry is < 255; for decoding it must
          * belong to the current slice (slice_table[] == h->slice_num). */
 184     if(for_deblock){
 185         topleft_type = 0;
 186         topright_type = 0;
 187         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 188         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 189         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 190
 191         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 192             int list;
                 /* Bit i of the 16-bit value kept at non_zero_count[mb_xy][14]
                  * (written by write_back_non_zero_count() under FRAME_MBAFF)
                  * tells whether luma block i had a non-zero count. */
 193             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 194             for(i=0; i<16; i++)
 195                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
                 /* Reload this MB's own motion vectors and reference indices
                  * from the picture tables into the caches. */
 196             for(list=0; list<h->list_count; list++){
 197                 if(USES_LIST(mb_type,list)){
 198                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 199                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 200                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 201                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 202                         dst[0] = src[0];
 203                         dst[1] = src[1];
 204                         dst[2] = src[2];
 205                         dst[3] = src[3];
 206                     }
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 209                     ref += h->b8_stride;
 210                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 211                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 212                 }else{
 213                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 214                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 215                 }
 216             }
 217         }
 218     }else{
 219         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 220         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 221         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 222         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 223         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 224     }
 225
 226     if(IS_INTRA(mb_type)){
             /* Start from the default masks, then clear bits for every
              * neighbour that is missing (type 0) or, with
              * constrained_intra_pred set, is not intra coded. */
 227         h->topleft_samples_available=
 228         h->top_samples_available=
 229         h->left_samples_available= 0xFFFF;
 230         h->topright_samples_available= 0xEEEA;
 231
 232         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 233             h->topleft_samples_available= 0xB3FF;
 234             h->top_samples_available= 0x33FF;
 235             h->topright_samples_available= 0x26EA;
 236         }
 237         for(i=0; i<2; i++){
 238             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 239                 h->topleft_samples_available&= 0xDF5F;
 240                 h->left_samples_available&= 0x5F5F;
 241             }
 242         }
 243
 244         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 245             h->topleft_samples_available&= 0x7FFF;
 246
 247         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 248             h->topright_samples_available&= 0xFBFF;
 249
 250         if(IS_INTRA4x4(mb_type)){
                 /* Neighbouring intra4x4 modes: copied from intra-4x4
                  * neighbours; otherwise -1 when the neighbour is missing (or
                  * inter with constrained_intra_pred), else 2. */
 251             if(IS_INTRA4x4(top_type)){
 252                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 253                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 254                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 255                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 256             }else{
 257                 int pred;
 258                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 259                     pred= -1;
 260                 else{
 261                     pred= 2;
 262                 }
 263                 h->intra4x4_pred_mode_cache[4+8*0]=
 264                 h->intra4x4_pred_mode_cache[5+8*0]=
 265                 h->intra4x4_pred_mode_cache[6+8*0]=
 266                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 267             }
 268             for(i=0; i<2; i++){
 269                 if(IS_INTRA4x4(left_type[i])){
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 272                 }else{
 273                     int pred;
 274                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 275                         pred= -1;
 276                     else{
 277                         pred= 2;
 278                     }
 279                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 280                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 281                 }
 282             }
 283         }
 284     }
 285
 286
 287 /*
 288 0 . T T. T T T T
 289 1 L . .L . . . .
 290 2 L . .L . . . .
 291 3 . T TL . . . .
 292 4 L . .L . . . .
 293 5 L . .. . . . .
 294 */
 295 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* Non-zero-count neighbours. A missing neighbour contributes 64,
          * except for non-intra MBs with CABAC, where it contributes 0. */
 296     if(top_type){
 297         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 298         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 299         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 300         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 301
 302         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 303         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 304
 305         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 306         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 307
 308     }else{
 309         h->non_zero_count_cache[4+8*0]=
 310         h->non_zero_count_cache[5+8*0]=
 311         h->non_zero_count_cache[6+8*0]=
 312         h->non_zero_count_cache[7+8*0]=
 313
 314         h->non_zero_count_cache[1+8*0]=
 315         h->non_zero_count_cache[2+8*0]=
 316
 317         h->non_zero_count_cache[1+8*3]=
 318         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 319
 320     }
 321
 322     for (i=0; i<2; i++) {
 323         if(left_type[i]){
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 326             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 328         }else{
 329             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 330             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 331             h->non_zero_count_cache[0+8*1 +   8*i]=
 332             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 333         }
 334     }
 335
         /* CABAC only: neighbouring coded-block-pattern values. A missing
          * neighbour yields 0x1C0 for intra MBs and 0 otherwise. */
 336     if( h->pps.cabac ) {
 337         // top_cbp
 338         if(top_type) {
 339             h->top_cbp = h->cbp_table[top_xy];
 340         } else if(IS_INTRA(mb_type)) {
 341             h->top_cbp = 0x1C0;
 342         } else {
 343             h->top_cbp = 0;
 344         }
 345         // left_cbp
 346         if (left_type[0]) {
 347             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 348         } else if(IS_INTRA(mb_type)) {
 349             h->left_cbp = 0x1C0;
 350         } else {
 351             h->left_cbp = 0;
 352         }
 353         if (left_type[0]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 355         }
 356         if (left_type[1]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 358         }
 359     }
 360
 361 #if 1
         /* Motion data of the neighbours (inter and direct MBs only). */
 362     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 363         int list;
 364         for(list=0; list<h->list_count; list++){
 365             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 366                 /*if(!h->mv_cache_clean[list]){
 367                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 368                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 369                     h->mv_cache_clean[list]= 1;
 370                 }*/
 371                 continue;
 372             }
 373             h->mv_cache_clean[list]= 0;
 374
                 /* Top neighbour: bottom row of its motion vectors plus its
                  * two lower reference indices. When it does not use this
                  * list, mvs are zeroed and refs become LIST_NOT_USED, or
                  * PART_NOT_AVAILABLE if the neighbour is missing. */
 375             if(USES_LIST(top_type, list)){
 376                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 377                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 382                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 383                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 384                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 385                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 386             }else{
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 391                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 392             }
 393
                 /* Left neighbours: right-most column of their motion data,
                  * at the rows selected by left_block[]. */
 394             for(i=0; i<2; i++){
 395                 int cache_idx = scan8[0] - 1 + i*2*8;
 396                 if(USES_LIST(left_type[i], list)){
 397                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 398                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 399                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 400                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 401                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 402                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 403                 }else{
 404                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 405                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 406                     h->ref_cache[list][cache_idx  ]=
 407                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 408                 }
 409             }
 410
                 /* Outside MBAFF, the remaining entries for this list are not
                  * loaded for deblocking or for direct MBs without spatial
                  * direct prediction. */
 411             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 412                 continue;
 413
 414             if(USES_LIST(topleft_type, list)){
 415                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 416                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 419             }else{
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 422             }
 423
 424             if(USES_LIST(topright_type, list)){
 425                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 426                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 427                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 429             }else{
 430                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 432             }
 433
                 /* Outside MBAFF, skip and direct MBs stop here for this list:
                  * the fixed PART_NOT_AVAILABLE entries and the CABAC
                  * mvd / direct caches below are not filled. */
 434             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 435                 continue;
 436
 437             h->ref_cache[list][scan8[5 ]+1] =
 438             h->ref_cache[list][scan8[7 ]+1] =
 439             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 440             h->ref_cache[list][scan8[4 ]] =
 441             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 442             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 443             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 445             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 446             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 447
 448             if( h->pps.cabac ) {
 449                 /* XXX beurk, Load mvd */
 450                 if(USES_LIST(top_type, list)){
 451                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 456                 }else{
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 461                 }
 462                 if(USES_LIST(left_type[0], list)){
 463                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 466                 }else{
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 469                 }
 470                 if(USES_LIST(left_type[1], list)){
 471                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 472                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 474                 }else{
 475                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 477                 }
 478                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 481                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 483
                     /* B slices: direct-mode flags of the neighbours (1 for a
                      * direct MB, direct_table[] values for 8x8 MBs, else 0). */
 484                 if(h->slice_type_nos == FF_B_TYPE){
 485                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 486
 487                     if(IS_DIRECT(top_type)){
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 489                     }else if(IS_8X8(top_type)){
 490                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 491                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 492                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 493                     }else{
 494                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 495                     }
 496
 497                     if(IS_DIRECT(left_type[0]))
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 499                     else if(IS_8X8(left_type[0]))
 500                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 501                     else
 502                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 503
 504                     if(IS_DIRECT(left_type[1]))
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 506                     else if(IS_8X8(left_type[1]))
 507                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 508                     else
 509                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 510                 }
 511             }
 512
                 /* MBAFF: rescale neighbour entries whose frame/field coding
                  * differs from the current MB. A field MB doubles the
                  * reference index and halves the vertical mv/mvd of a frame
                  * neighbour; a frame MB does the reverse for a field
                  * neighbour. Entries with a negative ref are left alone. */
 513             if(FRAME_MBAFF){
 514 #define MAP_MVS\
 515                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 516                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 521                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 522                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 524                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 525                 if(MB_FIELD){
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] <<= 1;\
 529                         h->mv_cache[list][idx][1] /= 2;\
 530                         h->mvd_cache[list][idx][1] /= 2;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }else{
 535 #define MAP_F2F(idx, mb_type)\
 536                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 537                         h->ref_cache[list][idx] >>= 1;\
 538                         h->mv_cache[list][idx][1] <<= 1;\
 539                         h->mvd_cache[list][idx][1] <<= 1;\
 540                     }
 541                     MAP_MVS
 542 #undef MAP_F2F
 543                 }
 544             }
 545         }
 546     }
 547 #endif
 548
         /* Number (0..2) of the top and first-left neighbours that use the
          * 8x8 transform. */
 549     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 550 }
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
 692 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
         /* Selects the diagonal neighbour used as candidate "C" by the motion
          * vector predictors.
          * @param C          receives a pointer to the chosen vector inside h->mv_cache
          * @param i          cache position of the partition (callers pass a scan8[] value)
          * @param list       reference list to read from
          * @param part_width partition width; added to the cache position to reach
          *                   the top-right entry
          * @return the reference index that belongs to *C: the top-right cache
          *         entry when it is available, otherwise the top-left one.
          *         In MBAFF frames SET_DIAG_MV may return early with a value
          *         taken from the current picture, or with LIST_NOT_USED. */
 693     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 694     MpegEncContext *s = &h->s;
 695
 696     /* there is no consistent mapping of mvs to neighboring locations that will
 697      * make mbaff happy, so we can't move all this logic to fill_caches */
 698     if(FRAME_MBAFF){
 699         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 700         const int16_t *mv;
             /* the cache slot at scan8[0]-2 is used as scratch storage: it is
              * zeroed here and filled by SET_DIAG_MV when a special case applies */
 701         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 702         *C = h->mv_cache[list][scan8[0]-2];
 703
 704         if(!MB_FIELD
 705            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 706             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 707             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV reads the vector and reference index at 4x4 block
                  * position (X4,Y4) of the current picture, applies MV_OP to the
                  * vertical component and REF_OP to the reference index, stores the
                  * vector in the scratch slot and RETURNS from fetch_diagonal_mv.
                  * It returns LIST_NOT_USED when that macroblock does not use 'list'. */
 708 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 709                 const int x4 = X4, y4 = Y4;\
 710                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 711                 if(!USES_LIST(mb_type,list))\
 712                     return LIST_NOT_USED;\
 713                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 714                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 715                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 716                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 717
 718                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 719             }
 720         }
             /* top-right missing: for blocks in cache column 4 whose left
              * neighbour entry is available, take the candidate from the left
              * macroblock when its field/frame coding differs from ours */
 721         if(topright_ref == PART_NOT_AVAILABLE
 722            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 723            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 724             if(!MB_FIELD
 725                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 726                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 727             }
 728             if(MB_FIELD
 729                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 730                && i >= scan8[0]+8){
 731                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 732                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 733             }
 734         }
 735 #undef SET_DIAG_MV
 736     }
 737
         /* regular case: top-right entry if available, else top-left entry */
 738     if(topright_ref != PART_NOT_AVAILABLE){
 739         *C= h->mv_cache[list][ i - 8 + part_width ];
 740         return topright_ref;
 741     }else{
 742         tprintf(s->avctx, "topright MV not available\n");
 743
 744         *C= h->mv_cache[list][ i - 8 - 1 ];
 745         return h->ref_cache[list][ i - 8 - 1 ];
 746     }
 747 }
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
 956 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
         /* Derives reference indices and motion vectors of a direct-predicted
          * macroblock and writes them to h->ref_cache / h->mv_cache.
          * Spatial prediction is used when h->direct_spatial_mv_pred is set,
          * temporal prediction otherwise; both consult the co-located data
          * stored in h->ref_list[1][0].
          * @param mb_type in/out: type of the current macroblock; rewritten to the
          *                partitioning chosen here. When it is an 8x8 type on entry,
          *                only sub-blocks whose h->sub_mb_type is direct are filled. */
 957     MpegEncContext * const s = &h->s;
 958     const int mb_xy =   h->mb_xy;
 959     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 960     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* co-located macroblock type, motion vectors (per 4x4 block) and
          * reference indices (per 8x8 block) of the first list 1 picture */
 961     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 962     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 963     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 964     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 965     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 966     const int is_b8x8 = IS_8X8(*mb_type);
 967     unsigned int sub_mb_type;
 968     int i8, i4;
 969
         /* choose macroblock / sub-macroblock partitioning from the co-located type */
 970 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 971     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 972         /* FIXME save sub mb types from previous frames (or derive from MVs)
 973          * so we know exactly what block size to use */
 974         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 975         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 976     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 977         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 978         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 979     }else{
 980         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 981         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 982     }
 983     if(!is_b8x8)
 984         *mb_type |= MB_TYPE_DIRECT2;
 985     if(MB_FIELD)
 986         *mb_type |= MB_TYPE_INTERLACED;
 987
 988     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 989
 990     if(h->direct_spatial_mv_pred){
 991         int ref[2];
 992         int mv[2][2];
 993         int list;
 994
 995         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 996
 997         /* ref = min(neighbors) */
             /* The unsigned casts make negative cache values compare as large,
              * so the minimum prefers non-negative references; a result that is
              * still negative is normalised to -1. A top-right value of -2 is
              * replaced by the top-left entry. */
 998         for(list=0; list<2; list++){
 999             int refa = h->ref_cache[list][scan8[0] - 1];
1000             int refb = h->ref_cache[list][scan8[0] - 8];
1001             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1002             if(refc == -2)
1003                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1004             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1005             if(ref[list] < 0)
1006                 ref[list] = -1;
1007         }
1008
             /* no usable reference in either list: reference 0 with zero vectors
              * in both lists; otherwise predict a 16x16 vector per used list */
1009         if(ref[0] < 0 && ref[1] < 0){
1010             ref[0] = ref[1] = 0;
1011             mv[0][0] = mv[0][1] =
1012             mv[1][0] = mv[1][1] = 0;
1013         }else{
1014             for(list=0; list<2; list++){
1015                 if(ref[list] >= 0)
1016                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1017                 else
1018                     mv[list][0] = mv[list][1] = 0;
1019             }
1020         }
1021
             /* clear the list flag of a list that has no reference */
1022         if(ref[1] < 0){
1023             if(!is_b8x8)
1024                 *mb_type &= ~MB_TYPE_L1;
1025             sub_mb_type &= ~MB_TYPE_L1;
1026         }else if(ref[0] < 0){
1027             if(!is_b8x8)
1028                 *mb_type &= ~MB_TYPE_L0;
1029             sub_mb_type &= ~MB_TYPE_L0;
1030         }
1031
             /* current and co-located macroblock differ in field/frame coding:
              * the co-located pointers and strides are re-based on the macroblock
              * pair (pair_xy addresses the row with the low bit of mb_y cleared) */
1032         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1033             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1034             int mb_types_col[2];
1035             int b8_stride = h->b8_stride;
1036             int b4_stride = h->b_stride;
1037
1038             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1039
1040             if(IS_INTERLACED(*mb_type)){
1041                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1042                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1043                 if(s->mb_y&1){
1044                     l1ref0 -= 2*b8_stride;
1045                     l1ref1 -= 2*b8_stride;
1046                     l1mv0 -= 4*b4_stride;
1047                     l1mv1 -= 4*b4_stride;
1048                 }
1049                 b8_stride *= 3;
1050                 b4_stride *= 6;
1051             }else{
1052                 int cur_poc = s->current_picture_ptr->poc;
1053                 int *col_poc = h->ref_list[1]->field_poc;
                     /* col_parity is 1 when field_poc[1] is at least as close to the
                      * current POC as field_poc[0] */
1054                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1055                 int dy = 2*col_parity - (s->mb_y&1);
1056                 mb_types_col[0] =
1057                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1058                 l1ref0 += dy*b8_stride;
1059                 l1ref1 += dy*b8_stride;
1060                 l1mv0 += 2*dy*b4_stride;
1061                 l1mv1 += 2*dy*b4_stride;
1062                 b8_stride = 0;
1063             }
1064
1065             for(i8=0; i8<4; i8++){
1066                 int x8 = i8&1;
1067                 int y8 = i8>>1;
1068                 int xy8 = x8+y8*b8_stride;
1069                 int xy4 = 3*x8+y8*b4_stride;
1070                 int a=0, b=0;
1071
1072                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1073                     continue;
1074                 h->sub_mb_type[i8] = sub_mb_type;
1075
1076                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1077                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* co-located block is inter, uses reference 0 (list 0, or list 1
                      * when list 0 is unused) and its vector is within +-1 in both
                      * components: the predicted vector is kept only for lists whose
                      * reference is > 0, the others stay zero */
1078                 if(!IS_INTRA(mb_types_col[y8])
1079                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1080                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1081                     if(ref[0] > 0)
1082                         a= pack16to32(mv[0][0],mv[0][1]);
1083                     if(ref[1] > 0)
1084                         b= pack16to32(mv[1][0],mv[1][1]);
1085                 }else{
1086                     a= pack16to32(mv[0][0],mv[0][1]);
1087                     b= pack16to32(mv[1][0],mv[1][1]);
1088                 }
1089                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1090                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1091             }
1092         }else if(IS_16X16(*mb_type)){
                 /* whole macroblock handled at once; same zeroing rule as above.
                  * The list 1 variant of the test is additionally gated on
                  * h->x264_build (unset or > 33). */
1093             int a=0, b=0;
1094
1095             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1096             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1097             if(!IS_INTRA(mb_type_col)
1098                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1099                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1100                        && (h->x264_build>33 || !h->x264_build)))){
1101                 if(ref[0] > 0)
1102                     a= pack16to32(mv[0][0],mv[0][1]);
1103                 if(ref[1] > 0)
1104                     b= pack16to32(mv[1][0],mv[1][1]);
1105             }else{
1106                 a= pack16to32(mv[0][0],mv[0][1]);
1107                 b= pack16to32(mv[1][0],mv[1][1]);
1108             }
1109             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1110             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1111         }else{
                 /* per 8x8 block: fill with the predicted vectors first, then zero
                  * the vectors of lists with reference 0 wherever the co-located
                  * vector is within +-1 (per 8x8 block or per 4x4 block, depending
                  * on sub_mb_type) */
1112             for(i8=0; i8<4; i8++){
1113                 const int x8 = i8&1;
1114                 const int y8 = i8>>1;
1115
1116                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1117                     continue;
1118                 h->sub_mb_type[i8] = sub_mb_type;
1119
1120                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1121                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1122                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1123                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1124
1125                 /* col_zero_flag */
1126                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1127                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1128                                                   && (h->x264_build>33 || !h->x264_build)))){
1129                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1130                     if(IS_SUB_8X8(sub_mb_type)){
1131                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1132                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1133                             if(ref[0] == 0)
1134                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1135                             if(ref[1] == 0)
1136                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1137                         }
1138                     }else
1139                     for(i4=0; i4<4; i4++){
1140                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1141                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1142                             if(ref[0] == 0)
1143                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1144                             if(ref[1] == 0)
1145                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1146                         }
1147                     }
1148                 }
1149             }
1150         }
1151     }else{ /* direct temporal mv pred */
             /* list 1 reference is always 0; the list 0 reference is the co-located
              * reference translated through map_col_to_list0, and the vectors are
              * the co-located vector scaled by dist_scale_factor */
1152         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1153         const int *dist_scale_factor = h->dist_scale_factor;
1154
1155         if(FRAME_MBAFF){
                 /* interlaced macroblocks use the field variants of both tables */
1156             if(IS_INTERLACED(*mb_type)){
1157                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1158                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1159                 dist_scale_factor = h->dist_scale_factor_field;
1160             }
1161             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1162                 /* FIXME assumes direct_8x8_inference == 1 */
1163                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1164                 int mb_types_col[2];
                     /* y_shift scales row offsets and vertical vectors below:
                      * 0 for frame to field, 2 for field to frame */
1165                 int y_shift;
1166
1167                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1168                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1169                          | (*mb_type & MB_TYPE_INTERLACED);
1170                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1171
1172                 if(IS_INTERLACED(*mb_type)){
1173                     /* frame to field scaling */
1174                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1175                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1176                     if(s->mb_y&1){
1177                         l1ref0 -= 2*h->b8_stride;
1178                         l1ref1 -= 2*h->b8_stride;
1179                         l1mv0 -= 4*h->b_stride;
1180                         l1mv1 -= 4*h->b_stride;
1181                     }
1182                     y_shift = 0;
1183
1184                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1185                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1186                        && !is_b8x8)
1187                         *mb_type |= MB_TYPE_16x8;
1188                     else
1189                         *mb_type |= MB_TYPE_8x8;
1190                 }else{
1191                     /* field to frame scaling */
1192                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1193                      * but in MBAFF, top and bottom POC are equal */
1194                     int dy = (s->mb_y&1) ? 1 : 2;
1195                     mb_types_col[0] =
1196                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1197                     l1ref0 += dy*h->b8_stride;
1198                     l1ref1 += dy*h->b8_stride;
1199                     l1mv0 += 2*dy*h->b_stride;
1200                     l1mv1 += 2*dy*h->b_stride;
1201                     y_shift = 2;
1202
1203                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1204                        && !is_b8x8)
1205                         *mb_type |= MB_TYPE_16x16;
1206                     else
1207                         *mb_type |= MB_TYPE_8x8;
1208                 }
1209
1210                 for(i8=0; i8<4; i8++){
1211                     const int x8 = i8&1;
1212                     const int y8 = i8>>1;
1213                     int ref0, scale;
1214                     const int16_t (*l1mv)[2]= l1mv0;
1215
1216                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1217                         continue;
1218                     h->sub_mb_type[i8] = sub_mb_type;
1219
1220                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located macroblock: reference 0, zero vectors */
1221                     if(IS_INTRA(mb_types_col[y8])){
1222                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1223                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1224                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                         continue;
1226                     }
1227
                         /* use the co-located list 0 reference when present, else the
                          * list 1 reference together with the list 1 vectors */
1228                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1229                     if(ref0 >= 0)
1230                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1231                     else{
1232                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1233                         l1mv= l1mv1;
1234                     }
1235                     scale = dist_scale_factor[ref0];
1236                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1237
1238                     {
1239                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component is halved (y_shift 0) or doubled
                              * (y_shift 2) before scaling */
1240                         int my_col = (mv_col[1]<<y_shift)/2;
1241                         int mx = (scale * mv_col[0] + 128) >> 8;
1242                         int my = (scale * my_col + 128) >> 8;
1243                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1244                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1245                     }
1246                 }
1247                 return;
1248             }
1249         }
1250
1251         /* one-to-one mv scaling */
             /* list 0 vector = (scale * mv_col + 128) >> 8,
              * list 1 vector = list 0 vector - mv_col */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col)){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1261                                                 : map_col_to_list0[1][l1ref1[0]];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col)){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one vector per 8x8 block, or one per 4x4 block otherwise */
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * idct tranforms the 16 dc values and dequantize them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values (disabled code, not compiled).
 * Uses the same dc positions as h264_luma_dc_dequant_idct_c() and halves
 * every result.
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        temp[4*i+0]= sum04 +sum15;
        temp[4*i+1]= diff04+diff15;
        temp[4*i+2]= diff04-diff15;
        temp[4*i+3]= sum04 -sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * 2x2 transform of the four chroma dc values (disabled code, not compiled).
 * Same data layout as chroma_dc_dequant_idct_c(), without any scaling.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    const int v00= block[stride*0 + xStride*0];
    const int v01= block[stride*0 + xStride*1];
    const int v10= block[stride*1 + xStride*0];
    const int v11= block[stride*1 + xStride*1];
    const int top_sum = v00 + v01;
    const int top_diff= v00 - v01;
    const int bot_sum = v10 + v11;
    const int bot_diff= v10 - v11;

    block[stride*0 + xStride*0]= (top_sum  + bot_sum );
    block[stride*0 + xStride*1]= (top_diff + bot_diff);
    block[stride*1 + xStride*0]= (top_sum  - bot_sum );
    block[stride*1 + xStride*1]= (top_diff - bot_diff);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593     int i;
1594     const int * const quant_table= quant_coeff[qscale];
1595     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597     const unsigned int threshold2= (threshold1<<1);
1598     int last_non_zero;
1599
1600     if(separate_dc){
1601         if(qscale<=18){
1602             //avoid overflows
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_coeff[qscale+18][0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }else{
1621             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623             const unsigned int dc_threshold2= (dc_threshold1<<1);
1624
1625             int level= block[0]*quant_table[0];
1626             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627                 if(level>0){
1628                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629                     block[0]= level;
1630                 }else{
1631                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632                     block[0]= -level;
1633                 }
1634 //                last_non_zero = i;
1635             }else{
1636                 block[0]=0;
1637             }
1638         }
1639         last_non_zero= 0;
1640         i=1;
1641     }else{
1642         last_non_zero= -1;
1643         i=0;
1644     }
1645
1646     for(; i<16; i++){
1647         const int j= scantable[i];
1648         int level= block[j]*quant_table[j];
1649
1650 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1651 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652         if(((unsigned)(level+threshold1))>threshold2){
1653             if(level>0){
1654                 level= (bias + level)>>QUANT_SHIFT;
1655                 block[j]= level;
1656             }else{
1657                 level= (bias - level)>>QUANT_SHIFT;
1658                 block[j]= -level;
1659             }
1660             last_non_zero = i;
1661         }else{
1662             block[j]=0;
1663         }
1664     }
1665
1666     return last_non_zero;
1667 }
1668
1669 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1670                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1671                            int src_x_offset, int src_y_offset,
1672                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1673     MpegEncContext * const s = &h->s;
1674     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1675     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1676     const int luma_xy= (mx&3) + ((my&3)<<2);
1677     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1678     uint8_t * src_cb, * src_cr;
1679     int extra_width= h->emu_edge_width;
1680     int extra_height= h->emu_edge_height;
1681     int emu=0;
1682     const int full_mx= mx>>2;
1683     const int full_my= my>>2;
1684     const int pic_width  = 16*s->mb_width;
1685     const int pic_height = 16*s->mb_height >> MB_FIELD;
1686
1687     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1688         return;
1689
1690     if(mx&7) extra_width -= 3;
1691     if(my&7) extra_height -= 3;
1692
1693     if(   full_mx < 0-extra_width
1694        || full_my < 0-extra_height
1695        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1696        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1697         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1698             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1699         emu=1;
1700     }
1701
1702     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1703     if(!square){
1704         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1705     }
1706
1707     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1708
1709     if(MB_FIELD){
1710         // chroma offset when predicting from a field of opposite parity
1711         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1712         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1713     }
1714     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1715     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1716
1717     if(emu){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1719             src_cb= s->edge_emu_buffer;
1720     }
1721     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1722
1723     if(emu){
1724         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1725             src_cr= s->edge_emu_buffer;
1726     }
1727     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1728 }
1729
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732                            int x_offset, int y_offset,
1733                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735                            int list0, int list1){
1736     MpegEncContext * const s = &h->s;
1737     qpel_mc_func *qpix_op=  qpix_put;
1738     h264_chroma_mc_func chroma_op= chroma_put;
1739
1740     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1741     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1742     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     x_offset += 8*s->mb_x;
1744     y_offset += 8*(s->mb_y >> MB_FIELD);
1745
1746     if(list0){
1747         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750                            qpix_op, chroma_op);
1751
1752         qpix_op=  qpix_avg;
1753         chroma_op= chroma_avg;
1754     }
1755
1756     if(list1){
1757         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760                            qpix_op, chroma_op);
1761     }
1762 }
1763
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766                            int x_offset, int y_offset,
1767                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770                            int list0, int list1){
1771     MpegEncContext * const s = &h->s;
1772
1773     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1774     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1775     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1776     x_offset += 8*s->mb_x;
1777     y_offset += 8*(s->mb_y >> MB_FIELD);
1778
1779     if(list0 && list1){
1780         /* don't optimize for luma-only case, since B-frames usually
1781          * use implicit weights => chroma too. */
1782         uint8_t *tmp_cb = s->obmc_scratchpad;
1783         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785         int refn0 = h->ref_cache[0][ scan8[n] ];
1786         int refn1 = h->ref_cache[1][ scan8[n] ];
1787
1788         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789                     dest_y, dest_cb, dest_cr,
1790                     x_offset, y_offset, qpix_put, chroma_put);
1791         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792                     tmp_y, tmp_cb, tmp_cr,
1793                     x_offset, y_offset, qpix_put, chroma_put);
1794
1795         if(h->use_weight == 2){
1796             int weight0 = h->implicit_weight[refn0][refn1];
1797             int weight1 = 64 - weight0;
1798             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1799             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801         }else{
1802             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1811         }
1812     }else{
1813         int list = list1 ? 1 : 0;
1814         int refn = h->ref_cache[list][ scan8[n] ];
1815         Picture *ref= &h->ref_list[list][refn];
1816         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818                     qpix_put, chroma_put);
1819
1820         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822         if(h->use_weight_chroma){
1823             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1827         }
1828     }
1829 }
1830
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833                            int x_offset, int y_offset,
1834                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837                            int list0, int list1){
1838     if((h->use_weight==2 && list0 && list1
1839         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840        || h->use_weight==1)
1841         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842                          x_offset, y_offset, qpix_put, chroma_put,
1843                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844     else
1845         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1847 }
1848
1849 static inline void prefetch_motion(H264Context *h, int list){
1850     /* fetch pixels for estimated mv 4 macroblocks ahead
1851      * optimized for 64byte cache lines */
1852     MpegEncContext * const s = &h->s;
1853     const int refn = h->ref_cache[list][scan8[0]];
1854     if(refn >= 0){
1855         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857         uint8_t **src= h->ref_list[list][refn].data;
1858         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1862     }
1863 }
1864
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869     MpegEncContext * const s = &h->s;
1870     const int mb_xy= h->mb_xy;
1871     const int mb_type= s->current_picture.mb_type[mb_xy];
1872
1873     assert(IS_INTER(mb_type));
1874
1875     prefetch_motion(h, 0);
1876
1877     if(IS_16X16(mb_type)){
1878         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880                 &weight_op[0], &weight_avg[0],
1881                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882     }else if(IS_16X8(mb_type)){
1883         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885                 &weight_op[1], &weight_avg[1],
1886                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889                 &weight_op[1], &weight_avg[1],
1890                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891     }else if(IS_8X16(mb_type)){
1892         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894                 &weight_op[2], &weight_avg[2],
1895                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898                 &weight_op[2], &weight_avg[2],
1899                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900     }else{
1901         int i;
1902
1903         assert(IS_8X8(mb_type));
1904
1905         for(i=0; i<4; i++){
1906             const int sub_mb_type= h->sub_mb_type[i];
1907             const int n= 4*i;
1908             int x_offset= (i&1)<<2;
1909             int y_offset= (i&2)<<1;
1910
1911             if(IS_SUB_8X8(sub_mb_type)){
1912                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914                     &weight_op[3], &weight_avg[3],
1915                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916             }else if(IS_SUB_8X4(sub_mb_type)){
1917                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919                     &weight_op[4], &weight_avg[4],
1920                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923                     &weight_op[4], &weight_avg[4],
1924                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925             }else if(IS_SUB_4X8(sub_mb_type)){
1926                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928                     &weight_op[5], &weight_avg[5],
1929                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932                     &weight_op[5], &weight_avg[5],
1933                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934             }else{
1935                 int j;
1936                 assert(IS_SUB_4X4(sub_mb_type));
1937                 for(j=0; j<4; j++){
1938                     int sub_x_offset= x_offset + 2*(j&1);
1939                     int sub_y_offset= y_offset +   (j&2);
1940                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942                         &weight_op[6], &weight_avg[6],
1943                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1944                 }
1945             }
1946         }
1947     }
1948
1949     prefetch_motion(h, 1);
1950 }
1951
1952 static av_cold void decode_init_vlc(void){
1953     static int done = 0;
1954
1955     if (!done) {
1956         int i;
1957         done = 1;
1958
1959         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1960                  &chroma_dc_coeff_token_len [0], 1, 1,
1961                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1962
1963         for(i=0; i<4; i++){
1964             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1965                      &coeff_token_len [i][0], 1, 1,
1966                      &coeff_token_bits[i][0], 1, 1, 1);
1967         }
1968
1969         for(i=0; i<3; i++){
1970             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1971                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1972                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1973         }
1974         for(i=0; i<15; i++){
1975             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1976                      &total_zeros_len [i][0], 1, 1,
1977                      &total_zeros_bits[i][0], 1, 1, 1);
1978         }
1979
1980         for(i=0; i<6; i++){
1981             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1982                      &run_len [i][0], 1, 1,
1983                      &run_bits[i][0], 1, 1, 1);
1984         }
1985         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1986                  &run_len [6][0], 1, 1,
1987                  &run_bits[6][0], 1, 1, 1);
1988     }
1989 }
1990
1991 static void free_tables(H264Context *h){
1992     int i;
1993     H264Context *hx;
1994     av_freep(&h->intra4x4_pred_mode);
1995     av_freep(&h->chroma_pred_mode_table);
1996     av_freep(&h->cbp_table);
1997     av_freep(&h->mvd_table[0]);
1998     av_freep(&h->mvd_table[1]);
1999     av_freep(&h->direct_table);
2000     av_freep(&h->non_zero_count);
2001     av_freep(&h->slice_table_base);
2002     h->slice_table= NULL;
2003
2004     av_freep(&h->mb2b_xy);
2005     av_freep(&h->mb2b8_xy);
2006
2007     for(i = 0; i < MAX_SPS_COUNT; i++)
2008         av_freep(h->sps_buffers + i);
2009
2010     for(i = 0; i < MAX_PPS_COUNT; i++)
2011         av_freep(h->pps_buffers + i);
2012
2013     for(i = 0; i < h->s.avctx->thread_count; i++) {
2014         hx = h->thread_context[i];
2015         if(!hx) continue;
2016         av_freep(&hx->top_borders[1]);
2017         av_freep(&hx->top_borders[0]);
2018         av_freep(&hx->s.obmc_scratchpad);
2019     }
2020 }
2021
2022 static void init_dequant8_coeff_table(H264Context *h){
2023     int i,q,x;
2024     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2025     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2026     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2027
2028     for(i=0; i<2; i++ ){
2029         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2030             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2031             break;
2032         }
2033
2034         for(q=0; q<52; q++){
2035             int shift = ff_div6[q];
2036             int idx = ff_rem6[q];
2037             for(x=0; x<64; x++)
2038                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2039                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2040                     h->pps.scaling_matrix8[i][x]) << shift;
2041         }
2042     }
2043 }
2044
2045 static void init_dequant4_coeff_table(H264Context *h){
2046     int i,j,q,x;
2047     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2048     for(i=0; i<6; i++ ){
2049         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2050         for(j=0; j<i; j++){
2051             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2052                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2053                 break;
2054             }
2055         }
2056         if(j<i)
2057             continue;
2058
2059         for(q=0; q<52; q++){
2060             int shift = ff_div6[q] + 2;
2061             int idx = ff_rem6[q];
2062             for(x=0; x<16; x++)
2063                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2064                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2065                     h->pps.scaling_matrix4[i][x]) << shift;
2066         }
2067     }
2068 }
2069
2070 static void init_dequant_tables(H264Context *h){
2071     int i,x;
2072     init_dequant4_coeff_table(h);
2073     if(h->pps.transform_8x8_mode)
2074         init_dequant8_coeff_table(h);
2075     if(h->sps.transform_bypass){
2076         for(i=0; i<6; i++)
2077             for(x=0; x<16; x++)
2078                 h->dequant4_coeff[i][0][x] = 1<<6;
2079         if(h->pps.transform_8x8_mode)
2080             for(i=0; i<2; i++)
2081                 for(x=0; x<64; x++)
2082                     h->dequant8_coeff[i][0][x] = 1<<6;
2083     }
2084 }
2085
2086
2087 /**
2088  * allocates tables.
2089  * needs width/height
2090  */
2091 static int alloc_tables(H264Context *h){
2092     MpegEncContext * const s = &h->s;
2093     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2094     int x,y;
2095
2096     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2097
2098     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2099     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2101
2102     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2104     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2106
2107     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2108     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2109
2110     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2111     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2112     for(y=0; y<s->mb_height; y++){
2113         for(x=0; x<s->mb_width; x++){
2114             const int mb_xy= x + y*s->mb_stride;
2115             const int b_xy = 4*x + 4*y*h->b_stride;
2116             const int b8_xy= 2*x + 2*y*h->b8_stride;
2117
2118             h->mb2b_xy [mb_xy]= b_xy;
2119             h->mb2b8_xy[mb_xy]= b8_xy;
2120         }
2121     }
2122
2123     s->obmc_scratchpad = NULL;
2124
2125     if(!h->dequant4_coeff[0])
2126         init_dequant_tables(h);
2127
2128     return 0;
2129 fail:
2130     free_tables(h);
2131     return -1;
2132 }
2133
2134 /**
2135  * Mimic alloc_tables(), but for every context thread.
2136  */
2137 static void clone_tables(H264Context *dst, H264Context *src){
2138     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2139     dst->non_zero_count           = src->non_zero_count;
2140     dst->slice_table              = src->slice_table;
2141     dst->cbp_table                = src->cbp_table;
2142     dst->mb2b_xy                  = src->mb2b_xy;
2143     dst->mb2b8_xy                 = src->mb2b8_xy;
2144     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2145     dst->mvd_table[0]             = src->mvd_table[0];
2146     dst->mvd_table[1]             = src->mvd_table[1];
2147     dst->direct_table             = src->direct_table;
2148
2149     dst->s.obmc_scratchpad = NULL;
2150     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2151 }
2152
2153 /**
2154  * Init context
2155  * Allocate buffers which are not shared amongst multiple threads.
2156  */
2157 static int context_init(H264Context *h){
2158     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2159     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160
2161     return 0;
2162 fail:
2163     return -1; // free_tables will clean up for us
2164 }
2165
2166 static av_cold void common_init(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168
2169     s->width = s->avctx->width;
2170     s->height = s->avctx->height;
2171     s->codec_id= s->avctx->codec->id;
2172
2173     ff_h264_pred_init(&h->hpc, s->codec_id);
2174
2175     h->dequant_coeff_pps= -1;
2176     s->unrestricted_mv=1;
2177     s->decode=1; //FIXME
2178
2179     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2180     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2181 }
2182
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184     H264Context *h= avctx->priv_data;
2185     MpegEncContext * const s = &h->s;
2186
2187     MPV_decode_defaults(s);
2188
2189     s->avctx = avctx;
2190     common_init(h);
2191
2192     s->out_format = FMT_H264;
2193     s->workaround_bugs= avctx->workaround_bugs;
2194
2195     // set defaults
2196 //    s->decode_mb= ff_h263_decode_mb;
2197     s->quarter_sample = 1;
2198     s->low_delay= 1;
2199
2200     if(avctx->codec_id == CODEC_ID_SVQ3)
2201         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2202     else
2203         avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258
2259     // We mark the current picture as non reference after allocating it, so
2260     // that if we break out due to an error it can be released automatically
2261     // in the next MPV_frame_start().
2262     // SVQ3 as well as most other codecs have only last/next/current and thus
2263     // get released even with set reference, besides SVQ3 and others do not
2264     // mark frames as reference later "naturally".
2265     if(s->codec_id != CODEC_ID_SVQ3)
2266         s->current_picture_ptr->reference= 0;
2267     return 0;
2268 }
2269
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273
2274     src_y  -=   linesize;
2275     src_cb -= uvlinesize;
2276     src_cr -= uvlinesize;
2277
2278     // There are two lines saved, the line above the the top macroblock of a pair,
2279     // and the line above the bottom macroblock
2280     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2281     for(i=1; i<17; i++){
2282         h->left_border[i]= src_y[15+i*  linesize];
2283     }
2284
2285     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2286     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2287
2288     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2289         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2290         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2291         for(i=1; i<9; i++){
2292             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2293             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2294         }
2295         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2296         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2297     }
2298 }
2299
2300 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2301     MpegEncContext * const s = &h->s;
2302     int temp8, i;
2303     uint64_t temp64;
2304     int deblock_left;
2305     int deblock_top;
2306     int mb_xy;
2307
2308     if(h->deblocking_filter == 2) {
2309         mb_xy = h->mb_xy;
2310         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2311         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2312     } else {
2313         deblock_left = (s->mb_x > 0);
2314         deblock_top =  (s->mb_y > 0);
2315     }
2316
2317     src_y  -=   linesize + 1;
2318     src_cb -= uvlinesize + 1;
2319     src_cr -= uvlinesize + 1;
2320
2321 #define XCHG(a,b,t,xchg)\
2322 t= a;\
2323 if(xchg)\
2324     a= b;\
2325 b= t;
2326
2327     if(deblock_left){
2328         for(i = !deblock_top; i<17; i++){
2329             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2330         }
2331     }
2332
2333     if(deblock_top){
2334         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2335         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2336         if(s->mb_x+1 < s->mb_width){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2338         }
2339     }
2340
2341     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2342         if(deblock_left){
2343             for(i = !deblock_top; i<9; i++){
2344                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2345                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2346             }
2347         }
2348         if(deblock_top){
2349             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2351         }
2352     }
2353 }
2354
2355 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2356     MpegEncContext * const s = &h->s;
2357     int i;
2358
2359     src_y  -= 2 *   linesize;
2360     src_cb -= 2 * uvlinesize;
2361     src_cr -= 2 * uvlinesize;
2362
2363     // There are two lines saved, the line above the the top macroblock of a pair,
2364     // and the line above the bottom macroblock
2365     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2366     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2367     for(i=2; i<34; i++){
2368         h->left_border[i]= src_y[15+i*  linesize];
2369     }
2370
2371     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2372     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2373     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2374     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2375
2376     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2377         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2378         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2379         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2380         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2381         for(i=2; i<18; i++){
2382             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2383             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2384         }
2385         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2386         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2387         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2388         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2389     }
2390 }
2391
2392 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2393     MpegEncContext * const s = &h->s;
2394     int temp8, i;
2395     uint64_t temp64;
2396     int deblock_left = (s->mb_x > 0);
2397     int deblock_top  = (s->mb_y > 1);
2398
2399     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2400
2401     src_y  -= 2 *   linesize + 1;
2402     src_cb -= 2 * uvlinesize + 1;
2403     src_cr -= 2 * uvlinesize + 1;
2404
2405 #define XCHG(a,b,t,xchg)\
2406 t= a;\
2407 if(xchg)\
2408     a= b;\
2409 b= t;
2410
2411     if(deblock_left){
2412         for(i = (!deblock_top)<<1; i<34; i++){
2413             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2414         }
2415     }
2416
2417     if(deblock_top){
2418         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2419         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2420         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2421         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2422         if(s->mb_x+1 < s->mb_width){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2425         }
2426     }
2427
2428     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2429         if(deblock_left){
2430             for(i = (!deblock_top) << 1; i<18; i++){
2431                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2432                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2433             }
2434         }
2435         if(deblock_top){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2439             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2440         }
2441     }
2442 }
2443
2444 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2445     MpegEncContext * const s = &h->s;
2446     const int mb_x= s->mb_x;
2447     const int mb_y= s->mb_y;
2448     const int mb_xy= h->mb_xy;
2449     const int mb_type= s->current_picture.mb_type[mb_xy];
2450     uint8_t  *dest_y, *dest_cb, *dest_cr;
2451     int linesize, uvlinesize /*dct_offset*/;
2452     int i;
2453     int *block_offset = &h->block_offset[0];
2454     const unsigned int bottom = mb_y & 1;
2455     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2456     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2457     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2458
2459     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2460     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2461     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2462
2463     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2464     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2465
2466     if (!simple && MB_FIELD) {
2467         linesize   = h->mb_linesize   = s->linesize * 2;
2468         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2469         block_offset = &h->block_offset[24];
2470         if(mb_y&1){ //FIXME move out of this func?
2471             dest_y -= s->linesize*15;
2472             dest_cb-= s->uvlinesize*7;
2473             dest_cr-= s->uvlinesize*7;
2474         }
2475         if(FRAME_MBAFF) {
2476             int list;
2477             for(list=0; list<h->list_count; list++){
2478                 if(!USES_LIST(mb_type, list))
2479                     continue;
2480                 if(IS_16X16(mb_type)){
2481                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2482                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2483                 }else{
2484                     for(i=0; i<16; i+=4){
2485                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2486                         int ref = h->ref_cache[list][scan8[i]];
2487                         if(ref >= 0)
2488                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2489                     }
2490                 }
2491             }
2492         }
2493     } else {
2494         linesize   = h->mb_linesize   = s->linesize;
2495         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2496 //        dct_offset = s->linesize * 16;
2497     }
2498
2499     if(transform_bypass){
2500         idct_dc_add =
2501         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2502     }else if(IS_8x8DCT(mb_type)){
2503         idct_dc_add = s->dsp.h264_idct8_dc_add;
2504         idct_add = s->dsp.h264_idct8_add;
2505     }else{
2506         idct_dc_add = s->dsp.h264_idct_dc_add;
2507         idct_add = s->dsp.h264_idct_add;
2508     }
2509
2510     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2511        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2512         int mbt_y = mb_y&~1;
2513         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2514         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2515         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2516         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2517     }
2518
2519     if (!simple && IS_INTRA_PCM(mb_type)) {
2520         unsigned int x, y;
2521
2522         // The pixels are stored in h->mb array in the same order as levels,
2523         // copy them in output in the correct order.
2524         for(i=0; i<16; i++) {
2525             for (y=0; y<4; y++) {
2526                 for (x=0; x<4; x++) {
2527                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2528                 }
2529             }
2530         }
2531         for(i=16; i<16+4; i++) {
2532             for (y=0; y<4; y++) {
2533                 for (x=0; x<4; x++) {
2534                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2535                 }
2536             }
2537         }
2538         for(i=20; i<20+4; i++) {
2539             for (y=0; y<4; y++) {
2540                 for (x=0; x<4; x++) {
2541                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2542                 }
2543             }
2544         }
2545     } else {
2546         if(IS_INTRA(mb_type)){
2547             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2548                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2549
2550             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2551                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2552                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2553             }
2554
2555             if(IS_INTRA4x4(mb_type)){
2556                 if(simple || !s->encoding){
2557                     if(IS_8x8DCT(mb_type)){
2558                         for(i=0; i<16; i+=4){
2559                             uint8_t * const ptr= dest_y + block_offset[i];
2560                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2561                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2562                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2563                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2564                             if(nnz){
2565                                 if(nnz == 1 && h->mb[i*16])
2566                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2567                                 else
2568                                     idct_add(ptr, h->mb + i*16, linesize);
2569                             }
2570                         }
2571                     }else
2572                     for(i=0; i<16; i++){
2573                         uint8_t * const ptr= dest_y + block_offset[i];
2574                         uint8_t *topright;
2575                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2576                         int nnz, tr;
2577
2578                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2579                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2580                             assert(mb_y || linesize <= block_offset[i]);
2581                             if(!topright_avail){
2582                                 tr= ptr[3 - linesize]*0x01010101;
2583                                 topright= (uint8_t*) &tr;
2584                             }else
2585                                 topright= ptr + 4 - linesize;
2586                         }else
2587                             topright= NULL;
2588
2589                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2590                         nnz = h->non_zero_count_cache[ scan8[i] ];
2591                         if(nnz){
2592                             if(is_h264){
2593                                 if(nnz == 1 && h->mb[i*16])
2594                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2595                                 else
2596                                     idct_add(ptr, h->mb + i*16, linesize);
2597                             }else
2598                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2599                         }
2600                     }
2601                 }
2602             }else{
2603                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2604                 if(is_h264){
2605                     if(!transform_bypass)
2606                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2607                 }else
2608                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2609             }
2610             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2611                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2612         }else if(is_h264){
2613             hl_motion(h, dest_y, dest_cb, dest_cr,
2614                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2615                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2616                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2617         }
2618
2619
2620         if(!IS_INTRA4x4(mb_type)){
2621             if(is_h264){
2622                 if(IS_INTRA16x16(mb_type)){
2623                     for(i=0; i<16; i++){
2624                         if(h->non_zero_count_cache[ scan8[i] ])
2625                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2626                         else if(h->mb[i*16])
2627                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2628                     }
2629                 }else{
2630                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2631                     for(i=0; i<16; i+=di){
2632                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2633                         if(nnz){
2634                             if(nnz==1 && h->mb[i*16])
2635                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2636                             else
2637                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2638                         }
2639                     }
2640                 }
2641             }else{
2642                 for(i=0; i<16; i++){
2643                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2644                         uint8_t * const ptr= dest_y + block_offset[i];
2645                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2646                     }
2647                 }
2648             }
2649         }
2650
2651         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2652             uint8_t *dest[2] = {dest_cb, dest_cr};
2653             if(transform_bypass){
2654                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2655             }else{
2656                 idct_add = s->dsp.h264_idct_add;
2657                 idct_dc_add = s->dsp.h264_idct_dc_add;
2658                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2659                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2660             }
2661             if(is_h264){
2662                 for(i=16; i<16+8; i++){
2663                     if(h->non_zero_count_cache[ scan8[i] ])
2664                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2665                     else if(h->mb[i*16])
2666                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2667                 }
2668             }else{
2669                 for(i=16; i<16+8; i++){
2670                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2671                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2672                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2673                     }
2674                 }
2675             }
2676         }
2677     }
2678     if(h->deblocking_filter) {
2679         if (!simple && FRAME_MBAFF) {
2680             //FIXME try deblocking one mb at a time?
2681             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2682             const int mb_y = s->mb_y - 1;
2683             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2684             const int mb_xy= mb_x + mb_y*s->mb_stride;
2685             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2686             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2687             if (!bottom) return;
2688             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2689             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2690             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2691
2692             if(IS_INTRA(mb_type_top | mb_type_bottom))
2693                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2694
2695             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2696             // deblock a pair
2697             // top
2698             s->mb_y--; h->mb_xy -= s->mb_stride;
2699             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2700             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2701             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2702             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2703             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2704             // bottom
2705             s->mb_y++; h->mb_xy += s->mb_stride;
2706             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2707             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2708             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2709             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2710             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2711         } else {
2712             tprintf(h->s.avctx, "call filter_mb\n");
2713             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2714             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2715             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2716             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2717             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2718         }
2719     }
2720 }
2721
2722 /**
2723  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2724  */
2725 static void hl_decode_mb_simple(H264Context *h){
2726     hl_decode_mb_internal(h, 1);
2727 }
2728
2729 /**
2730  * Process a macroblock; this handles edge cases, such as interlacing.
2731  */
2732 static void av_noinline hl_decode_mb_complex(H264Context *h){
2733     hl_decode_mb_internal(h, 0);
2734 }
2735
2736 static void hl_decode_mb(H264Context *h){
2737     MpegEncContext * const s = &h->s;
2738     const int mb_xy= h->mb_xy;
2739     const int mb_type= s->current_picture.mb_type[mb_xy];
2740     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2741                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2742
2743     if(ENABLE_H264_ENCODER && !s->decode)
2744         return;
2745
2746     if (is_complex)
2747         hl_decode_mb_complex(h);
2748     else hl_decode_mb_simple(h);
2749 }
2750
2751 static void pic_as_field(Picture *pic, const int parity){
2752     int i;
2753     for (i = 0; i < 4; ++i) {
2754         if (parity == PICT_BOTTOM_FIELD)
2755             pic->data[i] += pic->linesize[i];
2756         pic->reference = parity;
2757         pic->linesize[i] *= 2;
2758     }
2759 }
2760
2761 static int split_field_copy(Picture *dest, Picture *src,
2762                             int parity, int id_add){
2763     int match = !!(src->reference & parity);
2764
2765     if (match) {
2766         *dest = *src;
2767         pic_as_field(dest, parity);
2768         dest->pic_id *= 2;
2769         dest->pic_id += id_add;
2770     }
2771
2772     return match;
2773 }
2774
2775 /**
2776  * Split one reference list into field parts, interleaving by parity
2777  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2778  * set to look at the actual start of data for that field.
2779  *
2780  * @param dest output list
2781  * @param dest_len maximum number of fields to put in dest
2782  * @param src the source reference list containing fields and/or field pairs
2783  *            (aka short_ref/long_ref, or
2784  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2785  * @param src_len number of Picture's in source (pairs and unmatched fields)
2786  * @param parity the parity of the picture being decoded/needing
2787  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2788  * @return number of fields placed in dest
2789  */
2790 static int split_field_half_ref_list(Picture *dest, int dest_len,
2791                                      Picture *src,  int src_len,  int parity){
2792     int same_parity   = 1;
2793     int same_i        = 0;
2794     int opp_i         = 0;
2795     int out_i;
2796     int field_output;
2797
2798     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2799         if (same_parity && same_i < src_len) {
2800             field_output = split_field_copy(dest + out_i, src + same_i,
2801                                             parity, 1);
2802             same_parity = !field_output;
2803             same_i++;
2804
2805         } else if (opp_i < src_len) {
2806             field_output = split_field_copy(dest + out_i, src + opp_i,
2807                                             PICT_FRAME - parity, 0);
2808             same_parity = field_output;
2809             opp_i++;
2810
2811         } else {
2812             break;
2813         }
2814     }
2815
2816     return out_i;
2817 }
2818
2819 /**
2820  * Split the reference frame list into a reference field list.
2821  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2822  * The input list contains both reference field pairs and
2823  * unmatched reference fields; it is ordered as spec describes
2824  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2825  * unmatched field pairs are also present. Conceptually this is equivalent
2826  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2827  *
2828  * @param dest output reference list where ordered fields are to be placed
2829  * @param dest_len max number of fields to place at dest
2830  * @param src source reference list, as described above
2831  * @param src_len number of pictures (pairs and unmatched fields) in src
2832  * @param parity parity of field being currently decoded
2833  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2834  * @param long_i index into src array that holds first long reference picture,
2835  *        or src_len if no long refs present.
2836  */
2837 static int split_field_ref_list(Picture *dest, int dest_len,
2838                                 Picture *src,  int src_len,
2839                                 int parity,    int long_i){
2840
2841     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2842     dest += i;
2843     dest_len -= i;
2844
2845     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2846                                    src_len - long_i, parity);
2847     return i;
2848 }
2849
2850 /**
2851  * fills the default_ref_list.
2852  */
2853 static int fill_default_ref_list(H264Context *h){
2854     MpegEncContext * const s = &h->s;
2855     int i;
2856     int smallest_poc_greater_than_current = -1;
2857     int structure_sel;
2858     Picture sorted_short_ref[32];
2859     Picture field_entry_list[2][32];
2860     Picture *frame_list[2];
2861
2862     if (FIELD_PICTURE) {
2863         structure_sel = PICT_FRAME;
2864         frame_list[0] = field_entry_list[0];
2865         frame_list[1] = field_entry_list[1];
2866     } else {
2867         structure_sel = 0;
2868         frame_list[0] = h->default_ref_list[0];
2869         frame_list[1] = h->default_ref_list[1];
2870     }
2871
2872     if(h->slice_type_nos==FF_B_TYPE){
2873         int list;
2874         int len[2];
2875         int short_len[2];
2876         int out_i;
2877         int limit= INT_MIN;
2878
2879         /* sort frame according to poc in B slice */
2880         for(out_i=0; out_i<h->short_ref_count; out_i++){
2881             int best_i=INT_MIN;
2882             int best_poc=INT_MAX;
2883
2884             for(i=0; i<h->short_ref_count; i++){
2885                 const int poc= h->short_ref[i]->poc;
2886                 if(poc > limit && poc < best_poc){
2887                     best_poc= poc;
2888                     best_i= i;
2889                 }
2890             }
2891
2892             assert(best_i != INT_MIN);
2893
2894             limit= best_poc;
2895             sorted_short_ref[out_i]= *h->short_ref[best_i];
2896             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2897             if (-1 == smallest_poc_greater_than_current) {
2898                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2899                     smallest_poc_greater_than_current = out_i;
2900                 }
2901             }
2902         }
2903
2904         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2905
2906         // find the largest poc
2907         for(list=0; list<2; list++){
2908             int index = 0;
2909             int j= -99;
2910             int step= list ? -1 : 1;
2911
2912             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2913                 int sel;
2914                 while(j<0 || j>= h->short_ref_count){
2915                     if(j != -99 && step == (list ? -1 : 1))
2916                         return -1;
2917                     step = -step;
2918                     j= smallest_poc_greater_than_current + (step>>1);
2919                 }
2920                 sel = sorted_short_ref[j].reference | structure_sel;
2921                 if(sel != PICT_FRAME) continue;
2922                 frame_list[list][index  ]= sorted_short_ref[j];
2923                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2924             }
2925             short_len[list] = index;
2926
2927             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2928                 int sel;
2929                 if(h->long_ref[i] == NULL) continue;
2930                 sel = h->long_ref[i]->reference | structure_sel;
2931                 if(sel != PICT_FRAME) continue;
2932
2933                 frame_list[ list ][index  ]= *h->long_ref[i];
2934                 frame_list[ list ][index++].pic_id= i;
2935             }
2936             len[list] = index;
2937         }
2938
2939         for(list=0; list<2; list++){
2940             if (FIELD_PICTURE)
2941                 len[list] = split_field_ref_list(h->default_ref_list[list],
2942                                                  h->ref_count[list],
2943                                                  frame_list[list],
2944                                                  len[list],
2945                                                  s->picture_structure,
2946                                                  short_len[list]);
2947
2948             // swap the two first elements of L1 when L0 and L1 are identical
2949             if(list && len[0] > 1 && len[0] == len[1])
2950                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2951                     if(i == len[0]){
2952                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2953                         break;
2954                     }
2955
2956             if(len[list] < h->ref_count[ list ])
2957                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2958         }
2959
2960
2961     }else{
2962         int index=0;
2963         int short_len;
2964         for(i=0; i<h->short_ref_count; i++){
2965             int sel;
2966             sel = h->short_ref[i]->reference | structure_sel;
2967             if(sel != PICT_FRAME) continue;
2968             frame_list[0][index  ]= *h->short_ref[i];
2969             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2970         }
2971         short_len = index;
2972         for(i = 0; i < 16; i++){
2973             int sel;
2974             if(h->long_ref[i] == NULL) continue;
2975             sel = h->long_ref[i]->reference | structure_sel;
2976             if(sel != PICT_FRAME) continue;
2977             frame_list[0][index  ]= *h->long_ref[i];
2978             frame_list[0][index++].pic_id= i;
2979         }
2980
2981         if (FIELD_PICTURE)
2982             index = split_field_ref_list(h->default_ref_list[0],
2983                                          h->ref_count[0], frame_list[0],
2984                                          index, s->picture_structure,
2985                                          short_len);
2986
2987         if(index < h->ref_count[0])
2988             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2989     }
2990 #ifdef TRACE
2991     for (i=0; i<h->ref_count[0]; i++) {
2992         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2993     }
2994     if(h->slice_type_nos==FF_B_TYPE){
2995         for (i=0; i<h->ref_count[1]; i++) {
2996             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2997         }
2998     }
2999 #endif
3000     return 0;
3001 }
3002
3003 static void print_short_term(H264Context *h);
3004 static void print_long_term(H264Context *h);
3005
3006 /**
3007  * Extract structure information about the picture described by pic_num in
3008  * the current decoding context (frame or field). Note that pic_num is
3009  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3010  * @param pic_num picture number for which to extract structure information
3011  * @param structure one of PICT_XXX describing structure of picture
3012  *                      with pic_num
3013  * @return frame number (short term) or long term index of picture
3014  *         described by pic_num
3015  */
3016 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3017     MpegEncContext * const s = &h->s;
3018
3019     *structure = s->picture_structure;
3020     if(FIELD_PICTURE){
3021         if (!(pic_num & 1))
3022             /* opposite field */
3023             *structure ^= PICT_FRAME;
3024         pic_num >>= 1;
3025     }
3026
3027     return pic_num;
3028 }
3029
3030 static int decode_ref_pic_list_reordering(H264Context *h){
3031     MpegEncContext * const s = &h->s;
3032     int list, index, pic_structure;
3033
3034     print_short_term(h);
3035     print_long_term(h);
3036     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before func
3037
3038     for(list=0; list<h->list_count; list++){
3039         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3040
3041         if(get_bits1(&s->gb)){
3042             int pred= h->curr_pic_num;
3043
3044             for(index=0; ; index++){
3045                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3046                 unsigned int pic_id;
3047                 int i;
3048                 Picture *ref = NULL;
3049
3050                 if(reordering_of_pic_nums_idc==3)
3051                     break;
3052
3053                 if(index >= h->ref_count[list]){
3054                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3055                     return -1;
3056                 }
3057
3058                 if(reordering_of_pic_nums_idc<3){
3059                     if(reordering_of_pic_nums_idc<2){
3060                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3061                         int frame_num;
3062
3063                         if(abs_diff_pic_num > h->max_pic_num){
3064                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3065                             return -1;
3066                         }
3067
3068                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3069                         else                                pred+= abs_diff_pic_num;
3070                         pred &= h->max_pic_num - 1;
3071
3072                         frame_num = pic_num_extract(h, pred, &pic_structure);
3073
3074                         for(i= h->short_ref_count-1; i>=0; i--){
3075                             ref = h->short_ref[i];
3076                             assert(ref->reference);
3077                             assert(!ref->long_ref);
3078                             if(ref->data[0] != NULL &&
3079                                    ref->frame_num == frame_num &&
3080                                    (ref->reference & pic_structure) &&
3081                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3082                                 break;
3083                         }
3084                         if(i>=0)
3085                             ref->pic_id= pred;
3086                     }else{
3087                         int long_idx;
3088                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3089
3090                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3091
3092                         if(long_idx>31){
3093                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3094                             return -1;
3095                         }
3096                         ref = h->long_ref[long_idx];
3097                         assert(!(ref && !ref->reference));
3098                         if(ref && (ref->reference & pic_structure)){
3099                             ref->pic_id= pic_id;
3100                             assert(ref->long_ref);
3101                             i=0;
3102                         }else{
3103                             i=-1;
3104                         }
3105                     }
3106
3107                     if (i < 0) {
3108                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3109                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3110                     } else {
3111                         for(i=index; i+1<h->ref_count[list]; i++){
3112                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3113                                 break;
3114                         }
3115                         for(; i > index; i--){
3116                             h->ref_list[list][i]= h->ref_list[list][i-1];
3117                         }
3118                         h->ref_list[list][index]= *ref;
3119                         if (FIELD_PICTURE){
3120                             pic_as_field(&h->ref_list[list][index], pic_structure);
3121                         }
3122                     }
3123                 }else{
3124                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3125                     return -1;
3126                 }
3127             }
3128         }
3129     }
3130     for(list=0; list<h->list_count; list++){
3131         for(index= 0; index < h->ref_count[list]; index++){
3132             if(!h->ref_list[list][index].data[0])
3133                 h->ref_list[list][index]= s->current_picture;
3134         }
3135     }
3136
3137     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3138         direct_dist_scale_factor(h);
3139     direct_ref_list_init(h);
3140     return 0;
3141 }
3142
3143 static void fill_mbaff_ref_list(H264Context *h){
3144     int list, i, j;
3145     for(list=0; list<2; list++){ //FIXME try list_count
3146         for(i=0; i<h->ref_count[list]; i++){
3147             Picture *frame = &h->ref_list[list][i];
3148             Picture *field = &h->ref_list[list][16+2*i];
3149             field[0] = *frame;
3150             for(j=0; j<3; j++)
3151                 field[0].linesize[j] <<= 1;
3152             field[0].reference = PICT_TOP_FIELD;
3153             field[1] = field[0];
3154             for(j=0; j<3; j++)
3155                 field[1].data[j] += frame->linesize[j];
3156             field[1].reference = PICT_BOTTOM_FIELD;
3157
3158             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3159             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3160             for(j=0; j<2; j++){
3161                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3162                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3163             }
3164         }
3165     }
3166     for(j=0; j<h->ref_count[1]; j++){
3167         for(i=0; i<h->ref_count[0]; i++)
3168             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3169         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3170         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3171     }
3172 }
3173
3174 static int pred_weight_table(H264Context *h){
3175     MpegEncContext * const s = &h->s;
3176     int list, i;
3177     int luma_def, chroma_def;
3178
3179     h->use_weight= 0;
3180     h->use_weight_chroma= 0;
3181     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3182     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3183     luma_def = 1<<h->luma_log2_weight_denom;
3184     chroma_def = 1<<h->chroma_log2_weight_denom;
3185
3186     for(list=0; list<2; list++){
3187         for(i=0; i<h->ref_count[list]; i++){
3188             int luma_weight_flag, chroma_weight_flag;
3189
3190             luma_weight_flag= get_bits1(&s->gb);
3191             if(luma_weight_flag){
3192                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3193                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3194                 if(   h->luma_weight[list][i] != luma_def
3195                    || h->luma_offset[list][i] != 0)
3196                     h->use_weight= 1;
3197             }else{
3198                 h->luma_weight[list][i]= luma_def;
3199                 h->luma_offset[list][i]= 0;
3200             }
3201
3202             chroma_weight_flag= get_bits1(&s->gb);
3203             if(chroma_weight_flag){
3204                 int j;
3205                 for(j=0; j<2; j++){
3206                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3207                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3208                     if(   h->chroma_weight[list][i][j] != chroma_def
3209                        || h->chroma_offset[list][i][j] != 0)
3210                         h->use_weight_chroma= 1;
3211                 }
3212             }else{
3213                 int j;
3214                 for(j=0; j<2; j++){
3215                     h->chroma_weight[list][i][j]= chroma_def;
3216                     h->chroma_offset[list][i][j]= 0;
3217                 }
3218             }
3219         }
3220         if(h->slice_type_nos != FF_B_TYPE) break;
3221     }
3222     h->use_weight= h->use_weight || h->use_weight_chroma;
3223     return 0;
3224 }
3225
3226 static void implicit_weight_table(H264Context *h){
3227     MpegEncContext * const s = &h->s;
3228     int ref0, ref1;
3229     int cur_poc = s->current_picture_ptr->poc;
3230
3231     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3232        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3233         h->use_weight= 0;
3234         h->use_weight_chroma= 0;
3235         return;
3236     }
3237
3238     h->use_weight= 2;
3239     h->use_weight_chroma= 2;
3240     h->luma_log2_weight_denom= 5;
3241     h->chroma_log2_weight_denom= 5;
3242
3243     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3244         int poc0 = h->ref_list[0][ref0].poc;
3245         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3246             int poc1 = h->ref_list[1][ref1].poc;
3247             int td = av_clip(poc1 - poc0, -128, 127);
3248             if(td){
3249                 int tb = av_clip(cur_poc - poc0, -128, 127);
3250                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3251                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3252                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3253                     h->implicit_weight[ref0][ref1] = 32;
3254                 else
3255                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3256             }else
3257                 h->implicit_weight[ref0][ref1] = 32;
3258         }
3259     }
3260 }
3261
3262 /**
3263  * Mark a picture as no longer needed for reference. The refmask
3264  * argument allows unreferencing of individual fields or the whole frame.
3265  * If the picture becomes entirely unreferenced, but is being held for
3266  * display purposes, it is marked as such.
3267  * @param refmask mask of fields to unreference; the mask is bitwise
3268  *                anded with the reference marking of pic
3269  * @return non-zero if pic becomes entirely unreferenced (except possibly
3270  *         for display purposes) zero if one of the fields remains in
3271  *         reference
3272  */
3273 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3274     int i;
3275     if (pic->reference &= refmask) {
3276         return 0;
3277     } else {
3278         for(i = 0; h->delayed_pic[i]; i++)
3279             if(pic == h->delayed_pic[i]){
3280                 pic->reference=DELAYED_PIC_REF;
3281                 break;
3282             }
3283         return 1;
3284     }
3285 }
3286
3287 /**
3288  * instantaneous decoder refresh.
3289  */
3290 static void idr(H264Context *h){
3291     int i;
3292
3293     for(i=0; i<16; i++){
3294         if (h->long_ref[i] != NULL) {
3295             unreference_pic(h, h->long_ref[i], 0);
3296             h->long_ref[i]= NULL;
3297         }
3298     }
3299     h->long_ref_count=0;
3300
3301     for(i=0; i<h->short_ref_count; i++){
3302         unreference_pic(h, h->short_ref[i], 0);
3303         h->short_ref[i]= NULL;
3304     }
3305     h->short_ref_count=0;
3306 }
3307
3308 /* forget old pics after a seek */
3309 static void flush_dpb(AVCodecContext *avctx){
3310     H264Context *h= avctx->priv_data;
3311     int i;
3312     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3313         if(h->delayed_pic[i])
3314             h->delayed_pic[i]->reference= 0;
3315         h->delayed_pic[i]= NULL;
3316     }
3317     h->outputed_poc= INT_MIN;
3318     idr(h);
3319     if(h->s.current_picture_ptr)
3320         h->s.current_picture_ptr->reference= 0;
3321     h->s.first_field= 0;
3322     ff_mpeg_flush(avctx);
3323 }
3324
3325 /**
3326  * Find a Picture in the short term reference list by frame number.
3327  * @param frame_num frame number to search for
3328  * @param idx the index into h->short_ref where returned picture is found
3329  *            undefined if no picture found.
3330  * @return pointer to the found picture, or NULL if no pic with the provided
3331  *                 frame number is found
3332  */
3333 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3334     MpegEncContext * const s = &h->s;
3335     int i;
3336
3337     for(i=0; i<h->short_ref_count; i++){
3338         Picture *pic= h->short_ref[i];
3339         if(s->avctx->debug&FF_DEBUG_MMCO)
3340             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3341         if(pic->frame_num == frame_num) {
3342             *idx = i;
3343             return pic;
3344         }
3345     }
3346     return NULL;
3347 }
3348
3349 /**
3350  * Remove a picture from the short term reference list by its index in
3351  * that list.  This does no checking on the provided index; it is assumed
3352  * to be valid. Other list entries are shifted down.
3353  * @param i index into h->short_ref of picture to remove.
3354  */
3355 static void remove_short_at_index(H264Context *h, int i){
3356     assert(i >= 0 && i < h->short_ref_count);
3357     h->short_ref[i]= NULL;
3358     if (--h->short_ref_count)
3359         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3360 }
3361
3362 /**
3363  *
3364  * @return the removed picture or NULL if an error occurs
3365  */
3366 static Picture * remove_short(H264Context *h, int frame_num){
3367     MpegEncContext * const s = &h->s;
3368     Picture *pic;
3369     int i;
3370
3371     if(s->avctx->debug&FF_DEBUG_MMCO)
3372         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3373
3374     pic = find_short(h, frame_num, &i);
3375     if (pic)
3376         remove_short_at_index(h, i);
3377
3378     return pic;
3379 }
3380
3381 /**
3382  * Remove a picture from the long term reference list by its index in
3383  * that list.  This does no checking on the provided index; it is assumed
3384  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3385  * @param i index into h->long_ref of picture to remove.
3386  */
3387 static void remove_long_at_index(H264Context *h, int i){
3388     h->long_ref[i]= NULL;
3389     h->long_ref_count--;
3390 }
3391
3392 /**
3393  *
3394  * @return the removed picture or NULL if an error occurs
3395  */
3396 static Picture * remove_long(H264Context *h, int i){
3397     Picture *pic;
3398
3399     pic= h->long_ref[i];
3400     if (pic)
3401         remove_long_at_index(h, i);
3402
3403     return pic;
3404 }
3405
3406 /**
3407  * print short term list
3408  */
3409 static void print_short_term(H264Context *h) {
3410     uint32_t i;
3411     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3412         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3413         for(i=0; i<h->short_ref_count; i++){
3414             Picture *pic= h->short_ref[i];
3415             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3416         }
3417     }
3418 }
3419
3420 /**
3421  * print long term list
3422  */
3423 static void print_long_term(H264Context *h) {
3424     uint32_t i;
3425     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3426         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3427         for(i = 0; i < 16; i++){
3428             Picture *pic= h->long_ref[i];
3429             if (pic) {
3430                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3431             }
3432         }
3433     }
3434 }
3435
3436 /**
3437  * Executes the reference picture marking (memory management control operations).
3438  */
3439 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3440     MpegEncContext * const s = &h->s;
3441     int i, j;
3442     int current_ref_assigned=0;
3443     Picture *pic;
3444
3445     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3446         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3447
3448     for(i=0; i<mmco_count; i++){
3449         int structure, frame_num, unref_pic;
3450         if(s->avctx->debug&FF_DEBUG_MMCO)
3451             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3452
3453         switch(mmco[i].opcode){
3454         case MMCO_SHORT2UNUSED:
3455             if(s->avctx->debug&FF_DEBUG_MMCO)
3456                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3457             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3458             pic = find_short(h, frame_num, &j);
3459             if (pic) {
3460                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3461                     remove_short_at_index(h, j);
3462             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3463                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3464             break;
3465         case MMCO_SHORT2LONG:
3466             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3467                     h->long_ref[mmco[i].long_arg]->frame_num ==
3468                                               mmco[i].short_pic_num / 2) {
3469                 /* do nothing, we've already moved this field pair. */
3470             } else {
3471                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3472
3473                 pic= remove_long(h, mmco[i].long_arg);
3474                 if(pic) unreference_pic(h, pic, 0);
3475
3476                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3477                 if (h->long_ref[ mmco[i].long_arg ]){
3478                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3479                     h->long_ref_count++;
3480                 }
3481             }
3482             break;
3483         case MMCO_LONG2UNUSED:
3484             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3485             pic = h->long_ref[j];
3486             if (pic) {
3487                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3488                     remove_long_at_index(h, j);
3489             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3490                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3491             break;
3492         case MMCO_LONG:
3493             unref_pic = 1;
3494             if (FIELD_PICTURE && !s->first_field) {
3495                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3496                     /* Just mark second field as referenced */
3497                     unref_pic = 0;
3498                 } else if (s->current_picture_ptr->reference) {
3499                     /* First field in pair is in short term list or
3500                      * at a different long term index.
3501                      * This is not allowed; see 7.4.3, notes 2 and 3.
3502                      * Report the problem and keep the pair where it is,
3503                      * and mark this field valid.
3504                      */
3505                     av_log(h->s.avctx, AV_LOG_ERROR,
3506                         "illegal long term reference assignment for second "
3507                         "field in complementary field pair (first field is "
3508                         "short term or has non-matching long index)\n");
3509                     unref_pic = 0;
3510                 }
3511             }
3512
3513             if (unref_pic) {
3514                 pic= remove_long(h, mmco[i].long_arg);
3515                 if(pic) unreference_pic(h, pic, 0);
3516
3517                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3518                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3519                 h->long_ref_count++;
3520             }
3521
3522             s->current_picture_ptr->reference |= s->picture_structure;
3523             current_ref_assigned=1;
3524             break;
3525         case MMCO_SET_MAX_LONG:
3526             assert(mmco[i].long_arg <= 16);
3527             // just remove the long term which index is greater than new max
3528             for(j = mmco[i].long_arg; j<16; j++){
3529                 pic = remove_long(h, j);
3530                 if (pic) unreference_pic(h, pic, 0);
3531             }
3532             break;
3533         case MMCO_RESET:
3534             while(h->short_ref_count){
3535                 pic= remove_short(h, h->short_ref[0]->frame_num);
3536                 if(pic) unreference_pic(h, pic, 0);
3537             }
3538             for(j = 0; j < 16; j++) {
3539                 pic= remove_long(h, j);
3540                 if(pic) unreference_pic(h, pic, 0);
3541             }
3542             s->current_picture_ptr->poc=
3543             s->current_picture_ptr->field_poc[0]=
3544             s->current_picture_ptr->field_poc[1]=
3545             h->poc_lsb=
3546             h->poc_msb=
3547             h->frame_num=
3548             s->current_picture_ptr->frame_num= 0;
3549             break;
3550         default: assert(0);
3551         }
3552     }
3553
3554     if (!current_ref_assigned && FIELD_PICTURE &&
3555             !s->first_field && s->current_picture_ptr->reference) {
3556
3557         /* Second field of complementary field pair; the first field of
3558          * which is already referenced. If short referenced, it
3559          * should be first entry in short_ref. If not, it must exist
3560          * in long_ref; trying to put it on the short list here is an
3561          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3562          */
3563         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3564             /* Just mark the second field valid */
3565             s->current_picture_ptr->reference = PICT_FRAME;
3566         } else if (s->current_picture_ptr->long_ref) {
3567             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3568                                              "assignment for second field "
3569                                              "in complementary field pair "
3570                                              "(first field is long term)\n");
3571         } else {
3572             /*
3573              * First field in reference, but not in any sensible place on our
3574              * reference lists. This shouldn't happen unless reference
3575              * handling somewhere else is wrong.
3576              */
3577             assert(0);
3578         }
3579         current_ref_assigned = 1;
3580     }
3581
3582     if(!current_ref_assigned){
3583         pic= remove_short(h, s->current_picture_ptr->frame_num);
3584         if(pic){
3585             unreference_pic(h, pic, 0);
3586             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3587         }
3588
3589         if(h->short_ref_count)
3590             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3591
3592         h->short_ref[0]= s->current_picture_ptr;
3593         h->short_ref[0]->long_ref=0;
3594         h->short_ref_count++;
3595         s->current_picture_ptr->reference |= s->picture_structure;
3596     }
3597
3598     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3599
3600         /* We have too many reference frames, probably due to corrupted
3601          * stream. Need to discard one frame. Prevents overrun of the
3602          * short_ref and long_ref buffers.
3603          */
3604         av_log(h->s.avctx, AV_LOG_ERROR,
3605                "number of reference frames exceeds max (probably "
3606                "corrupt input), discarding one\n");
3607
3608         if (h->long_ref_count && !h->short_ref_count) {
3609             for (i = 0; i < 16; ++i)
3610                 if (h->long_ref[i])
3611                     break;
3612
3613             assert(i < 16);
3614             pic = h->long_ref[i];
3615             remove_long_at_index(h, i);
3616         } else {
3617             pic = h->short_ref[h->short_ref_count - 1];
3618             remove_short_at_index(h, h->short_ref_count - 1);
3619         }
3620         unreference_pic(h, pic, 0);
3621     }
3622
3623     print_short_term(h);
3624     print_long_term(h);
3625     return 0;
3626 }
3627
3628 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3629     MpegEncContext * const s = &h->s;
3630     int i;
3631
3632     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3633         s->broken_link= get_bits1(gb) -1;
3634         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3635         if(h->mmco[0].long_arg == -1)
3636             h->mmco_index= 0;
3637         else{
3638             h->mmco[0].opcode= MMCO_LONG;
3639             h->mmco_index= 1;
3640         }
3641     }else{
3642         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3643             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3644                 MMCOOpcode opcode= get_ue_golomb(gb);
3645
3646                 h->mmco[i].opcode= opcode;
3647                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3648                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3649 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3650                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3651                         return -1;
3652                     }*/
3653                 }
3654                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3655                     unsigned int long_arg= get_ue_golomb(gb);
3656                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3657                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3658                         return -1;
3659                     }
3660                     h->mmco[i].long_arg= long_arg;
3661                 }
3662
3663                 if(opcode > (unsigned)MMCO_LONG){
3664                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3665                     return -1;
3666                 }
3667                 if(opcode == MMCO_END)
3668                     break;
3669             }
3670             h->mmco_index= i;
3671         }else{
3672             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3673
3674             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3675                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3676                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3677                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3678                 h->mmco_index= 1;
3679                 if (FIELD_PICTURE) {
3680                     h->mmco[0].short_pic_num *= 2;
3681                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3682                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3683                     h->mmco_index= 2;
3684                 }
3685             }else
3686                 h->mmco_index= 0;
3687         }
3688     }
3689
3690     return 0;
3691 }
3692
3693 static int init_poc(H264Context *h){
3694     MpegEncContext * const s = &h->s;
3695     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3696     int field_poc[2];
3697
3698     if(h->nal_unit_type == NAL_IDR_SLICE){
3699         h->frame_num_offset= 0;
3700     }else{
3701         if(h->frame_num < h->prev_frame_num)
3702             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3703         else
3704             h->frame_num_offset= h->prev_frame_num_offset;
3705     }
3706
3707     if(h->sps.poc_type==0){
3708         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3709
3710         if(h->nal_unit_type == NAL_IDR_SLICE){
3711              h->prev_poc_msb=
3712              h->prev_poc_lsb= 0;
3713         }
3714
3715         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3716             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3717         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3718             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3719         else
3720             h->poc_msb = h->prev_poc_msb;
3721 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3722         field_poc[0] =
3723         field_poc[1] = h->poc_msb + h->poc_lsb;
3724         if(s->picture_structure == PICT_FRAME)
3725             field_poc[1] += h->delta_poc_bottom;
3726     }else if(h->sps.poc_type==1){
3727         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3728         int i;
3729
3730         if(h->sps.poc_cycle_length != 0)
3731             abs_frame_num = h->frame_num_offset + h->frame_num;
3732         else
3733             abs_frame_num = 0;
3734
3735         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3736             abs_frame_num--;
3737
3738         expected_delta_per_poc_cycle = 0;
3739         for(i=0; i < h->sps.poc_cycle_length; i++)
3740             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3741
3742         if(abs_frame_num > 0){
3743             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3744             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3745
3746             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3747             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3748                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3749         } else
3750             expectedpoc = 0;
3751
3752         if(h->nal_ref_idc == 0)
3753             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3754
3755         field_poc[0] = expectedpoc + h->delta_poc[0];
3756         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3757
3758         if(s->picture_structure == PICT_FRAME)
3759             field_poc[1] += h->delta_poc[1];
3760     }else{
3761         int poc;
3762         if(h->nal_unit_type == NAL_IDR_SLICE){
3763             poc= 0;
3764         }else{
3765             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3766             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3767         }
3768         field_poc[0]= poc;
3769         field_poc[1]= poc;
3770     }
3771
3772     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3773         s->current_picture_ptr->field_poc[0]= field_poc[0];
3774         s->current_picture_ptr->poc = field_poc[0];
3775     }
3776     if(s->picture_structure != PICT_TOP_FIELD) {
3777         s->current_picture_ptr->field_poc[1]= field_poc[1];
3778         s->current_picture_ptr->poc = field_poc[1];
3779     }
3780     if(!FIELD_PICTURE || !s->first_field) {
3781         Picture *cur = s->current_picture_ptr;
3782         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3783     }
3784
3785     return 0;
3786 }
3787
3788
3789 /**
3790  * initialize scan tables
3791  */
3792 static void init_scan_tables(H264Context *h){
3793     MpegEncContext * const s = &h->s;
3794     int i;
3795     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3796         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3797         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3798     }else{
3799         for(i=0; i<16; i++){
3800 #define T(x) (x>>2) | ((x<<2) & 0xF)
3801             h->zigzag_scan[i] = T(zigzag_scan[i]);
3802             h-> field_scan[i] = T( field_scan[i]);
3803 #undef T
3804         }
3805     }
3806     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3807         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3808         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3809         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3810         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3811     }else{
3812         for(i=0; i<64; i++){
3813 #define T(x) (x>>3) | ((x&7)<<3)
3814             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3815             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3816             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3817             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3818 #undef T
3819         }
3820     }
3821     if(h->sps.transform_bypass){ //FIXME same ugly
3822         h->zigzag_scan_q0          = zigzag_scan;
3823         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3824         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3825         h->field_scan_q0           = field_scan;
3826         h->field_scan8x8_q0        = field_scan8x8;
3827         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3828     }else{
3829         h->zigzag_scan_q0          = h->zigzag_scan;
3830         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3831         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3832         h->field_scan_q0           = h->field_scan;
3833         h->field_scan8x8_q0        = h->field_scan8x8;
3834         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3835     }
3836 }
3837
3838 /**
3839  * Replicates H264 "master" context to thread contexts.
3840  */
3841 static void clone_slice(H264Context *dst, H264Context *src)
3842 {
3843     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3844     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3845     dst->s.current_picture      = src->s.current_picture;
3846     dst->s.linesize             = src->s.linesize;
3847     dst->s.uvlinesize           = src->s.uvlinesize;
3848     dst->s.first_field          = src->s.first_field;
3849
3850     dst->prev_poc_msb           = src->prev_poc_msb;
3851     dst->prev_poc_lsb           = src->prev_poc_lsb;
3852     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3853     dst->prev_frame_num         = src->prev_frame_num;
3854     dst->short_ref_count        = src->short_ref_count;
3855
3856     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3857     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3858     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3859     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3860
3861     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3862     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3863 }
3864
3865 /**
3866  * decodes a slice header.
3867  * This will also call MPV_common_init() and frame_start() as needed.
3868  *
3869  * @param h h264context
3870  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3871  *
3872  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3873  */
3874 static int decode_slice_header(H264Context *h, H264Context *h0){
3875     MpegEncContext * const s = &h->s;
3876     MpegEncContext * const s0 = &h0->s;
3877     unsigned int first_mb_in_slice;
3878     unsigned int pps_id;
3879     int num_ref_idx_active_override_flag;
3880     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3881     unsigned int slice_type, tmp, i, j;
3882     int default_ref_list_done = 0;
3883     int last_pic_structure;
3884
3885     s->dropable= h->nal_ref_idc == 0;
3886
3887     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3888         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3889         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3890     }else{
3891         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3892         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3893     }
3894
3895     first_mb_in_slice= get_ue_golomb(&s->gb);
3896
3897     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3898         h0->current_slice = 0;
3899         if (!s0->first_field)
3900             s->current_picture_ptr= NULL;
3901     }
3902
3903     slice_type= get_ue_golomb(&s->gb);
3904     if(slice_type > 9){
3905         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3906         return -1;
3907     }
3908     if(slice_type > 4){
3909         slice_type -= 5;
3910         h->slice_type_fixed=1;
3911     }else
3912         h->slice_type_fixed=0;
3913
3914     slice_type= slice_type_map[ slice_type ];
3915     if (slice_type == FF_I_TYPE
3916         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3917         default_ref_list_done = 1;
3918     }
3919     h->slice_type= slice_type;
3920     h->slice_type_nos= slice_type & 3;
3921
3922     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3923     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3924         av_log(h->s.avctx, AV_LOG_ERROR,
3925                "B picture before any references, skipping\n");
3926         return -1;
3927     }
3928
3929     pps_id= get_ue_golomb(&s->gb);
3930     if(pps_id>=MAX_PPS_COUNT){
3931         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3932         return -1;
3933     }
3934     if(!h0->pps_buffers[pps_id]) {
3935         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3936         return -1;
3937     }
3938     h->pps= *h0->pps_buffers[pps_id];
3939
3940     if(!h0->sps_buffers[h->pps.sps_id]) {
3941         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3942         return -1;
3943     }
3944     h->sps = *h0->sps_buffers[h->pps.sps_id];
3945
3946     if(h == h0 && h->dequant_coeff_pps != pps_id){
3947         h->dequant_coeff_pps = pps_id;
3948         init_dequant_tables(h);
3949     }
3950
3951     s->mb_width= h->sps.mb_width;
3952     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3953
3954     h->b_stride=  s->mb_width*4;
3955     h->b8_stride= s->mb_width*2;
3956
3957     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3958     if(h->sps.frame_mbs_only_flag)
3959         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3960     else
3961         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3962
3963     if (s->context_initialized
3964         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3965         if(h != h0)
3966             return -1;   // width / height changed during parallelized decoding
3967         free_tables(h);
3968         MPV_common_end(s);
3969     }
3970     if (!s->context_initialized) {
3971         if(h != h0)
3972             return -1;  // we cant (re-)initialize context during parallel decoding
3973         if (MPV_common_init(s) < 0)
3974             return -1;
3975         s->first_field = 0;
3976
3977         init_scan_tables(h);
3978         alloc_tables(h);
3979
3980         for(i = 1; i < s->avctx->thread_count; i++) {
3981             H264Context *c;
3982             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3983             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3984             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3985             c->sps = h->sps;
3986             c->pps = h->pps;
3987             init_scan_tables(c);
3988             clone_tables(c, h);
3989         }
3990
3991         for(i = 0; i < s->avctx->thread_count; i++)
3992             if(context_init(h->thread_context[i]) < 0)
3993                 return -1;
3994
3995         s->avctx->width = s->width;
3996         s->avctx->height = s->height;
3997         s->avctx->sample_aspect_ratio= h->sps.sar;
3998         if(!s->avctx->sample_aspect_ratio.den)
3999             s->avctx->sample_aspect_ratio.den = 1;
4000
4001         if(h->sps.timing_info_present_flag){
4002             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4003             if(h->x264_build > 0 && h->x264_build < 44)
4004                 s->avctx->time_base.den *= 2;
4005             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4006                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4007         }
4008     }
4009
4010     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4011
4012     h->mb_mbaff = 0;
4013     h->mb_aff_frame = 0;
4014     last_pic_structure = s0->picture_structure;
4015     if(h->sps.frame_mbs_only_flag){
4016         s->picture_structure= PICT_FRAME;
4017     }else{
4018         if(get_bits1(&s->gb)) { //field_pic_flag
4019             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4020         } else {
4021             s->picture_structure= PICT_FRAME;
4022             h->mb_aff_frame = h->sps.mb_aff;
4023         }
4024     }
4025
4026     if(h0->current_slice == 0){
4027         while(h->frame_num !=  h->prev_frame_num &&
4028               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
4029             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
4030             frame_start(h);
4031             h->prev_frame_num++;
4032             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
4033             s->current_picture_ptr->frame_num= h->prev_frame_num;
4034             execute_ref_pic_marking(h, NULL, 0);
4035         }
4036
4037         /* See if we have a decoded first field looking for a pair... */
4038         if (s0->first_field) {
4039             assert(s0->current_picture_ptr);
4040             assert(s0->current_picture_ptr->data[0]);
4041             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4042
4043             /* figure out if we have a complementary field pair */
4044             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4045                 /*
4046                  * Previous field is unmatched. Don't display it, but let it
4047                  * remain for reference if marked as such.
4048                  */
4049                 s0->current_picture_ptr = NULL;
4050                 s0->first_field = FIELD_PICTURE;
4051
4052             } else {
4053                 if (h->nal_ref_idc &&
4054                         s0->current_picture_ptr->reference &&
4055                         s0->current_picture_ptr->frame_num != h->frame_num) {
4056                     /*
4057                      * This and previous field were reference, but had
4058                      * different frame_nums. Consider this field first in
4059                      * pair. Throw away previous field except for reference
4060                      * purposes.
4061                      */
4062                     s0->first_field = 1;
4063                     s0->current_picture_ptr = NULL;
4064
4065                 } else {
4066                     /* Second field in complementary pair */
4067                     s0->first_field = 0;
4068                 }
4069             }
4070
4071         } else {
4072             /* Frame or first field in a potentially complementary pair */
4073             assert(!s0->current_picture_ptr);
4074             s0->first_field = FIELD_PICTURE;
4075         }
4076
4077         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4078             s0->first_field = 0;
4079             return -1;
4080         }
4081     }
4082     if(h != h0)
4083         clone_slice(h, h0);
4084
4085     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4086
4087     assert(s->mb_num == s->mb_width * s->mb_height);
4088     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4089        first_mb_in_slice                    >= s->mb_num){
4090         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4091         return -1;
4092     }
4093     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4094     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4095     if (s->picture_structure == PICT_BOTTOM_FIELD)
4096         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4097     assert(s->mb_y < s->mb_height);
4098
4099     if(s->picture_structure==PICT_FRAME){
4100         h->curr_pic_num=   h->frame_num;
4101         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4102     }else{
4103         h->curr_pic_num= 2*h->frame_num + 1;
4104         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4105     }
4106
4107     if(h->nal_unit_type == NAL_IDR_SLICE){
4108         get_ue_golomb(&s->gb); /* idr_pic_id */
4109     }
4110
4111     if(h->sps.poc_type==0){
4112         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4113
4114         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4115             h->delta_poc_bottom= get_se_golomb(&s->gb);
4116         }
4117     }
4118
4119     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4120         h->delta_poc[0]= get_se_golomb(&s->gb);
4121
4122         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4123             h->delta_poc[1]= get_se_golomb(&s->gb);
4124     }
4125
4126     init_poc(h);
4127
4128     if(h->pps.redundant_pic_cnt_present){
4129         h->redundant_pic_count= get_ue_golomb(&s->gb);
4130     }
4131
4132     //set defaults, might be overriden a few line later
4133     h->ref_count[0]= h->pps.ref_count[0];
4134     h->ref_count[1]= h->pps.ref_count[1];
4135
4136     if(h->slice_type_nos != FF_I_TYPE){
4137         if(h->slice_type_nos == FF_B_TYPE){
4138             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4139         }
4140         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4141
4142         if(num_ref_idx_active_override_flag){
4143             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4144             if(h->slice_type_nos==FF_B_TYPE)
4145                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4146
4147             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4148                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4149                 h->ref_count[0]= h->ref_count[1]= 1;
4150                 return -1;
4151             }
4152         }
4153         if(h->slice_type_nos == FF_B_TYPE)
4154             h->list_count= 2;
4155         else
4156             h->list_count= 1;
4157     }else
4158         h->list_count= 0;
4159
4160     if(!default_ref_list_done){
4161         fill_default_ref_list(h);
4162     }
4163
4164     if(decode_ref_pic_list_reordering(h) < 0)
4165         return -1;
4166
4167     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4168        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4169         pred_weight_table(h);
4170     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4171         implicit_weight_table(h);
4172     else
4173         h->use_weight = 0;
4174
4175     if(h->nal_ref_idc)
4176         decode_ref_pic_marking(h0, &s->gb);
4177
4178     if(FRAME_MBAFF)
4179         fill_mbaff_ref_list(h);
4180
4181     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4182         tmp = get_ue_golomb(&s->gb);
4183         if(tmp > 2){
4184             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4185             return -1;
4186         }
4187         h->cabac_init_idc= tmp;
4188     }
4189
4190     h->last_qscale_diff = 0;
4191     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4192     if(tmp>51){
4193         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4194         return -1;
4195     }
4196     s->qscale= tmp;
4197     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4198     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4199     //FIXME qscale / qp ... stuff
4200     if(h->slice_type == FF_SP_TYPE){
4201         get_bits1(&s->gb); /* sp_for_switch_flag */
4202     }
4203     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4204         get_se_golomb(&s->gb); /* slice_qs_delta */
4205     }
4206
4207     h->deblocking_filter = 1;
4208     h->slice_alpha_c0_offset = 0;
4209     h->slice_beta_offset = 0;
4210     if( h->pps.deblocking_filter_parameters_present ) {
4211         tmp= get_ue_golomb(&s->gb);
4212         if(tmp > 2){
4213             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4214             return -1;
4215         }
4216         h->deblocking_filter= tmp;
4217         if(h->deblocking_filter < 2)
4218             h->deblocking_filter^= 1; // 1<->0
4219
4220         if( h->deblocking_filter ) {
4221             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4222             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4223         }
4224     }
4225
4226     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4227        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4228        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4229        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4230         h->deblocking_filter= 0;
4231
4232     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4233         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4234             /* Cheat slightly for speed:
4235                Do not bother to deblock across slices. */
4236             h->deblocking_filter = 2;
4237         } else {
4238             h0->max_contexts = 1;
4239             if(!h0->single_decode_warning) {
4240                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4241                 h0->single_decode_warning = 1;
4242             }
4243             if(h != h0)
4244                 return 1; // deblocking switched inside frame
4245         }
4246     }
4247
4248 #if 0 //FMO
4249     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4250         slice_group_change_cycle= get_bits(&s->gb, ?);
4251 #endif
4252
4253     h0->last_slice_type = slice_type;
4254     h->slice_num = ++h0->current_slice;
4255
4256     for(j=0; j<2; j++){
4257         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4258         ref2frm[0]=
4259         ref2frm[1]= -1;
4260         for(i=0; i<48; i++)
4261             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4262                           +(h->ref_list[j][i].reference&3);
4263     }
4264
4265     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4266     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4267
4268     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4269         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4270                h->slice_num,
4271                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4272                first_mb_in_slice,
4273                av_get_pict_type_char(h->slice_type),
4274                pps_id, h->frame_num,
4275                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4276                h->ref_count[0], h->ref_count[1],
4277                s->qscale,
4278                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4279                h->use_weight,
4280                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4281                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4282                );
4283     }
4284
4285     return 0;
4286 }
4287
4288 /**
4289  *
4290  */
4291 static inline int get_level_prefix(GetBitContext *gb){
4292     unsigned int buf;
4293     int log;
4294
4295     OPEN_READER(re, gb);
4296     UPDATE_CACHE(re, gb);
4297     buf=GET_CACHE(re, gb);
4298
4299     log= 32 - av_log2(buf);
4300 #ifdef TRACE
4301     print_bin(buf>>(32-log), log);
4302     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4303 #endif
4304
4305     LAST_SKIP_BITS(re, gb, log);
4306     CLOSE_READER(re, gb);
4307
4308     return log-1;
4309 }
4310
4311 static inline int get_dct8x8_allowed(H264Context *h){
4312     int i;
4313     for(i=0; i<4; i++){
4314         if(!IS_SUB_8X8(h->sub_mb_type[i])
4315            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4316             return 0;
4317     }
4318     return 1;
4319 }
4320
4321 /**
4322  * decodes a residual block.
4323  * @param n block index
4324  * @param scantable scantable
4325  * @param max_coeff number of coefficients in the block
4326  * @return <0 if an error occurred
4327  */
4328 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4329     MpegEncContext * const s = &h->s;
4330     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4331     int level[16];
4332     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4333
4334     //FIXME put trailing_onex into the context
4335
4336     if(n == CHROMA_DC_BLOCK_INDEX){
4337         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4338         total_coeff= coeff_token>>2;
4339     }else{
4340         if(n == LUMA_DC_BLOCK_INDEX){
4341             total_coeff= pred_non_zero_count(h, 0);
4342             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4343             total_coeff= coeff_token>>2;
4344         }else{
4345             total_coeff= pred_non_zero_count(h, n);
4346             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4347             total_coeff= coeff_token>>2;
4348             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4349         }
4350     }
4351
4352     //FIXME set last_non_zero?
4353
4354     if(total_coeff==0)
4355         return 0;
4356     if(total_coeff > (unsigned)max_coeff) {
4357         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4358         return -1;
4359     }
4360
4361     trailing_ones= coeff_token&3;
4362     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4363     assert(total_coeff<=16);
4364
4365     for(i=0; i<trailing_ones; i++){
4366         level[i]= 1 - 2*get_bits1(gb);
4367     }
4368
4369     if(i<total_coeff) {
4370         int level_code, mask;
4371         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4372         int prefix= get_level_prefix(gb);
4373
4374         //first coefficient has suffix_length equal to 0 or 1
4375         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4376             if(suffix_length)
4377                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4378             else
4379                 level_code= (prefix<<suffix_length); //part
4380         }else if(prefix==14){
4381             if(suffix_length)
4382                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4383             else
4384                 level_code= prefix + get_bits(gb, 4); //part
4385         }else{
4386             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4387             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4388             if(prefix>=16)
4389                 level_code += (1<<(prefix-3))-4096;
4390         }
4391
4392         if(trailing_ones < 3) level_code += 2;
4393
4394         suffix_length = 1;
4395         if(level_code > 5)
4396             suffix_length++;
4397         mask= -(level_code&1);
4398         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4399         i++;
4400
4401         //remaining coefficients have suffix_length > 0
4402         for(;i<total_coeff;i++) {
4403             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4404             prefix = get_level_prefix(gb);
4405             if(prefix<15){
4406                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4407             }else{
4408                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4409                 if(prefix>=16)
4410                     level_code += (1<<(prefix-3))-4096;
4411             }
4412             mask= -(level_code&1);
4413             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4414             if(level_code > suffix_limit[suffix_length])
4415                 suffix_length++;
4416         }
4417     }
4418
4419     if(total_coeff == max_coeff)
4420         zeros_left=0;
4421     else{
4422         if(n == CHROMA_DC_BLOCK_INDEX)
4423             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4424         else
4425             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4426     }
4427
4428     coeff_num = zeros_left + total_coeff - 1;
4429     j = scantable[coeff_num];
4430     if(n > 24){
4431         block[j] = level[0];
4432         for(i=1;i<total_coeff;i++) {
4433             if(zeros_left <= 0)
4434                 run_before = 0;
4435             else if(zeros_left < 7){
4436                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4437             }else{
4438                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4439             }
4440             zeros_left -= run_before;
4441             coeff_num -= 1 + run_before;
4442             j= scantable[ coeff_num ];
4443
4444             block[j]= level[i];
4445         }
4446     }else{
4447         block[j] = (level[0] * qmul[j] + 32)>>6;
4448         for(i=1;i<total_coeff;i++) {
4449             if(zeros_left <= 0)
4450                 run_before = 0;
4451             else if(zeros_left < 7){
4452                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4453             }else{
4454                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4455             }
4456             zeros_left -= run_before;
4457             coeff_num -= 1 + run_before;
4458             j= scantable[ coeff_num ];
4459
4460             block[j]= (level[i] * qmul[j] + 32)>>6;
4461         }
4462     }
4463
4464     if(zeros_left<0){
4465         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4466         return -1;
4467     }
4468
4469     return 0;
4470 }
4471
4472 static void predict_field_decoding_flag(H264Context *h){
4473     MpegEncContext * const s = &h->s;
4474     const int mb_xy= h->mb_xy;
4475     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4476                 ? s->current_picture.mb_type[mb_xy-1]
4477                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4478                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4479                 : 0;
4480     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4481 }
4482
4483 /**
4484  * decodes a P_SKIP or B_SKIP macroblock
4485  */
4486 static void decode_mb_skip(H264Context *h){
4487     MpegEncContext * const s = &h->s;
4488     const int mb_xy= h->mb_xy;
4489     int mb_type=0;
4490
4491     memset(h->non_zero_count[mb_xy], 0, 16);
4492     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4493
4494     if(MB_FIELD)
4495         mb_type|= MB_TYPE_INTERLACED;
4496
4497     if( h->slice_type_nos == FF_B_TYPE )
4498     {
4499         // just for fill_caches. pred_direct_motion will set the real mb_type
4500         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4501
4502         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4503         pred_direct_motion(h, &mb_type);
4504         mb_type|= MB_TYPE_SKIP;
4505     }
4506     else
4507     {
4508         int mx, my;
4509         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4510
4511         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4512         pred_pskip_motion(h, &mx, &my);
4513         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4514         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4515     }
4516
4517     write_back_motion(h, mb_type);
4518     s->current_picture.mb_type[mb_xy]= mb_type;
4519     s->current_picture.qscale_table[mb_xy]= s->qscale;
4520     h->slice_table[ mb_xy ]= h->slice_num;
4521     h->prev_mb_skipped= 1;
4522 }
4523
/**
 * decodes a macroblock
 * Parses one CAVLC coded macroblock at (s->mb_x, s->mb_y): skip run,
 * macroblock type, intra prediction modes or reference indices and motion
 * vectors, coded block pattern, qp delta and finally the residual blocks.
 * h->mb_xy is set from s->mb_x / s->mb_y on entry.
 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
 *          (note: every error path visible in this function returns -1)
 */
static int decode_mb_cavlc(H264Context *h){
    MpegEncContext * const s = &h->s;
    int mb_xy;
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

    s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)

    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                down the code */
    // skip runs exist only outside of I slices
    if(h->slice_type_nos != FF_I_TYPE){
        // -1 marks that no skip run is pending, so a new one has to be read
        if(s->mb_skip_run==-1)
            s->mb_skip_run= get_ue_golomb(&s->gb);

        if (s->mb_skip_run--) {
            if(FRAME_MBAFF && (s->mb_y&1) == 0){
                // even row with MBAFF: the flag is read when this is the last
                // skipped macroblock of the run, otherwise it is predicted
                if(s->mb_skip_run==0)
                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
                else
                    predict_field_decoding_flag(h);
            }
            decode_mb_skip(h);
            return 0;
        }
    }
    if(FRAME_MBAFF){
        // the field decoding flag is read on even rows only
        if( (s->mb_y&1) == 0 )
            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
    }else
        h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);

    h->prev_mb_skipped= 0;

    // map the coded mb_type through the table of the slice type; values past
    // the inter types of B (23) and P (5) slices select an intra type
    mb_type= get_ue_golomb(&s->gb);
    if(h->slice_type_nos == FF_B_TYPE){
        if(mb_type < 23){
            partition_count= b_mb_type_info[mb_type].partition_count;
            mb_type=         b_mb_type_info[mb_type].type;
        }else{
            mb_type -= 23;
            goto decode_intra_mb;
        }
    }else if(h->slice_type_nos == FF_P_TYPE){
        if(mb_type < 5){
            partition_count= p_mb_type_info[mb_type].partition_count;
            mb_type=         p_mb_type_info[mb_type].type;
        }else{
            mb_type -= 5;
            goto decode_intra_mb;
        }
    }else{
       assert(h->slice_type_nos == FF_I_TYPE);
        if(h->slice_type == FF_SI_TYPE && mb_type)
            mb_type--;
decode_intra_mb:
        if(mb_type > 25){
            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
            return -1;
        }
        partition_count=0;
        // the intra table also supplies cbp and the 16x16 prediction mode
        cbp= i_mb_type_info[mb_type].cbp;
        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
        mb_type= i_mb_type_info[mb_type].type;
    }

    if(MB_FIELD)
        mb_type |= MB_TYPE_INTERLACED;

    h->slice_table[ mb_xy ]= h->slice_num;

    if(IS_INTRA_PCM(mb_type)){
        unsigned int x, y;

        // We assume these blocks are very rare so we do not optimize it.
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
        // 16x16 luma samples
        for(y=0; y<16; y++){
            const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
            for(x=0; x<16; x++){
                tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
            }
        }
        // first 8x8 chroma plane, stored from offset 256
        for(y=0; y<8; y++){
            const int index= 256 + 4*(y&3) + 32*(y>>2);
            for(x=0; x<8; x++){
                tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
            }
        }
        // second 8x8 chroma plane, stored from offset 256 + 64
        for(y=0; y<8; y++){
            const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
            for(x=0; x<8; x++){
                tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
            }
        }

        // In deblocking, the quantizer is 0
        s->current_picture.qscale_table[mb_xy]= 0;
        // All coeffs are present
        memset(h->non_zero_count[mb_xy], 16, 16);

        s->current_picture.mb_type[mb_xy]= mb_type;
        return 0;
    }

    // the reference counts are doubled while an MBAFF macroblock is parsed
    // and halved again before the final return 0; the early error returns
    // in between leave them doubled
    if(MB_MBAFF){
        h->ref_count[0] <<= 1;
        h->ref_count[1] <<= 1;
    }

    fill_caches(h, mb_type, 0);

    //mb_pred
    if(IS_INTRA(mb_type)){
            int pred_mode;
//            init_top_left_availability(h);
            if(IS_INTRA4x4(mb_type)){
                int i;
                int di = 1;
                // with the 8x8 transform one prediction mode is coded per
                // 8x8 block, so the loop below advances 4 blocks at a time
                if(dct8x8_allowed && get_bits1(&s->gb)){
                    mb_type |= MB_TYPE_8x8DCT;
                    di = 4;
                }

//                fill_intra4x4_pred_table(h);
                for(i=0; i<16; i+=di){
                    int mode= pred_intra_mode(h, i);

                    // a set bit keeps the predicted mode; otherwise a 3 bit
                    // value is read that skips over the predicted mode
                    if(!get_bits1(&s->gb)){
                        const int rem_mode= get_bits(&s->gb, 3);
                        mode = rem_mode + (rem_mode >= mode);
                    }

                    if(di==4)
                        fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
                    else
                        h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
                }
                write_back_intra_pred_mode(h);
                if( check_intra4x4_pred_mode(h) < 0)
                    return -1;
            }else{
                h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
                if(h->intra16x16_pred_mode < 0)
                    return -1;
            }

            // chroma prediction mode, coded for every intra macroblock
            pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
            if(pred_mode < 0)
                return -1;
            h->chroma_pred_mode= pred_mode;
    }else if(partition_count==4){
        // four 8x8 sub macroblocks, each with its own sub_mb_type
        int i, j, sub_partition_count[4], list, ref[2][4];

        if(h->slice_type_nos == FF_B_TYPE){
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=13){
                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
                pred_direct_motion(h, &mb_type);
                h->ref_cache[0][scan8[4]] =
                h->ref_cache[1][scan8[4]] =
                h->ref_cache[0][scan8[12]] =
                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
            }
        }else{
            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=4){
                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
        }

        // first pass: all reference indices; direct sub blocks code none
        for(list=0; list<h->list_count; list++){
            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
                    if(tmp>=ref_count){
                        av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
                        return -1;
                    }
                    ref[list][i]= tmp;
                }else{
                 //FIXME
                    ref[list][i] = -1;
                }
            }
        }

        if(dct8x8_allowed)
            dct8x8_allowed = get_dct8x8_allowed(h);

        // second pass: store the references in the cache and decode the
        // motion vector differences of every sub partition
        for(list=0; list<h->list_count; list++){
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) {
                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
                    continue;
                }
                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];

                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    const int sub_mb_type= h->sub_mb_type[i];
                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                    for(j=0; j<sub_partition_count[i]; j++){
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        // replicate the vector over the cache entries that
                        // the sub partition covers
                        if(IS_SUB_8X8(sub_mb_type)){
                            mv_cache[ 1 ][0]=
                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
                            mv_cache[ 1 ][1]=
                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
                        }else if(IS_SUB_8X4(sub_mb_type)){
                            mv_cache[ 1 ][0]= mx;
                            mv_cache[ 1 ][1]= my;
                        }else if(IS_SUB_4X8(sub_mb_type)){
                            mv_cache[ 8 ][0]= mx;
                            mv_cache[ 8 ][1]= my;
                        }
                        mv_cache[ 0 ][0]= mx;
                        mv_cache[ 0 ][1]= my;
                    }
                }else{
                    // list not used by this sub block: zero its four vectors
                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
                    p[0] = p[1]=
                    p[8] = p[9]= 0;
                }
            }
        }
    }else if(IS_DIRECT(mb_type)){
        pred_direct_motion(h, &mb_type);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    }else{
        int list, mx, my, i;
         //FIXME we should set ref_idx_l? to 0 if we use that later ...
        // For each partition shape all reference indices are read first
        // (for every list), then all motion vector differences.
        if(IS_16X16(mb_type)){
            for(list=0; list<h->list_count; list++){
                    unsigned int val;
                    if(IS_DIR(mb_type, 0, list)){
                        val= get_te0_golomb(&s->gb, h->ref_count[list]);
                        if(val >= h->ref_count[list]){
                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                            return -1;
                        }
                    }else
                        val= LIST_NOT_USED&0xFF;
                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
            }
            for(list=0; list<h->list_count; list++){
                unsigned int val;
                if(IS_DIR(mb_type, 0, list)){
                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
                    mx += get_se_golomb(&s->gb);
                    my += get_se_golomb(&s->gb);
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    val= pack16to32(mx,my);
                }else
                    val=0;
                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
            }
        }
        else if(IS_16X8(mb_type)){
            // two partitions, the second one starts 16 cache entries later
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
                }
            }
        }else{
            assert(IS_8X16(mb_type));
            // two partitions, the second one starts 2 cache entries later
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
                }
            }
        }
    }

    if(IS_INTER(mb_type))
        write_back_motion(h, mb_type);

    // intra 16x16 took its cbp from i_mb_type_info above; all other types
    // code it explicitly and map it through a type dependent table
    if(!IS_INTRA16x16(mb_type)){
        cbp= get_ue_golomb(&s->gb);
        if(cbp > 47){
            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
            return -1;
        }

        if(IS_INTRA4x4(mb_type))
            cbp= golomb_to_intra4x4_cbp[cbp];
        else
            cbp= golomb_to_inter_cbp[cbp];
    }
    h->cbp = cbp;

    // non intra macroblocks with coded luma read the 8x8 transform flag here
    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
        if(get_bits1(&s->gb))
            mb_type |= MB_TYPE_8x8DCT;
    }
    s->current_picture.mb_type[mb_xy]= mb_type;

    if(cbp || IS_INTRA16x16(mb_type)){
        int i8x8, i4x4, chroma_idx;
        int dquant;
        // intra and inter residuals may be read through different bit readers
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;

//        fill_non_zero_count_cache(h);

        // pick field or zigzag scan tables; the _q0 variants are used when
        // s->qscale is 0 (note: chosen before dquant is applied below)
        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);

        if( dquant > 25 || dquant < -26 ){
            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
            return -1;
        }

        // qscale wraps around within 0..51
        s->qscale += dquant;
        if(((unsigned)s->qscale) > 51){
            if(s->qscale<0) s->qscale+= 52;
            else            s->qscale-= 52;
        }

        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            // luma DC block first, then up to 16 AC blocks of 15 coefficients
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

            assert((cbp&15) == 0 || (cbp&15) == 15);

            if(cbp&15){
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
                }
            }else{
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
            }
        }else{
            // one cbp bit per 8x8 luma block
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
                        // an 8x8 block is coded as four 16 coefficient parts
                        // that all write into the same 64 entry buffer
                        DCTELEM *buf = &h->mb[64*i8x8];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                return -1;
                        }
                        // sum the four counts into the first cache entry
                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                    }else{
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
                    }
                }else{
                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
                }
            }
        }

        // cbp bits 0x30: chroma DC is coded, no dequantization table is passed
        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }

        // cbp bit 0x20: chroma AC is coded, blocks 16..23
        if(cbp&0x20){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
            }
        }else{
            uint8_t * const nnz= &h->non_zero_count_cache[0];
            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
        }
    }else{
        // nothing coded: clear the luma and chroma non zero counts
        uint8_t * const nnz= &h->non_zero_count_cache[0];
        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
    }
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    write_back_non_zero_count(h);

    // undo the doubling done before fill_caches
    if(MB_MBAFF){
        h->ref_count[0] >>= 1;
        h->ref_count[1] >>= 1;
    }

    return 0;
}
5023
5024 static int decode_cabac_field_decoding_flag(H264Context *h) {
5025     MpegEncContext * const s = &h->s;
5026     const int mb_x = s->mb_x;
5027     const int mb_y = s->mb_y & ~1;
5028     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5029     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5030
5031     unsigned int ctx = 0;
5032
5033     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5034         ctx += 1;
5035     }
5036     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5037         ctx += 1;
5038     }
5039
5040     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5041 }
5042
5043 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5044     uint8_t *state= &h->cabac_state[ctx_base];
5045     int mb_type;
5046
5047     if(intra_slice){
5048         MpegEncContext * const s = &h->s;
5049         const int mba_xy = h->left_mb_xy[0];
5050         const int mbb_xy = h->top_mb_xy;
5051         int ctx=0;
5052         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5053             ctx++;
5054         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5055             ctx++;
5056         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5057             return 0;   /* I4x4 */
5058         state += 2;
5059     }else{
5060         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5061             return 0;   /* I4x4 */
5062     }
5063
5064     if( get_cabac_terminate( &h->cabac ) )
5065         return 25;  /* PCM */
5066
5067     mb_type = 1; /* I16x16 */
5068     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5069     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5070         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5071     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5072     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5073     return mb_type;
5074 }
5075
5076 static int decode_cabac_mb_type( H264Context *h ) {
5077     MpegEncContext * const s = &h->s;
5078
5079     if( h->slice_type_nos == FF_I_TYPE ) {
5080         return decode_cabac_intra_mb_type(h, 3, 1);
5081     } else if( h->slice_type_nos == FF_P_TYPE ) {
5082         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5083             /* P-type */
5084             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5085                 /* P_L0_D16x16, P_8x8 */
5086                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5087             } else {
5088                 /* P_L0_D8x16, P_L0_D16x8 */
5089                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5090             }
5091         } else {
5092             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5093         }
5094     } else if( h->slice_type_nos == FF_B_TYPE ) {
5095         const int mba_xy = h->left_mb_xy[0];
5096         const int mbb_xy = h->top_mb_xy;
5097         int ctx = 0;
5098         int bits;
5099
5100         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5101             ctx++;
5102         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5103             ctx++;
5104
5105         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5106             return 0; /* B_Direct_16x16 */
5107
5108         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5109             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5110         }
5111
5112         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5113         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5114         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5115         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5116         if( bits < 8 )
5117             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5118         else if( bits == 13 ) {
5119             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5120         } else if( bits == 14 )
5121             return 11; /* B_L1_L0_8x16 */
5122         else if( bits == 15 )
5123             return 22; /* B_8x8 */
5124
5125         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5126         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5127     } else {
5128         /* TODO SI/SP frames? */
5129         return -1;
5130     }
5131 }
5132
5133 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5134     MpegEncContext * const s = &h->s;
5135     int mba_xy, mbb_xy;
5136     int ctx = 0;
5137
5138     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5139         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5140         mba_xy = mb_xy - 1;
5141         if( (mb_y&1)
5142             && h->slice_table[mba_xy] == h->slice_num
5143             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5144             mba_xy += s->mb_stride;
5145         if( MB_FIELD ){
5146             mbb_xy = mb_xy - s->mb_stride;
5147             if( !(mb_y&1)
5148                 && h->slice_table[mbb_xy] == h->slice_num
5149                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5150                 mbb_xy -= s->mb_stride;
5151         }else
5152             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5153     }else{
5154         int mb_xy = h->mb_xy;
5155         mba_xy = mb_xy - 1;
5156         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5157     }
5158
5159     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5160         ctx++;
5161     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5162         ctx++;
5163
5164     if( h->slice_type_nos == FF_B_TYPE )
5165         ctx += 13;
5166     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5167 }
5168
5169 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5170     int mode = 0;
5171
5172     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5173         return pred_mode;
5174
5175     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5176     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5177     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5178
5179     if( mode >= pred_mode )
5180         return mode + 1;
5181     else
5182         return mode;
5183 }
5184
5185 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5186     const int mba_xy = h->left_mb_xy[0];
5187     const int mbb_xy = h->top_mb_xy;
5188
5189     int ctx = 0;
5190
5191     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5192     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5193         ctx++;
5194
5195     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5196         ctx++;
5197
5198     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5199         return 0;
5200
5201     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5202         return 1;
5203     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5204         return 2;
5205     else
5206         return 3;
5207 }
5208
5209 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5210     int cbp_b, cbp_a, ctx, cbp = 0;
5211
5212     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5213     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5214
5215     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5216     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5217     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5218     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5219     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5220     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5221     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5222     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5223     return cbp;
5224 }
5225 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5226     int ctx;
5227     int cbp_a, cbp_b;
5228
5229     cbp_a = (h->left_cbp>>4)&0x03;
5230     cbp_b = (h-> top_cbp>>4)&0x03;
5231
5232     ctx = 0;
5233     if( cbp_a > 0 ) ctx++;
5234     if( cbp_b > 0 ) ctx += 2;
5235     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5236         return 0;
5237
5238     ctx = 4;
5239     if( cbp_a == 2 ) ctx++;
5240     if( cbp_b == 2 ) ctx += 2;
5241     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5242 }
5243 static int decode_cabac_mb_dqp( H264Context *h) {
5244     int   ctx = 0;
5245     int   val = 0;
5246
5247     if( h->last_qscale_diff != 0 )
5248         ctx++;
5249
5250     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5251         if( ctx < 2 )
5252             ctx = 2;
5253         else
5254             ctx = 3;
5255         val++;
5256         if(val > 102) //prevent infinite loop
5257             return INT_MIN;
5258     }
5259
5260     if( val&0x01 )
5261         return (val + 1)/2;
5262     else
5263         return -(val + 1)/2;
5264 }
5265 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5266     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5267         return 0;   /* 8x8 */
5268     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5269         return 1;   /* 8x4 */
5270     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5271         return 2;   /* 4x8 */
5272     return 3;       /* 4x4 */
5273 }
5274 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5275     int type;
5276     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5277         return 0;   /* B_Direct_8x8 */
5278     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5279         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5280     type = 3;
5281     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5282         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5283             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5284         type += 4;
5285     }
5286     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5287     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5288     return type;
5289 }
5290
5291 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5292     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5293 }
5294
5295 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5296     int refa = h->ref_cache[list][scan8[n] - 1];
5297     int refb = h->ref_cache[list][scan8[n] - 8];
5298     int ref  = 0;
5299     int ctx  = 0;
5300
5301     if( h->slice_type_nos == FF_B_TYPE) {
5302         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5303             ctx++;
5304         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5305             ctx += 2;
5306     } else {
5307         if( refa > 0 )
5308             ctx++;
5309         if( refb > 0 )
5310             ctx += 2;
5311     }
5312
5313     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5314         ref++;
5315         if( ctx < 4 )
5316             ctx = 4;
5317         else
5318             ctx = 5;
5319         if(ref >= 32 /*h->ref_list[list]*/){
5320             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5321             return 0; //FIXME we should return -1 and check the return everywhere
5322         }
5323     }
5324     return ref;
5325 }
5326
5327 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5328     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5329                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5330     int ctxbase = (l == 0) ? 40 : 47;
5331     int ctx, mvd;
5332
5333     if( amvd < 3 )
5334         ctx = 0;
5335     else if( amvd > 32 )
5336         ctx = 2;
5337     else
5338         ctx = 1;
5339
5340     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5341         return 0;
5342
5343     mvd= 1;
5344     ctx= 3;
5345     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5346         mvd++;
5347         if( ctx < 6 )
5348             ctx++;
5349     }
5350
5351     if( mvd >= 9 ) {
5352         int k = 3;
5353         while( get_cabac_bypass( &h->cabac ) ) {
5354             mvd += 1 << k;
5355             k++;
5356             if(k>24){
5357                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5358                 return INT_MIN;
5359             }
5360         }
5361         while( k-- ) {
5362             if( get_cabac_bypass( &h->cabac ) )
5363                 mvd += 1 << k;
5364         }
5365     }
5366     return get_cabac_bypass_sign( &h->cabac, -mvd );
5367 }
5368
5369 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5370     int nza, nzb;
5371     int ctx = 0;
5372
5373     if( is_dc ) {
5374         if( cat == 0 ) {
5375             nza = h->left_cbp&0x100;
5376             nzb = h-> top_cbp&0x100;
5377         } else {
5378             nza = (h->left_cbp>>(6+idx))&0x01;
5379             nzb = (h-> top_cbp>>(6+idx))&0x01;
5380         }
5381     } else {
5382         if( cat == 4 ) {
5383             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5384             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5385         } else {
5386             assert(cat == 1 || cat == 2);
5387             nza = h->non_zero_count_cache[scan8[idx] - 1];
5388             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5389         }
5390     }
5391
5392     if( nza > 0 )
5393         ctx++;
5394
5395     if( nzb > 0 )
5396         ctx += 2;
5397
5398     return ctx + 4 * cat;
5399 }
5400
5401 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5402     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5403     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5404     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5405     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5406 };
5407
5408 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5409     static const int significant_coeff_flag_offset[2][6] = {
5410       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5411       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5412     };
5413     static const int last_coeff_flag_offset[2][6] = {
5414       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5415       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5416     };
5417     static const int coeff_abs_level_m1_offset[6] = {
5418         227+0, 227+10, 227+20, 227+30, 227+39, 426
5419     };
5420     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5421       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5422         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5423         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5424        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5425       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5426         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5427         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5428         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5429     };
5430     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5431      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5432      * map node ctx => cabac ctx for level=1 */
5433     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5434     /* map node ctx => cabac ctx for level>1 */
5435     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5436     static const uint8_t coeff_abs_level_transition[2][8] = {
5437     /* update node ctx after decoding a level=1 */
5438         { 1, 2, 3, 3, 4, 5, 6, 7 },
5439     /* update node ctx after decoding a level>1 */
5440         { 4, 4, 4, 4, 5, 6, 7, 7 }
5441     };
5442
5443     int index[64];
5444
5445     int av_unused last;
5446     int coeff_count = 0;
5447     int node_ctx = 0;
5448
5449     uint8_t *significant_coeff_ctx_base;
5450     uint8_t *last_coeff_ctx_base;
5451     uint8_t *abs_level_m1_ctx_base;
5452
5453 #ifndef ARCH_X86
5454 #define CABAC_ON_STACK
5455 #endif
5456 #ifdef CABAC_ON_STACK
5457 #define CC &cc
5458     CABACContext cc;
5459     cc.range     = h->cabac.range;
5460     cc.low       = h->cabac.low;
5461     cc.bytestream= h->cabac.bytestream;
5462 #else
5463 #define CC &h->cabac
5464 #endif
5465
5466
5467     /* cat: 0-> DC 16x16  n = 0
5468      *      1-> AC 16x16  n = luma4x4idx
5469      *      2-> Luma4x4   n = luma4x4idx
5470      *      3-> DC Chroma n = iCbCr
5471      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5472      *      5-> Luma8x8   n = 4 * luma8x8idx
5473      */
5474
5475     /* read coded block flag */
5476     if( is_dc || cat != 5 ) {
5477         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5478             if( !is_dc ) {
5479                 if( cat == 4 )
5480                     h->non_zero_count_cache[scan8[16+n]] = 0;
5481                 else
5482                     h->non_zero_count_cache[scan8[n]] = 0;
5483             }
5484
5485 #ifdef CABAC_ON_STACK
5486             h->cabac.range     = cc.range     ;
5487             h->cabac.low       = cc.low       ;
5488             h->cabac.bytestream= cc.bytestream;
5489 #endif
5490             return;
5491         }
5492     }
5493
5494     significant_coeff_ctx_base = h->cabac_state
5495         + significant_coeff_flag_offset[MB_FIELD][cat];
5496     last_coeff_ctx_base = h->cabac_state
5497         + last_coeff_flag_offset[MB_FIELD][cat];
5498     abs_level_m1_ctx_base = h->cabac_state
5499         + coeff_abs_level_m1_offset[cat];
5500
5501     if( !is_dc && cat == 5 ) {
5502 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5503         for(last= 0; last < coefs; last++) { \
5504             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5505             if( get_cabac( CC, sig_ctx )) { \
5506                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5507                 index[coeff_count++] = last; \
5508                 if( get_cabac( CC, last_ctx ) ) { \
5509                     last= max_coeff; \
5510                     break; \
5511                 } \
5512             } \
5513         }\
5514         if( last == max_coeff -1 ) {\
5515             index[coeff_count++] = last;\
5516         }
5517         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5518 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5519         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5520     } else {
5521         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5522 #else
5523         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5524     } else {
5525         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5526 #endif
5527     }
5528     assert(coeff_count > 0);
5529
5530     if( is_dc ) {
5531         if( cat == 0 )
5532             h->cbp_table[h->mb_xy] |= 0x100;
5533         else
5534             h->cbp_table[h->mb_xy] |= 0x40 << n;
5535     } else {
5536         if( cat == 5 )
5537             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5538         else if( cat == 4 )
5539             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5540         else {
5541             assert( cat == 1 || cat == 2 );
5542             h->non_zero_count_cache[scan8[n]] = coeff_count;
5543         }
5544     }
5545
5546     while( coeff_count-- ) {
5547         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5548
5549         int j= scantable[index[coeff_count]];
5550
5551         if( get_cabac( CC, ctx ) == 0 ) {
5552             node_ctx = coeff_abs_level_transition[0][node_ctx];
5553             if( is_dc ) {
5554                 block[j] = get_cabac_bypass_sign( CC, -1);
5555             }else{
5556                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5557             }
5558         } else {
5559             int coeff_abs = 2;
5560             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5561             node_ctx = coeff_abs_level_transition[1][node_ctx];
5562
5563             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5564                 coeff_abs++;
5565             }
5566
5567             if( coeff_abs >= 15 ) {
5568                 int j = 0;
5569                 while( get_cabac_bypass( CC ) ) {
5570                     j++;
5571                 }
5572
5573                 coeff_abs=1;
5574                 while( j-- ) {
5575                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5576                 }
5577                 coeff_abs+= 14;
5578             }
5579
5580             if( is_dc ) {
5581                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5582             }else{
5583                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5584             }
5585         }
5586     }
5587 #ifdef CABAC_ON_STACK
5588             h->cabac.range     = cc.range     ;
5589             h->cabac.low       = cc.low       ;
5590             h->cabac.bytestream= cc.bytestream;
5591 #endif
5592
5593 }
5594
5595 #ifndef CONFIG_SMALL
5596 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5597     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5598 }
5599
5600 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5601     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5602 }
5603 #endif
5604
5605 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5606 #ifdef CONFIG_SMALL
5607     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5608 #else
5609     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5610     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5611 #endif
5612 }
5613
5614 static inline void compute_mb_neighbors(H264Context *h)
5615 {
5616     MpegEncContext * const s = &h->s;
5617     const int mb_xy  = h->mb_xy;
5618     h->top_mb_xy     = mb_xy - s->mb_stride;
5619     h->left_mb_xy[0] = mb_xy - 1;
5620     if(FRAME_MBAFF){
5621         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5622         const int top_pair_xy      = pair_xy     - s->mb_stride;
5623         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5624         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5625         const int curr_mb_frame_flag = !MB_FIELD;
5626         const int bottom = (s->mb_y & 1);
5627         if (bottom
5628                 ? !curr_mb_frame_flag // bottom macroblock
5629                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5630                 ) {
5631             h->top_mb_xy -= s->mb_stride;
5632         }
5633         if (left_mb_frame_flag != curr_mb_frame_flag) {
5634             h->left_mb_xy[0] = pair_xy - 1;
5635         }
5636     } else if (FIELD_PICTURE) {
5637         h->top_mb_xy -= s->mb_stride;
5638     }
5639     return;
5640 }
5641
5642 /**
5643  * decodes a macroblock
5644  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5645  */
5646 static int decode_mb_cabac(H264Context *h) {
5647     MpegEncContext * const s = &h->s;
5648     int mb_xy;
5649     int mb_type, partition_count, cbp = 0;
5650     int dct8x8_allowed= h->pps.transform_8x8_mode;
5651
5652     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5653
5654     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5655
5656     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5657     if( h->slice_type_nos != FF_I_TYPE ) {
5658         int skip;
5659         /* a skipped mb needs the aff flag from the following mb */
5660         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5661             predict_field_decoding_flag(h);
5662         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5663             skip = h->next_mb_skipped;
5664         else
5665             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5666         /* read skip flags */
5667         if( skip ) {
5668             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5669                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5670                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5671                 if(h->next_mb_skipped)
5672                     predict_field_decoding_flag(h);
5673                 else
5674                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5675             }
5676
5677             decode_mb_skip(h);
5678
5679             h->cbp_table[mb_xy] = 0;
5680             h->chroma_pred_mode_table[mb_xy] = 0;
5681             h->last_qscale_diff = 0;
5682
5683             return 0;
5684
5685         }
5686     }
5687     if(FRAME_MBAFF){
5688         if( (s->mb_y&1) == 0 )
5689             h->mb_mbaff =
5690             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5691     }else
5692         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5693
5694     h->prev_mb_skipped = 0;
5695
5696     compute_mb_neighbors(h);
5697     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5698         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5699         return -1;
5700     }
5701
5702     if( h->slice_type_nos == FF_B_TYPE ) {
5703         if( mb_type < 23 ){
5704             partition_count= b_mb_type_info[mb_type].partition_count;
5705             mb_type=         b_mb_type_info[mb_type].type;
5706         }else{
5707             mb_type -= 23;
5708             goto decode_intra_mb;
5709         }
5710     } else if( h->slice_type_nos == FF_P_TYPE ) {
5711         if( mb_type < 5) {
5712             partition_count= p_mb_type_info[mb_type].partition_count;
5713             mb_type=         p_mb_type_info[mb_type].type;
5714         } else {
5715             mb_type -= 5;
5716             goto decode_intra_mb;
5717         }
5718     } else {
5719         if(h->slice_type == FF_SI_TYPE && mb_type)
5720             mb_type--;
5721         assert(h->slice_type_nos == FF_I_TYPE);
5722 decode_intra_mb:
5723         partition_count = 0;
5724         cbp= i_mb_type_info[mb_type].cbp;
5725         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5726         mb_type= i_mb_type_info[mb_type].type;
5727     }
5728     if(MB_FIELD)
5729         mb_type |= MB_TYPE_INTERLACED;
5730
5731     h->slice_table[ mb_xy ]= h->slice_num;
5732
5733     if(IS_INTRA_PCM(mb_type)) {
5734         const uint8_t *ptr;
5735         unsigned int x, y;
5736
5737         // We assume these blocks are very rare so we do not optimize it.
5738         // FIXME The two following lines get the bitstream position in the cabac
5739         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5740         ptr= h->cabac.bytestream;
5741         if(h->cabac.low&0x1) ptr--;
5742         if(CABAC_BITS==16){
5743             if(h->cabac.low&0x1FF) ptr--;
5744         }
5745
5746         // The pixels are stored in the same order as levels in h->mb array.
5747         for(y=0; y<16; y++){
5748             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5749             for(x=0; x<16; x++){
5750                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5751                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5752             }
5753         }
5754         for(y=0; y<8; y++){
5755             const int index= 256 + 4*(y&3) + 32*(y>>2);
5756             for(x=0; x<8; x++){
5757                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5758                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5759             }
5760         }
5761         for(y=0; y<8; y++){
5762             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5763             for(x=0; x<8; x++){
5764                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5765                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5766             }
5767         }
5768
5769         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5770
5771         // All blocks are present
5772         h->cbp_table[mb_xy] = 0x1ef;
5773         h->chroma_pred_mode_table[mb_xy] = 0;
5774         // In deblocking, the quantizer is 0
5775         s->current_picture.qscale_table[mb_xy]= 0;
5776         // All coeffs are present
5777         memset(h->non_zero_count[mb_xy], 16, 16);
5778         s->current_picture.mb_type[mb_xy]= mb_type;
5779         h->last_qscale_diff = 0;
5780         return 0;
5781     }
5782
5783     if(MB_MBAFF){
5784         h->ref_count[0] <<= 1;
5785         h->ref_count[1] <<= 1;
5786     }
5787
5788     fill_caches(h, mb_type, 0);
5789
5790     if( IS_INTRA( mb_type ) ) {
5791         int i, pred_mode;
5792         if( IS_INTRA4x4( mb_type ) ) {
5793             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5794                 mb_type |= MB_TYPE_8x8DCT;
5795                 for( i = 0; i < 16; i+=4 ) {
5796                     int pred = pred_intra_mode( h, i );
5797                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5798                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5799                 }
5800             } else {
5801                 for( i = 0; i < 16; i++ ) {
5802                     int pred = pred_intra_mode( h, i );
5803                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5804
5805                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5806                 }
5807             }
5808             write_back_intra_pred_mode(h);
5809             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5810         } else {
5811             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5812             if( h->intra16x16_pred_mode < 0 ) return -1;
5813         }
5814         h->chroma_pred_mode_table[mb_xy] =
5815         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5816
5817         pred_mode= check_intra_pred_mode( h, pred_mode );
5818         if( pred_mode < 0 ) return -1;
5819         h->chroma_pred_mode= pred_mode;
5820     } else if( partition_count == 4 ) {
5821         int i, j, sub_partition_count[4], list, ref[2][4];
5822
5823         if( h->slice_type_nos == FF_B_TYPE ) {
5824             for( i = 0; i < 4; i++ ) {
5825                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5826                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5827                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5828             }
5829             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5830                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5831                 pred_direct_motion(h, &mb_type);
5832                 h->ref_cache[0][scan8[4]] =
5833                 h->ref_cache[1][scan8[4]] =
5834                 h->ref_cache[0][scan8[12]] =
5835                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5836                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5837                     for( i = 0; i < 4; i++ )
5838                         if( IS_DIRECT(h->sub_mb_type[i]) )
5839                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5840                 }
5841             }
5842         } else {
5843             for( i = 0; i < 4; i++ ) {
5844                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5845                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5846                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5847             }
5848         }
5849
5850         for( list = 0; list < h->list_count; list++ ) {
5851                 for( i = 0; i < 4; i++ ) {
5852                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5853                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5854                         if( h->ref_count[list] > 1 )
5855                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5856                         else
5857                             ref[list][i] = 0;
5858                     } else {
5859                         ref[list][i] = -1;
5860                     }
5861                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5862                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5863                 }
5864         }
5865
5866         if(dct8x8_allowed)
5867             dct8x8_allowed = get_dct8x8_allowed(h);
5868
5869         for(list=0; list<h->list_count; list++){
5870             for(i=0; i<4; i++){
5871                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5872                 if(IS_DIRECT(h->sub_mb_type[i])){
5873                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5874                     continue;
5875                 }
5876
5877                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5878                     const int sub_mb_type= h->sub_mb_type[i];
5879                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5880                     for(j=0; j<sub_partition_count[i]; j++){
5881                         int mpx, mpy;
5882                         int mx, my;
5883                         const int index= 4*i + block_width*j;
5884                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5885                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5886                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5887
5888                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5889                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5890                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5891
5892                         if(IS_SUB_8X8(sub_mb_type)){
5893                             mv_cache[ 1 ][0]=
5894                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5895                             mv_cache[ 1 ][1]=
5896                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5897
5898                             mvd_cache[ 1 ][0]=
5899                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5900                             mvd_cache[ 1 ][1]=
5901                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5902                         }else if(IS_SUB_8X4(sub_mb_type)){
5903                             mv_cache[ 1 ][0]= mx;
5904                             mv_cache[ 1 ][1]= my;
5905
5906                             mvd_cache[ 1 ][0]= mx - mpx;
5907                             mvd_cache[ 1 ][1]= my - mpy;
5908                         }else if(IS_SUB_4X8(sub_mb_type)){
5909                             mv_cache[ 8 ][0]= mx;
5910                             mv_cache[ 8 ][1]= my;
5911
5912                             mvd_cache[ 8 ][0]= mx - mpx;
5913                             mvd_cache[ 8 ][1]= my - mpy;
5914                         }
5915                         mv_cache[ 0 ][0]= mx;
5916                         mv_cache[ 0 ][1]= my;
5917
5918                         mvd_cache[ 0 ][0]= mx - mpx;
5919                         mvd_cache[ 0 ][1]= my - mpy;
5920                     }
5921                 }else{
5922                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5923                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5924                     p[0] = p[1] = p[8] = p[9] = 0;
5925                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5926                 }
5927             }
5928         }
5929     } else if( IS_DIRECT(mb_type) ) {
5930         pred_direct_motion(h, &mb_type);
5931         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5932         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5933         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5934     } else {
5935         int list, mx, my, i, mpx, mpy;
5936         if(IS_16X16(mb_type)){
5937             for(list=0; list<h->list_count; list++){
5938                 if(IS_DIR(mb_type, 0, list)){
5939                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5940                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5941                 }else
5942                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5943             }
5944             for(list=0; list<h->list_count; list++){
5945                 if(IS_DIR(mb_type, 0, list)){
5946                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5947
5948                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5949                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5950                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5951
5952                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5953                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5954                 }else
5955                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5956             }
5957         }
5958         else if(IS_16X8(mb_type)){
5959             for(list=0; list<h->list_count; list++){
5960                     for(i=0; i<2; i++){
5961                         if(IS_DIR(mb_type, i, list)){
5962                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5963                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5964                         }else
5965                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5966                     }
5967             }
5968             for(list=0; list<h->list_count; list++){
5969                 for(i=0; i<2; i++){
5970                     if(IS_DIR(mb_type, i, list)){
5971                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5972                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5973                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5974                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5975
5976                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5977                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5978                     }else{
5979                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5980                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5981                     }
5982                 }
5983             }
5984         }else{
5985             assert(IS_8X16(mb_type));
5986             for(list=0; list<h->list_count; list++){
5987                     for(i=0; i<2; i++){
5988                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5989                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5990                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5991                         }else
5992                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5993                     }
5994             }
5995             for(list=0; list<h->list_count; list++){
5996                 for(i=0; i<2; i++){
5997                     if(IS_DIR(mb_type, i, list)){
5998                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5999                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6000                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6001
6002                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6003                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6004                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6005                     }else{
6006                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6007                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6008                     }
6009                 }
6010             }
6011         }
6012     }
6013
6014    if( IS_INTER( mb_type ) ) {
6015         h->chroma_pred_mode_table[mb_xy] = 0;
6016         write_back_motion( h, mb_type );
6017    }
6018
6019     if( !IS_INTRA16x16( mb_type ) ) {
6020         cbp  = decode_cabac_mb_cbp_luma( h );
6021         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6022     }
6023
6024     h->cbp_table[mb_xy] = h->cbp = cbp;
6025
6026     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6027         if( decode_cabac_mb_transform_size( h ) )
6028             mb_type |= MB_TYPE_8x8DCT;
6029     }
6030     s->current_picture.mb_type[mb_xy]= mb_type;
6031
6032     if( cbp || IS_INTRA16x16( mb_type ) ) {
6033         const uint8_t *scan, *scan8x8, *dc_scan;
6034         const uint32_t *qmul;
6035         int dqp;
6036
6037         if(IS_INTERLACED(mb_type)){
6038             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6039             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6040             dc_scan= luma_dc_field_scan;
6041         }else{
6042             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6043             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6044             dc_scan= luma_dc_zigzag_scan;
6045         }
6046
6047         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6048         if( dqp == INT_MIN ){
6049             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6050             return -1;
6051         }
6052         s->qscale += dqp;
6053         if(((unsigned)s->qscale) > 51){
6054             if(s->qscale<0) s->qscale+= 52;
6055             else            s->qscale-= 52;
6056         }
6057         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6058         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6059
6060         if( IS_INTRA16x16( mb_type ) ) {
6061             int i;
6062             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6063             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6064
6065             if( cbp&15 ) {
6066                 qmul = h->dequant4_coeff[0][s->qscale];
6067                 for( i = 0; i < 16; i++ ) {
6068                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6069                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6070                 }
6071             } else {
6072                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6073             }
6074         } else {
6075             int i8x8, i4x4;
6076             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6077                 if( cbp & (1<<i8x8) ) {
6078                     if( IS_8x8DCT(mb_type) ) {
6079                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6080                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6081                     } else {
6082                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6083                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6084                             const int index = 4*i8x8 + i4x4;
6085                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6086 //START_TIMER
6087                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6088 //STOP_TIMER("decode_residual")
6089                         }
6090                     }
6091                 } else {
6092                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6093                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6094                 }
6095             }
6096         }
6097
6098         if( cbp&0x30 ){
6099             int c;
6100             for( c = 0; c < 2; c++ ) {
6101                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6102                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6103             }
6104         }
6105
6106         if( cbp&0x20 ) {
6107             int c, i;
6108             for( c = 0; c < 2; c++ ) {
6109                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6110                 for( i = 0; i < 4; i++ ) {
6111                     const int index = 16 + 4 * c + i;
6112                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6113                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6114                 }
6115             }
6116         } else {
6117             uint8_t * const nnz= &h->non_zero_count_cache[0];
6118             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6119             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6120         }
6121     } else {
6122         uint8_t * const nnz= &h->non_zero_count_cache[0];
6123         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6124         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6125         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6126         h->last_qscale_diff = 0;
6127     }
6128
6129     s->current_picture.qscale_table[mb_xy]= s->qscale;
6130     write_back_non_zero_count(h);
6131
6132     if(MB_MBAFF){
6133         h->ref_count[0] >>= 1;
6134         h->ref_count[1] >>= 1;
6135     }
6136
6137     return 0;
6138 }
6139
6140
6141 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6142     int i, d;
6143     const int index_a = qp + h->slice_alpha_c0_offset;
6144     const int alpha = (alpha_table+52)[index_a];
6145     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6146
6147     if( bS[0] < 4 ) {
6148         int8_t tc[4];
6149         for(i=0; i<4; i++)
6150             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6151         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6152     } else {
6153         /* 16px edge length, because bS=4 is triggered by being at
6154          * the edge of an intra MB, so all 4 bS are the same */
6155             for( d = 0; d < 16; d++ ) {
6156                 const int p0 = pix[-1];
6157                 const int p1 = pix[-2];
6158                 const int p2 = pix[-3];
6159
6160                 const int q0 = pix[0];
6161                 const int q1 = pix[1];
6162                 const int q2 = pix[2];
6163
6164                 if( FFABS( p0 - q0 ) < alpha &&
6165                     FFABS( p1 - p0 ) < beta &&
6166                     FFABS( q1 - q0 ) < beta ) {
6167
6168                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6169                         if( FFABS( p2 - p0 ) < beta)
6170                         {
6171                             const int p3 = pix[-4];
6172                             /* p0', p1', p2' */
6173                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6174                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6175                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6176                         } else {
6177                             /* p0' */
6178                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6179                         }
6180                         if( FFABS( q2 - q0 ) < beta)
6181                         {
6182                             const int q3 = pix[3];
6183                             /* q0', q1', q2' */
6184                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6185                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6186                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6187                         } else {
6188                             /* q0' */
6189                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6190                         }
6191                     }else{
6192                         /* p0', q0' */
6193                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6194                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6195                     }
6196                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6197                 }
6198                 pix += stride;
6199             }
6200     }
6201 }
6202 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6203     int i;
6204     const int index_a = qp + h->slice_alpha_c0_offset;
6205     const int alpha = (alpha_table+52)[index_a];
6206     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6207
6208     if( bS[0] < 4 ) {
6209         int8_t tc[4];
6210         for(i=0; i<4; i++)
6211             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6212         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6213     } else {
6214         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6215     }
6216 }
6217
6218 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6219     int i;
6220     for( i = 0; i < 16; i++, pix += stride) {
6221         int index_a;
6222         int alpha;
6223         int beta;
6224
6225         int qp_index;
6226         int bS_index = (i >> 1);
6227         if (!MB_FIELD) {
6228             bS_index &= ~1;
6229             bS_index |= (i & 1);
6230         }
6231
6232         if( bS[bS_index] == 0 ) {
6233             continue;
6234         }
6235
6236         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6237         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6238         alpha = (alpha_table+52)[index_a];
6239         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6240
6241         if( bS[bS_index] < 4 ) {
6242             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6243             const int p0 = pix[-1];
6244             const int p1 = pix[-2];
6245             const int p2 = pix[-3];
6246             const int q0 = pix[0];
6247             const int q1 = pix[1];
6248             const int q2 = pix[2];
6249
6250             if( FFABS( p0 - q0 ) < alpha &&
6251                 FFABS( p1 - p0 ) < beta &&
6252                 FFABS( q1 - q0 ) < beta ) {
6253                 int tc = tc0;
6254                 int i_delta;
6255
6256                 if( FFABS( p2 - p0 ) < beta ) {
6257                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6258                     tc++;
6259                 }
6260                 if( FFABS( q2 - q0 ) < beta ) {
6261                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6262                     tc++;
6263                 }
6264
6265                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6266                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6267                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6268                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6269             }
6270         }else{
6271             const int p0 = pix[-1];
6272             const int p1 = pix[-2];
6273             const int p2 = pix[-3];
6274
6275             const int q0 = pix[0];
6276             const int q1 = pix[1];
6277             const int q2 = pix[2];
6278
6279             if( FFABS( p0 - q0 ) < alpha &&
6280                 FFABS( p1 - p0 ) < beta &&
6281                 FFABS( q1 - q0 ) < beta ) {
6282
6283                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6284                     if( FFABS( p2 - p0 ) < beta)
6285                     {
6286                         const int p3 = pix[-4];
6287                         /* p0', p1', p2' */
6288                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6289                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6290                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6291                     } else {
6292                         /* p0' */
6293                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6294                     }
6295                     if( FFABS( q2 - q0 ) < beta)
6296                     {
6297                         const int q3 = pix[3];
6298                         /* q0', q1', q2' */
6299                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6300                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6301                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6302                     } else {
6303                         /* q0' */
6304                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6305                     }
6306                 }else{
6307                     /* p0', q0' */
6308                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6309                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6310                 }
6311                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6312             }
6313         }
6314     }
6315 }
6316 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6317     int i;
6318     for( i = 0; i < 8; i++, pix += stride) {
6319         int index_a;
6320         int alpha;
6321         int beta;
6322
6323         int qp_index;
6324         int bS_index = i;
6325
6326         if( bS[bS_index] == 0 ) {
6327             continue;
6328         }
6329
6330         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6331         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6332         alpha = (alpha_table+52)[index_a];
6333         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6334
6335         if( bS[bS_index] < 4 ) {
6336             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6337             const int p0 = pix[-1];
6338             const int p1 = pix[-2];
6339             const int q0 = pix[0];
6340             const int q1 = pix[1];
6341
6342             if( FFABS( p0 - q0 ) < alpha &&
6343                 FFABS( p1 - p0 ) < beta &&
6344                 FFABS( q1 - q0 ) < beta ) {
6345                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6346
6347                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6348                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6349                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6350             }
6351         }else{
6352             const int p0 = pix[-1];
6353             const int p1 = pix[-2];
6354             const int q0 = pix[0];
6355             const int q1 = pix[1];
6356
6357             if( FFABS( p0 - q0 ) < alpha &&
6358                 FFABS( p1 - p0 ) < beta &&
6359                 FFABS( q1 - q0 ) < beta ) {
6360
6361                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6362                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6363                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6364             }
6365         }
6366     }
6367 }
6368
6369 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6370     int i, d;
6371     const int index_a = qp + h->slice_alpha_c0_offset;
6372     const int alpha = (alpha_table+52)[index_a];
6373     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6374     const int pix_next  = stride;
6375
6376     if( bS[0] < 4 ) {
6377         int8_t tc[4];
6378         for(i=0; i<4; i++)
6379             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6380         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6381     } else {
6382         /* 16px edge length, see filter_mb_edgev */
6383             for( d = 0; d < 16; d++ ) {
6384                 const int p0 = pix[-1*pix_next];
6385                 const int p1 = pix[-2*pix_next];
6386                 const int p2 = pix[-3*pix_next];
6387                 const int q0 = pix[0];
6388                 const int q1 = pix[1*pix_next];
6389                 const int q2 = pix[2*pix_next];
6390
6391                 if( FFABS( p0 - q0 ) < alpha &&
6392                     FFABS( p1 - p0 ) < beta &&
6393                     FFABS( q1 - q0 ) < beta ) {
6394
6395                     const int p3 = pix[-4*pix_next];
6396                     const int q3 = pix[ 3*pix_next];
6397
6398                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6399                         if( FFABS( p2 - p0 ) < beta) {
6400                             /* p0', p1', p2' */
6401                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6402                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6403                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6404                         } else {
6405                             /* p0' */
6406                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6407                         }
6408                         if( FFABS( q2 - q0 ) < beta) {
6409                             /* q0', q1', q2' */
6410                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6411                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6412                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6413                         } else {
6414                             /* q0' */
6415                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6416                         }
6417                     }else{
6418                         /* p0', q0' */
6419                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6420                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6421                     }
6422                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6423                 }
6424                 pix++;
6425             }
6426     }
6427 }
6428
6429 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6430     int i;
6431     const int index_a = qp + h->slice_alpha_c0_offset;
6432     const int alpha = (alpha_table+52)[index_a];
6433     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6434
6435     if( bS[0] < 4 ) {
6436         int8_t tc[4];
6437         for(i=0; i<4; i++)
6438             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6439         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6440     } else {
6441         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6442     }
6443 }
6444
/**
 * Fast-path deblocking of one macroblock.
 *
 * Hands the macroblock to filter_mb() when it is in column 0, in the row
 * given by mb_y_firstrow, when no h264_loop_filter_strength DSP routine is
 * set, when pps.chroma_qp_diff is non-zero, or when deblocking_filter == 2
 * and the top or left neighbour belongs to another slice.
 * NOTE: the condition also contains a literal "1 ||", so as written the
 * fallback is taken for every macroblock and the code after the early
 * return is not executed.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock (presumably its
 *                   top-left sample; verify against caller)
 * @param img_cb     Cb samples of this macroblock (same assumption)
 * @param img_cr     Cr samples of this macroblock (same assumption)
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6445 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6446     MpegEncContext * const s = &h->s;
         /* 1 for a bottom-field picture, 0 otherwise */
6447     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6448     int mb_xy, mb_type;
6449     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6450
6451     mb_xy = h->mb_xy;
6452
6453     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
     /* constant true: forces the filter_mb() fallback unconditionally */
6454 1 ||
6455        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6456                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6457         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6458         return;
6459     }
6460     assert(!FRAME_MBAFF);
6461
         /* qp/qpc: this MB; *0: left neighbour; *1: top neighbour */
6462     mb_type = s->current_picture.mb_type[mb_xy];
6463     qp = s->current_picture.qscale_table[mb_xy];
6464     qp0 = s->current_picture.qscale_table[mb_xy-1];
6465     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6466     qpc = get_chroma_qp( h, 0, qp );
6467     qpc0 = get_chroma_qp( h, 0, qp0 );
6468     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* the MB-boundary edges use the rounded average of both MBs' QP */
6469     qp0 = (qp + qp0 + 1) >> 1;
6470     qp1 = (qp + qp1 + 1) >> 1;
6471     qpc0 = (qpc + qpc0 + 1) >> 1;
6472     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* nothing is filtered when every involved QP is at or below the threshold */
6473     qp_thresh = 15 - h->slice_alpha_c0_offset;
6474     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6475        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6476         return;
6477
6478     if( IS_INTRA(mb_type) ) {
             /* intra MB: fixed strengths, 4 on the left MB edge and 3 on inner
              * edges; the top MB edge gets 3 in field pictures and 4 otherwise */
6479         int16_t bS4[4] = {4,4,4,4};
6480         int16_t bS3[4] = {3,3,3,3};
6481         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6482         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only edges 0 and 2 are filtered */
6483             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6484             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6485             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6486             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6487         } else {
6488             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6489             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6490             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6491             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6492             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6493             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6494             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6495             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6496         }
             /* chroma: two edges per direction and per plane */
6497         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6498         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6499         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6500         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6501         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6502         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6503         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6504         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6505         return;
6506     } else {
             /* bS[dir][edge][segment]; bSv aliases each 4 x int16_t edge as one
              * uint64_t so a whole edge can be set or tested in one access */
6507         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6508         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6509         int edges;
6510         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* edges 0 and 2 in both directions get strength 2 in every segment */
6511             edges = 4;
6512             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6513         } else {
6514             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6515                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6516             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6517                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6518                              ? 3 : 0;
6519             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6520             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
                 /* the DSP routine fills bS from the nnz, reference and mv caches */
6521             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6522                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6523         }
             /* an intra neighbour overrides the strength of the shared MB edge */
6524         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6525             bSv[0][0] = 0x0004000400040004ULL;
6526         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6527             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6528
     /* FILTER(hv,dir,edge): if any segment of the edge has a non-zero strength,
      * filter luma, and for even edges also both chroma planes. Edge 0 uses the
      * averaged neighbour QP (qp0/qp1, qpc0/qpc1), inner edges the MB's own QP. */
6529 #define FILTER(hv,dir,edge)\
6530         if(bSv[dir][edge]) {\
6531             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6532             if(!(edge&1)) {\
6533                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6534                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6535             }\
6536         }
6537         if( edges == 1 ) {
6538             FILTER(v,0,0);
6539             FILTER(h,1,0);
6540         } else if( IS_8x8DCT(mb_type) ) {
6541             FILTER(v,0,0);
6542             FILTER(v,0,2);
6543             FILTER(h,1,0);
6544             FILTER(h,1,2);
6545         } else {
6546             FILTER(v,0,0);
6547             FILTER(v,0,1);
6548             FILTER(v,0,2);
6549             FILTER(v,0,3);
6550             FILTER(h,1,0);
6551             FILTER(h,1,1);
6552             FILTER(h,1,2);
6553             FILTER(h,1,3);
6554         }
6555 #undef FILTER
6556     }
6557 }
6558
/**
 * Deblock all edges of one macroblock (general path).
 *
 * For each direction (0: vertical edges, 1: horizontal edges) and each edge
 * a boundary strength (bS) is derived per 4-sample segment from intra
 * status, non-zero coefficient counts, reference pictures and motion
 * vectors. Edges with a non-zero bS are then passed to the luma and chroma
 * edge filters together with the rounded average QP of the two blocks.
 * Two FRAME_MBAFF cases are handled separately: the first vertical edge
 * when the left neighbour has a different interlaced type, and the top edge
 * of a non-interlaced macroblock whose top neighbour is interlaced.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock (presumably its
 *                   top-left sample; verify against caller)
 * @param img_cb     Cb samples of this macroblock (same assumption)
 * @param img_cr     Cr samples of this macroblock (same assumption)
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6559 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6560     MpegEncContext * const s = &h->s;
6561     const int mb_xy= mb_x + mb_y*s->mb_stride;
6562     const int mb_type = s->current_picture.mb_type[mb_xy];
         /* vertical mv difference threshold used in the bS tests below */
6563     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6564     int first_vertical_edge_done = 0;
6565     int dir;
6566
6567     //for sufficiently low qp, filtering wouldn't do anything
6568     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6569     if(!FRAME_MBAFF){
6570         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6571         int qp = s->current_picture.qscale_table[mb_xy];
6572         if(qp <= qp_thresh
6573            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6574            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6575             return;
6576         }
6577     }
6578
6579     if (FRAME_MBAFF
6580             // left mb is in picture
6581             && h->slice_table[mb_xy-1] != 255
6582             // and current and left pair do not have the same interlaced type
6583             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6584             // and left mb is in the same slice if deblocking_filter == 2
6585             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6586         /* First vertical edge is different in MBAFF frames
6587          * There are 8 different bS to compute and 2 different Qp
6588          */
             /* pair_xy: index of the even-row macroblock of the pair containing
              * this one; left_mb_xy: both macroblocks of the pair to its left */
6589         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6590         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6591         int16_t bS[8];
             /* luma (qp), Cb (bqp) and Cr (rqp) edge QP, one per left macroblock */
6592         int qp[2];
6593         int bqp[2];
6594         int rqp[2];
6595         int mb_qp, mbn0_qp, mbn1_qp;
6596         int i;
             /* makes the generic loop below skip edge 0 for dir 0 */
6597         first_vertical_edge_done = 1;
6598
6599         if( IS_INTRA(mb_type) )
6600             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6601         else {
6602             for( i = 0; i < 8; i++ ) {
                     /* which left macroblock segment i is compared against */
6603                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6604
6605                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6606                     bS[i] = 4;
6607                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6608                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6609                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6610                     bS[i] = 2;
6611                 else
6612                     bS[i] = 1;
6613             }
6614         }
6615
6616         mb_qp = s->current_picture.qscale_table[mb_xy];
6617         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6618         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6619         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6620         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6621                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6622         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6623                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6624         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6625         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6626                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6627         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6628                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6629
6630         /* Filter edge */
6631         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6632         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6633         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6634         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6635         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6636     }
6637     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6638     for( dir = 0; dir < 2; dir++ )
6639     {
6640         int edge;
             /* mbm: the neighbouring macroblock across edge 0 (left or top) */
6641         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6642         const int mbm_type = s->current_picture.mb_type[mbm_xy];
             /* ref2frm tables of the current slice and of the neighbour's slice,
              * selected by the low 4 bits of the slice number */
6643         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6644         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
             /* edge 0 is skipped when the neighbour's slice_table entry is 255 */
6645         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6646
             /* a 16x16 skipped macroblock only has its edge 0 examined */
6647         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6648                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6649         // how often to recheck mv-based bS when iterating between edges
6650         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6651                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6652         // how often to recheck mv-based bS when iterating along each edge
6653         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6654
             /* the MBAFF block above already filtered the first vertical edge */
6655         if (first_vertical_edge_done) {
6656             start = 1;
6657             first_vertical_edge_done = 0;
6658         }
6659
             /* deblocking_filter == 2: edge 0 is not filtered across slices */
6660         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6661             start = 1;
6662
6663         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6664             && !IS_INTERLACED(mb_type)
6665             && IS_INTERLACED(mbm_type)
6666             ) {
6667             // This is a special case in the norm where the filtering must
6668             // be done twice (one each of the field) even if we are in a
6669             // frame macroblock.
6670             //
6671             static const int nnz_idx[4] = {4,5,6,3};
                 /* doubled strides: each pass touches every other line */
6672             unsigned int tmp_linesize   = 2 *   linesize;
6673             unsigned int tmp_uvlinesize = 2 * uvlinesize;
                 /* starts two macroblock rows up; advanced by one row per pass */
6674             int mbn_xy = mb_xy - 2 * s->mb_stride;
6675             int qp;
6676             int i, j;
6677             int16_t bS[4];
6678
6679             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6680                 if( IS_INTRA(mb_type) ||
6681                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6682                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6683                 } else {
6684                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6685                     for( i = 0; i < 4; i++ ) {
6686                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6687                             mbn_nnz[nnz_idx[i]] != 0 )
6688                             bS[i] = 2;
6689                         else
6690                             bS[i] = 1;
6691                     }
6692                 }
6693                 // Do not use s->qscale as luma quantizer because it has not the same
6694                 // value in IPCM macroblocks.
6695                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
                     /* NOTE(review): 'edge' has not been assigned yet at this point,
                      * so this trace call reads an uninitialized variable whenever
                      * tprintf evaluates its arguments */
6696                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6697                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6698                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6699                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6700                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6701                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6702                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6703             }
6704
                 /* edge 0 is done; the generic loop continues with edge 1 */
6705             start = 1;
6706         }
6707
6708         /* Calculate bS */
6709         for( edge = start; edge < edges; edge++ ) {
6710             /* mbn_xy: neighbor macroblock */
6711             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6712             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6713             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6714             int16_t bS[4];
6715             int qp;
6716
                 /* with the 8x8 transform odd edges are not filtered */
6717             if( (edge&1) && IS_8x8DCT(mb_type) )
6718                 continue;
6719
6720             if( IS_INTRA(mb_type) ||
6721                 IS_INTRA(mbn_type) ) {
                     /* intra on either side: 4 on the macroblock edge when neither
                      * side is interlaced, or for vertical edges in MBAFF/field
                      * pictures; 3 in all other cases including inner edges */
6722                 int value;
6723                 if (edge == 0) {
6724                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6725                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6726                     ) {
6727                         value = 4;
6728                     } else {
6729                         value = 3;
6730                     }
6731                 } else {
6732                     value = 3;
6733                 }
6734                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6735             } else {
6736                 int i, l;
                     /* mv_done: the mv/reference based strength was already decided
                      * for the whole edge, only the nnz test remains per segment */
6737                 int mv_done;
6738
6739                 if( edge & mask_edge ) {
6740                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6741                     mv_done = 1;
6742                 }
6743                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6744                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6745                     mv_done = 1;
6746                 }
6747                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
                         /* one comparison at the first segment decides the whole edge */
6748                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6749                     int bn_idx= b_idx - (dir ? 8:1);
6750                     int v = 0;
6751
6752                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6753                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6754                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6755                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6756                     }
6757
                         /* B slices: on a mismatch, repeat the comparison with the
                          * neighbour's two lists swapped */
6758                     if(h->slice_type_nos == FF_B_TYPE && v){
6759                         v=0;
6760                         for( l = 0; !v && l < 2; l++ ) {
6761                             int ln= 1-l;
6762                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6763                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6764                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6765                         }
6766                     }
6767
6768                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6769                     mv_done = 1;
6770                 }
6771                 else
6772                     mv_done = 0;
6773
6774                 for( i = 0; i < 4; i++ ) {
                         /* cache indices of the block on this side of the edge (b_idx)
                          * and of the block across it (bn_idx) */
6775                     int x = dir == 0 ? edge : i;
6776                     int y = dir == 0 ? i    : edge;
6777                     int b_idx= 8 + 4 + x + 8*y;
6778                     int bn_idx= b_idx - (dir ? 8:1);
6779
                         /* non-zero coefficients on either side give strength 2 */
6780                     if( h->non_zero_count_cache[b_idx] != 0 ||
6781                         h->non_zero_count_cache[bn_idx] != 0 ) {
6782                         bS[i] = 2;
6783                     }
6784                     else if(!mv_done)
6785                     {
6786                         bS[i] = 0;
6787                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6788                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6789                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6790                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6791                                 bS[i] = 1;
6792                                 break;
6793                             }
6794                         }
6795
                             /* B slices: same swapped-list retry as for the whole-edge case */
6796                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6797                             bS[i] = 0;
6798                             for( l = 0; l < 2; l++ ) {
6799                                 int ln= 1-l;
6800                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6801                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6802                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6803                                     bS[i] = 1;
6804                                     break;
6805                                 }
6806                             }
6807                         }
6808                     }
6809                 }
6810
                     /* all four strengths zero: this edge is left untouched */
6811                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6812                     continue;
6813             }
6814
6815             /* Filter edge */
6816             // Do not use s->qscale as luma quantizer because it has not the same
6817             // value in IPCM macroblocks.
6818             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6819             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6820             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6821             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                 /* chroma is filtered on even edges only, at half the luma offset */
6822             if( dir == 0 ) {
6823                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6824                 if( (edge&1) == 0 ) {
6825                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6826                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6827                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6828                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6829                 }
6830             } else {
6831                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6832                 if( (edge&1) == 0 ) {
6833                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6834                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6835                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6836                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6837                 }
6838             }
6839         }
6840     }
6841 }
6842
6843 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6844     MpegEncContext * const s = &h->s;
6845     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6846
6847     s->mb_skip_run= -1;
6848
6849     if( h->pps.cabac ) {
6850         int i;
6851
6852         /* realign */
6853         align_get_bits( &s->gb );
6854
6855         /* init cabac */
6856         ff_init_cabac_states( &h->cabac);
6857         ff_init_cabac_decoder( &h->cabac,
6858                                s->gb.buffer + get_bits_count(&s->gb)/8,
6859                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6860         /* calculate pre-state */
6861         for( i= 0; i < 460; i++ ) {
6862             int pre;
6863             if( h->slice_type_nos == FF_I_TYPE )
6864                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6865             else
6866                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6867
6868             if( pre <= 63 )
6869                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6870             else
6871                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6872         }
6873
6874         for(;;){
6875 //START_TIMER
6876             int ret = decode_mb_cabac(h);
6877             int eos;
6878 //STOP_TIMER("decode_mb_cabac")
6879
6880             if(ret>=0) hl_decode_mb(h);
6881
6882             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6883                 s->mb_y++;
6884
6885                 if(ret>=0) ret = decode_mb_cabac(h);
6886
6887                 if(ret>=0) hl_decode_mb(h);
6888                 s->mb_y--;
6889             }
6890             eos = get_cabac_terminate( &h->cabac );
6891
6892             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6893                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6894                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6895                 return -1;
6896             }
6897
6898             if( ++s->mb_x >= s->mb_width ) {
6899                 s->mb_x = 0;
6900                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6901                 ++s->mb_y;
6902                 if(FIELD_OR_MBAFF_PICTURE) {
6903                     ++s->mb_y;
6904                 }
6905             }
6906
6907             if( eos || s->mb_y >= s->mb_height ) {
6908                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6909                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6910                 return 0;
6911             }
6912         }
6913
6914     } else {
6915         for(;;){
6916             int ret = decode_mb_cavlc(h);
6917
6918             if(ret>=0) hl_decode_mb(h);
6919
6920             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6921                 s->mb_y++;
6922                 ret = decode_mb_cavlc(h);
6923
6924                 if(ret>=0) hl_decode_mb(h);
6925                 s->mb_y--;
6926             }
6927
6928             if(ret<0){
6929                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6930                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6931
6932                 return -1;
6933             }
6934
6935             if(++s->mb_x >= s->mb_width){
6936                 s->mb_x=0;
6937                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6938                 ++s->mb_y;
6939                 if(FIELD_OR_MBAFF_PICTURE) {
6940                     ++s->mb_y;
6941                 }
6942                 if(s->mb_y >= s->mb_height){
6943                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6944
6945                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6946                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6947
6948                         return 0;
6949                     }else{
6950                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6951
6952                         return -1;
6953                     }
6954                 }
6955             }
6956
6957             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6958                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6959                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6960                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6961
6962                     return 0;
6963                 }else{
6964                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6965
6966                     return -1;
6967                 }
6968             }
6969         }
6970     }
6971
6972 #if 0
6973     for(;s->mb_y < s->mb_height; s->mb_y++){
6974         for(;s->mb_x < s->mb_width; s->mb_x++){
6975             int ret= decode_mb(h);
6976
6977             hl_decode_mb(h);
6978
6979             if(ret<0){
6980                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6981                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6982
6983                 return -1;
6984             }
6985
6986             if(++s->mb_x >= s->mb_width){
6987                 s->mb_x=0;
6988                 if(++s->mb_y >= s->mb_height){
6989                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6990                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6991
6992                         return 0;
6993                     }else{
6994                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6995
6996                         return -1;
6997                     }
6998                 }
6999             }
7000
7001             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7002                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7003                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7004
7005                     return 0;
7006                 }else{
7007                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7008
7009                     return -1;
7010                 }
7011             }
7012         }
7013         s->mb_x=0;
7014         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7015     }
7016 #endif
7017     return -1; //not reached
7018 }
7019
7020 static int decode_unregistered_user_data(H264Context *h, int size){
7021     MpegEncContext * const s = &h->s;
7022     uint8_t user_data[16+256];
7023     int e, build, i;
7024
7025     if(size<16)
7026         return -1;
7027
7028     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7029         user_data[i]= get_bits(&s->gb, 8);
7030     }
7031
7032     user_data[i]= 0;
7033     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7034     if(e==1 && build>=0)
7035         h->x264_build= build;
7036
7037     if(s->avctx->debug & FF_DEBUG_BUGS)
7038         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7039
7040     for(; i<size; i++)
7041         skip_bits(&s->gb, 8);
7042
7043     return 0;
7044 }
7045
7046 static int decode_sei(H264Context *h){
7047     MpegEncContext * const s = &h->s;
7048
7049     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7050         int size, type;
7051
7052         type=0;
7053         do{
7054             type+= show_bits(&s->gb, 8);
7055         }while(get_bits(&s->gb, 8) == 255);
7056
7057         size=0;
7058         do{
7059             size+= show_bits(&s->gb, 8);
7060         }while(get_bits(&s->gb, 8) == 255);
7061
7062         switch(type){
7063         case 5:
7064             if(decode_unregistered_user_data(h, size) < 0)
7065                 return -1;
7066             break;
7067         default:
7068             skip_bits(&s->gb, 8*size);
7069         }
7070
7071         //FIXME check bits here
7072         align_get_bits(&s->gb);
7073     }
7074
7075     return 0;
7076 }
7077
7078 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7079     MpegEncContext * const s = &h->s;
7080     int cpb_count, i;
7081     cpb_count = get_ue_golomb(&s->gb) + 1;
7082     get_bits(&s->gb, 4); /* bit_rate_scale */
7083     get_bits(&s->gb, 4); /* cpb_size_scale */
7084     for(i=0; i<cpb_count; i++){
7085         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7086         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7087         get_bits1(&s->gb);     /* cbr_flag */
7088     }
7089     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7090     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7091     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7092     get_bits(&s->gb, 5); /* time_offset_length */
7093 }
7094
7095 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7096     MpegEncContext * const s = &h->s;
7097     int aspect_ratio_info_present_flag;
7098     unsigned int aspect_ratio_idc;
7099     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7100
7101     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7102
7103     if( aspect_ratio_info_present_flag ) {
7104         aspect_ratio_idc= get_bits(&s->gb, 8);
7105         if( aspect_ratio_idc == EXTENDED_SAR ) {
7106             sps->sar.num= get_bits(&s->gb, 16);
7107             sps->sar.den= get_bits(&s->gb, 16);
7108         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7109             sps->sar=  pixel_aspect[aspect_ratio_idc];
7110         }else{
7111             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7112             return -1;
7113         }
7114     }else{
7115         sps->sar.num=
7116         sps->sar.den= 0;
7117     }
7118 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7119
7120     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7121         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7122     }
7123
7124     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7125         get_bits(&s->gb, 3);    /* video_format */
7126         get_bits1(&s->gb);      /* video_full_range_flag */
7127         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7128             get_bits(&s->gb, 8); /* colour_primaries */
7129             get_bits(&s->gb, 8); /* transfer_characteristics */
7130             get_bits(&s->gb, 8); /* matrix_coefficients */
7131         }
7132     }
7133
7134     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7135         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7136         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7137     }
7138
7139     sps->timing_info_present_flag = get_bits1(&s->gb);
7140     if(sps->timing_info_present_flag){
7141         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7142         sps->time_scale = get_bits_long(&s->gb, 32);
7143         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7144     }
7145
7146     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7147     if(nal_hrd_parameters_present_flag)
7148         decode_hrd_parameters(h, sps);
7149     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7150     if(vcl_hrd_parameters_present_flag)
7151         decode_hrd_parameters(h, sps);
7152     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7153         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7154     get_bits1(&s->gb);         /* pic_struct_present_flag */
7155
7156     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7157     if(sps->bitstream_restriction_flag){
7158         unsigned int num_reorder_frames;
7159         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7160         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7161         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7162         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7163         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7164         num_reorder_frames= get_ue_golomb(&s->gb);
7165         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7166
7167         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7168             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7169             return -1;
7170         }
7171
7172         sps->num_reorder_frames= num_reorder_frames;
7173     }
7174
7175     return 0;
7176 }
7177
7178 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7179                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7180     MpegEncContext * const s = &h->s;
7181     int i, last = 8, next = 8;
7182     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7183     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7184         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7185     else
7186     for(i=0;i<size;i++){
7187         if(next)
7188             next = (last + get_se_golomb(&s->gb)) & 0xff;
7189         if(!i && !next){ /* matrix not written, we use the preset one */
7190             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7191             break;
7192         }
7193         last = factors[scan[i]] = next ? next : last;
7194     }
7195 }
7196
7197 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7198                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7199     MpegEncContext * const s = &h->s;
7200     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7201     const uint8_t *fallback[4] = {
7202         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7203         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7204         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7205         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7206     };
7207     if(get_bits1(&s->gb)){
7208         sps->scaling_matrix_present |= is_sps;
7209         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7210         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7211         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7212         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7213         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7214         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7215         if(is_sps || pps->transform_8x8_mode){
7216             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7217             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7218         }
7219     } else if(fallback_sps) {
7220         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7221         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7222     }
7223 }
7224
7225 /**
7226  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7227  */
7228 static void *
7229 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7230                     const size_t size, const char *name)
7231 {
7232     if(id>=max) {
7233         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7234         return NULL;
7235     }
7236
7237     if(!vec[id]) {
7238         vec[id] = av_mallocz(size);
7239         if(vec[id] == NULL)
7240             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7241     }
7242     return vec[id];
7243 }
7244
7245 static inline int decode_seq_parameter_set(H264Context *h){
7246     MpegEncContext * const s = &h->s;
7247     int profile_idc, level_idc;
7248     unsigned int sps_id, tmp, mb_width, mb_height;
7249     int i;
7250     SPS *sps;
7251
7252     profile_idc= get_bits(&s->gb, 8);
7253     get_bits1(&s->gb);   //constraint_set0_flag
7254     get_bits1(&s->gb);   //constraint_set1_flag
7255     get_bits1(&s->gb);   //constraint_set2_flag
7256     get_bits1(&s->gb);   //constraint_set3_flag
7257     get_bits(&s->gb, 4); // reserved
7258     level_idc= get_bits(&s->gb, 8);
7259     sps_id= get_ue_golomb(&s->gb);
7260
7261     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7262     if(sps == NULL)
7263         return -1;
7264
7265     sps->profile_idc= profile_idc;
7266     sps->level_idc= level_idc;
7267
7268     if(sps->profile_idc >= 100){ //high profile
7269         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7270             get_bits1(&s->gb);  //residual_color_transform_flag
7271         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7272         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7273         sps->transform_bypass = get_bits1(&s->gb);
7274         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7275     }else
7276         sps->scaling_matrix_present = 0;
7277
7278     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7279     sps->poc_type= get_ue_golomb(&s->gb);
7280
7281     if(sps->poc_type == 0){ //FIXME #define
7282         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7283     } else if(sps->poc_type == 1){//FIXME #define
7284         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7285         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7286         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7287         tmp= get_ue_golomb(&s->gb);
7288
7289         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7290             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7291             return -1;
7292         }
7293         sps->poc_cycle_length= tmp;
7294
7295         for(i=0; i<sps->poc_cycle_length; i++)
7296             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7297     }else if(sps->poc_type != 2){
7298         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7299         return -1;
7300     }
7301
7302     tmp= get_ue_golomb(&s->gb);
7303     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7304         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7305         return -1;
7306     }
7307     sps->ref_frame_count= tmp;
7308     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7309     mb_width= get_ue_golomb(&s->gb) + 1;
7310     mb_height= get_ue_golomb(&s->gb) + 1;
7311     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7312        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7313         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7314         return -1;
7315     }
7316     sps->mb_width = mb_width;
7317     sps->mb_height= mb_height;
7318
7319     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7320     if(!sps->frame_mbs_only_flag)
7321         sps->mb_aff= get_bits1(&s->gb);
7322     else
7323         sps->mb_aff= 0;
7324
7325     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7326
7327 #ifndef ALLOW_INTERLACE
7328     if(sps->mb_aff)
7329         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7330 #endif
7331     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7332         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7333
7334     sps->crop= get_bits1(&s->gb);
7335     if(sps->crop){
7336         sps->crop_left  = get_ue_golomb(&s->gb);
7337         sps->crop_right = get_ue_golomb(&s->gb);
7338         sps->crop_top   = get_ue_golomb(&s->gb);
7339         sps->crop_bottom= get_ue_golomb(&s->gb);
7340         if(sps->crop_left || sps->crop_top){
7341             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7342         }
7343         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7344             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7345         }
7346     }else{
7347         sps->crop_left  =
7348         sps->crop_right =
7349         sps->crop_top   =
7350         sps->crop_bottom= 0;
7351     }
7352
7353     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7354     if( sps->vui_parameters_present_flag )
7355         decode_vui_parameters(h, sps);
7356
7357     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7358         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7359                sps_id, sps->profile_idc, sps->level_idc,
7360                sps->poc_type,
7361                sps->ref_frame_count,
7362                sps->mb_width, sps->mb_height,
7363                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7364                sps->direct_8x8_inference_flag ? "8B8" : "",
7365                sps->crop_left, sps->crop_right,
7366                sps->crop_top, sps->crop_bottom,
7367                sps->vui_parameters_present_flag ? "VUI" : ""
7368                );
7369     }
7370     return 0;
7371 }
7372
7373 static void
7374 build_qp_table(PPS *pps, int t, int index)
7375 {
7376     int i;
7377     for(i = 0; i < 52; i++)
7378         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7379 }
7380
7381 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7382     MpegEncContext * const s = &h->s;
7383     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7384     PPS *pps;
7385
7386     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7387     if(pps == NULL)
7388         return -1;
7389
7390     tmp= get_ue_golomb(&s->gb);
7391     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7392         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7393         return -1;
7394     }
7395     pps->sps_id= tmp;
7396
7397     pps->cabac= get_bits1(&s->gb);
7398     pps->pic_order_present= get_bits1(&s->gb);
7399     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7400     if(pps->slice_group_count > 1 ){
7401         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7402         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7403         switch(pps->mb_slice_group_map_type){
7404         case 0:
7405 #if 0
7406 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7407 |    run_length[ i ]                                |1  |ue(v)   |
7408 #endif
7409             break;
7410         case 2:
7411 #if 0
7412 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7413 |{                                                  |   |        |
7414 |    top_left_mb[ i ]                               |1  |ue(v)   |
7415 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7416 |   }                                               |   |        |
7417 #endif
7418             break;
7419         case 3:
7420         case 4:
7421         case 5:
7422 #if 0
7423 |   slice_group_change_direction_flag               |1  |u(1)    |
7424 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7425 #endif
7426             break;
7427         case 6:
7428 #if 0
7429 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7430 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7431 |)                                                  |   |        |
7432 |    slice_group_id[ i ]                            |1  |u(v)    |
7433 #endif
7434             break;
7435         }
7436     }
7437     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7438     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7439     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7440         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7441         pps->ref_count[0]= pps->ref_count[1]= 1;
7442         return -1;
7443     }
7444
7445     pps->weighted_pred= get_bits1(&s->gb);
7446     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7447     pps->init_qp= get_se_golomb(&s->gb) + 26;
7448     pps->init_qs= get_se_golomb(&s->gb) + 26;
7449     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7450     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7451     pps->constrained_intra_pred= get_bits1(&s->gb);
7452     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7453
7454     pps->transform_8x8_mode= 0;
7455     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7456     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7457     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7458
7459     if(get_bits_count(&s->gb) < bit_length){
7460         pps->transform_8x8_mode= get_bits1(&s->gb);
7461         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7462         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7463     } else {
7464         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7465     }
7466
7467     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7468     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7469     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7470         h->pps.chroma_qp_diff= 1;
7471
7472     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7473         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7474                pps_id, pps->sps_id,
7475                pps->cabac ? "CABAC" : "CAVLC",
7476                pps->slice_group_count,
7477                pps->ref_count[0], pps->ref_count[1],
7478                pps->weighted_pred ? "weighted" : "",
7479                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7480                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7481                pps->constrained_intra_pred ? "CONSTR" : "",
7482                pps->redundant_pic_cnt_present ? "REDU" : "",
7483                pps->transform_8x8_mode ? "8x8DCT" : ""
7484                );
7485     }
7486
7487     return 0;
7488 }
7489
7490 /**
7491  * Call decode_slice() for each context.
7492  *
7493  * @param h h264 master context
7494  * @param context_count number of contexts to execute
7495  */
7496 static void execute_decode_slices(H264Context *h, int context_count){
7497     MpegEncContext * const s = &h->s;
7498     AVCodecContext * const avctx= s->avctx;
7499     H264Context *hx;
7500     int i;
7501
7502     if(context_count == 1) {
7503         decode_slice(avctx, h);
7504     } else {
7505         for(i = 1; i < context_count; i++) {
7506             hx = h->thread_context[i];
7507             hx->s.error_resilience = avctx->error_resilience;
7508             hx->s.error_count = 0;
7509         }
7510
7511         avctx->execute(avctx, (void *)decode_slice,
7512                        (void **)h->thread_context, NULL, context_count);
7513
7514         /* pull back stuff from slices to master context */
7515         hx = h->thread_context[context_count - 1];
7516         s->mb_x = hx->s.mb_x;
7517         s->mb_y = hx->s.mb_y;
7518         s->dropable = hx->s.dropable;
7519         s->picture_structure = hx->s.picture_structure;
7520         for(i = 1; i < context_count; i++)
7521             h->s.error_count += h->thread_context[i]->s.error_count;
7522     }
7523 }
7524
7525
7526 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7527     MpegEncContext * const s = &h->s;
7528     AVCodecContext * const avctx= s->avctx;
7529     int buf_index=0;
7530     H264Context *hx; ///< thread context
7531     int context_count = 0;
7532
7533     h->max_contexts = avctx->thread_count;
7534 #if 0
7535     int i;
7536     for(i=0; i<50; i++){
7537         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7538     }
7539 #endif
7540     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7541         h->current_slice = 0;
7542         if (!s->first_field)
7543             s->current_picture_ptr= NULL;
7544     }
7545
7546     for(;;){
7547         int consumed;
7548         int dst_length;
7549         int bit_length;
7550         const uint8_t *ptr;
7551         int i, nalsize = 0;
7552         int err;
7553
7554         if(h->is_avc) {
7555             if(buf_index >= buf_size) break;
7556             nalsize = 0;
7557             for(i = 0; i < h->nal_length_size; i++)
7558                 nalsize = (nalsize << 8) | buf[buf_index++];
7559             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7560                 if(nalsize == 1){
7561                     buf_index++;
7562                     continue;
7563                 }else{
7564                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7565                     break;
7566                 }
7567             }
7568         } else {
7569             // start code prefix search
7570             for(; buf_index + 3 < buf_size; buf_index++){
7571                 // This should always succeed in the first iteration.
7572                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7573                     break;
7574             }
7575
7576             if(buf_index+3 >= buf_size) break;
7577
7578             buf_index+=3;
7579         }
7580
7581         hx = h->thread_context[context_count];
7582
7583         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7584         if (ptr==NULL || dst_length < 0){
7585             return -1;
7586         }
7587         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7588             dst_length--;
7589         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7590
7591         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7592             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7593         }
7594
7595         if (h->is_avc && (nalsize != consumed)){
7596             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7597             consumed= nalsize;
7598         }
7599
7600         buf_index += consumed;
7601
7602         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7603            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7604             continue;
7605
7606       again:
7607         err = 0;
7608         switch(hx->nal_unit_type){
7609         case NAL_IDR_SLICE:
7610             if (h->nal_unit_type != NAL_IDR_SLICE) {
7611                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7612                 return -1;
7613             }
7614             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7615         case NAL_SLICE:
7616             init_get_bits(&hx->s.gb, ptr, bit_length);
7617             hx->intra_gb_ptr=
7618             hx->inter_gb_ptr= &hx->s.gb;
7619             hx->s.data_partitioning = 0;
7620
7621             if((err = decode_slice_header(hx, h)))
7622                break;
7623
7624             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7625             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7626                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7627                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7628                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7629                && avctx->skip_frame < AVDISCARD_ALL)
7630                 context_count++;
7631             break;
7632         case NAL_DPA:
7633             init_get_bits(&hx->s.gb, ptr, bit_length);
7634             hx->intra_gb_ptr=
7635             hx->inter_gb_ptr= NULL;
7636             hx->s.data_partitioning = 1;
7637
7638             err = decode_slice_header(hx, h);
7639             break;
7640         case NAL_DPB:
7641             init_get_bits(&hx->intra_gb, ptr, bit_length);
7642             hx->intra_gb_ptr= &hx->intra_gb;
7643             break;
7644         case NAL_DPC:
7645             init_get_bits(&hx->inter_gb, ptr, bit_length);
7646             hx->inter_gb_ptr= &hx->inter_gb;
7647
7648             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7649                && s->context_initialized
7650                && s->hurry_up < 5
7651                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7652                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7653                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7654                && avctx->skip_frame < AVDISCARD_ALL)
7655                 context_count++;
7656             break;
7657         case NAL_SEI:
7658             init_get_bits(&s->gb, ptr, bit_length);
7659             decode_sei(h);
7660             break;
7661         case NAL_SPS:
7662             init_get_bits(&s->gb, ptr, bit_length);
7663             decode_seq_parameter_set(h);
7664
7665             if(s->flags& CODEC_FLAG_LOW_DELAY)
7666                 s->low_delay=1;
7667
7668             if(avctx->has_b_frames < 2)
7669                 avctx->has_b_frames= !s->low_delay;
7670             break;
7671         case NAL_PPS:
7672             init_get_bits(&s->gb, ptr, bit_length);
7673
7674             decode_picture_parameter_set(h, bit_length);
7675
7676             break;
7677         case NAL_AUD:
7678         case NAL_END_SEQUENCE:
7679         case NAL_END_STREAM:
7680         case NAL_FILLER_DATA:
7681         case NAL_SPS_EXT:
7682         case NAL_AUXILIARY_SLICE:
7683             break;
7684         default:
7685             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7686         }
7687
7688         if(context_count == h->max_contexts) {
7689             execute_decode_slices(h, context_count);
7690             context_count = 0;
7691         }
7692
7693         if (err < 0)
7694             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7695         else if(err == 1) {
7696             /* Slice could not be decoded in parallel mode, copy down
7697              * NAL unit stuff to context 0 and restart. Note that
7698              * rbsp_buffer is not transfered, but since we no longer
7699              * run in parallel mode this should not be an issue. */
7700             h->nal_unit_type = hx->nal_unit_type;
7701             h->nal_ref_idc   = hx->nal_ref_idc;
7702             hx = h;
7703             goto again;
7704         }
7705     }
7706     if(context_count)
7707         execute_decode_slices(h, context_count);
7708     return buf_index;
7709 }
7710
7711 /**
7712  * returns the number of bytes consumed for building the current frame
7713  */
7714 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7715     if(s->flags&CODEC_FLAG_TRUNCATED){
7716         pos -= s->parse_context.last_index;
7717         if(pos<0) pos=0; // FIXME remove (unneeded?)
7718
7719         return pos;
7720     }else{
7721         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7722         if(pos+10>buf_size) pos=buf_size; // oops ;)
7723
7724         return pos;
7725     }
7726 }
7727
7728 static int decode_frame(AVCodecContext *avctx,
7729                              void *data, int *data_size,
7730                              const uint8_t *buf, int buf_size)
7731 {
7732     H264Context *h = avctx->priv_data;
7733     MpegEncContext *s = &h->s;
7734     AVFrame *pict = data;
7735     int buf_index;
7736
7737     s->flags= avctx->flags;
7738     s->flags2= avctx->flags2;
7739
7740     if(s->flags&CODEC_FLAG_TRUNCATED){
7741         const int next= ff_h264_find_frame_end(h, buf, buf_size);
7742         assert((buf_size > 0) || (next == END_NOT_FOUND));
7743
7744         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7745           return buf_size;
7746 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7747     }
7748
7749    /* no supplementary picture */
7750     if (buf_size == 0) {
7751         Picture *out;
7752         int i, out_idx;
7753
7754 //FIXME factorize this with the output code below
7755         out = h->delayed_pic[0];
7756         out_idx = 0;
7757         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7758             if(h->delayed_pic[i]->poc < out->poc){
7759                 out = h->delayed_pic[i];
7760                 out_idx = i;
7761             }
7762
7763         for(i=out_idx; h->delayed_pic[i]; i++)
7764             h->delayed_pic[i] = h->delayed_pic[i+1];
7765
7766         if(out){
7767             *data_size = sizeof(AVFrame);
7768             *pict= *(AVFrame*)out;
7769         }
7770
7771         return 0;
7772     }
7773
7774     if(h->is_avc && !h->got_avcC) {
7775         int i, cnt, nalsize;
7776         unsigned char *p = avctx->extradata;
7777         if(avctx->extradata_size < 7) {
7778             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7779             return -1;
7780         }
7781         if(*p != 1) {
7782             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7783             return -1;
7784         }
7785         /* sps and pps in the avcC always have length coded with 2 bytes,
7786            so put a fake nal_length_size = 2 while parsing them */
7787         h->nal_length_size = 2;
7788         // Decode sps from avcC
7789         cnt = *(p+5) & 0x1f; // Number of sps
7790         p += 6;
7791         for (i = 0; i < cnt; i++) {
7792             nalsize = AV_RB16(p) + 2;
7793             if(decode_nal_units(h, p, nalsize) < 0) {
7794                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7795                 return -1;
7796             }
7797             p += nalsize;
7798         }
7799         // Decode pps from avcC
7800         cnt = *(p++); // Number of pps
7801         for (i = 0; i < cnt; i++) {
7802             nalsize = AV_RB16(p) + 2;
7803             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7804                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7805                 return -1;
7806             }
7807             p += nalsize;
7808         }
7809         // Now store right nal length size, that will be use to parse all other nals
7810         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7811         // Do not reparse avcC
7812         h->got_avcC = 1;
7813     }
7814
7815     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7816         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7817             return -1;
7818     }
7819
7820     buf_index=decode_nal_units(h, buf, buf_size);
7821     if(buf_index < 0)
7822         return -1;
7823
7824     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7825         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7826         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7827         return -1;
7828     }
7829
7830     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7831         Picture *out = s->current_picture_ptr;
7832         Picture *cur = s->current_picture_ptr;
7833         int i, pics, cross_idr, out_of_order, out_idx;
7834
7835         s->mb_y= 0;
7836
7837         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7838         s->current_picture_ptr->pict_type= s->pict_type;
7839
7840         if(!s->dropable) {
7841             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7842             h->prev_poc_msb= h->poc_msb;
7843             h->prev_poc_lsb= h->poc_lsb;
7844         }
7845         h->prev_frame_num_offset= h->frame_num_offset;
7846         h->prev_frame_num= h->frame_num;
7847
7848         /*
7849          * FIXME: Error handling code does not seem to support interlaced
7850          * when slices span multiple rows
7851          * The ff_er_add_slice calls don't work right for bottom
7852          * fields; they cause massive erroneous error concealing
7853          * Error marking covers both fields (top and bottom).
7854          * This causes a mismatched s->error_count
7855          * and a bad error table. Further, the error count goes to
7856          * INT_MAX when called for bottom field, because mb_y is
7857          * past end by one (callers fault) and resync_mb_y != 0
7858          * causes problems for the first MB line, too.
7859          */
7860         if (!FIELD_PICTURE)
7861             ff_er_frame_end(s);
7862
7863         MPV_frame_end(s);
7864
7865         if (s->first_field) {
7866             /* Wait for second field. */
7867             *data_size = 0;
7868
7869         } else {
7870             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7871             /* Derive top_field_first from field pocs. */
7872             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7873
7874         //FIXME do something with unavailable reference frames
7875
7876             /* Sort B-frames into display order */
7877
7878             if(h->sps.bitstream_restriction_flag
7879                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7880                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7881                 s->low_delay = 0;
7882             }
7883
7884             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7885                && !h->sps.bitstream_restriction_flag){
7886                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7887                 s->low_delay= 0;
7888             }
7889
7890             pics = 0;
7891             while(h->delayed_pic[pics]) pics++;
7892
7893             assert(pics <= MAX_DELAYED_PIC_COUNT);
7894
7895             h->delayed_pic[pics++] = cur;
7896             if(cur->reference == 0)
7897                 cur->reference = DELAYED_PIC_REF;
7898
7899             cross_idr = 0;
7900             for(i=0; h->delayed_pic[i]; i++)
7901                 if(h->delayed_pic[i]->poc==0)
7902                     cross_idr = 1;
7903
7904             out = h->delayed_pic[0];
7905             out_idx = 0;
7906             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7907                 if(h->delayed_pic[i]->poc < out->poc){
7908                     out = h->delayed_pic[i];
7909                     out_idx = i;
7910                 }
7911
7912             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7913
7914             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7915                 { }
7916             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7917                || (s->low_delay &&
7918                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7919                  || cur->pict_type == FF_B_TYPE)))
7920             {
7921                 s->low_delay = 0;
7922                 s->avctx->has_b_frames++;
7923             }
7924
7925             if(out_of_order || pics > s->avctx->has_b_frames){
7926                 out->reference &= ~DELAYED_PIC_REF;
7927                 for(i=out_idx; h->delayed_pic[i]; i++)
7928                     h->delayed_pic[i] = h->delayed_pic[i+1];
7929             }
7930             if(!out_of_order && pics > s->avctx->has_b_frames){
7931                 *data_size = sizeof(AVFrame);
7932
7933                 h->outputed_poc = out->poc;
7934                 *pict= *(AVFrame*)out;
7935             }else{
7936                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7937             }
7938         }
7939     }
7940
7941     assert(pict->data[0] || !*data_size);
7942     ff_print_debug_info(s, pict);
7943 //printf("out %d\n", (int)pict->data[0]);
7944 #if 0 //?
7945
7946     /* Return the Picture timestamp as the frame number */
7947     /* we subtract 1 because it is added on utils.c     */
7948     avctx->frame_number = s->picture_number - 1;
7949 #endif
7950     return get_consumed_bytes(s, buf_index, buf_size);
7951 }
#if 0
/**
 * Computes h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 *
 * A neighbour is flagged available when it lies inside the picture and its
 * h->slice_table entry equals h->slice_num. Index layout, as implied by the
 * offsets used below:
 *   0 = top-left, 1 = top, 2 = top-right, 3 = left.
 * Entries 4 and 5 are set to the constants 1 and 0.
 *
 * @param h decoder context; reads h->s, h->slice_table and h->slice_num,
 *          writes h->mb_avail[0..5]
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int cur_xy = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = cur_xy - s->mb_stride;
    /* the whole top row stays unavailable for the first macroblock row */
    int top_left = 0, top = 0, top_right = 0;

    if(s->mb_y){
        /* the column check comes first so that slice_table is only read
         * for positions that exist */
        top_left  = s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        top       =                            h->slice_table[top_xy    ] == h->slice_num;
        top_right = s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }

    h->mb_avail[0]= top_left;
    h->mb_avail[1]= top;
    h->mb_avail[2]= top_right;
    h->mb_avail[3]= s->mb_x && h->slice_table[cur_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7971
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Standalone self-test, compiled only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed Exp-Golomb codes into a
 * scratch buffer, reads them back and prints a diagnostic line for every
 * value that does not round-trip. START_TIMER/STOP_TIMER bracket each call
 * that is being measured. The 4x4 (I)DCT, quantizer and NAL layer tests are
 * disabled with #if 0.
 *
 * @return 0; mismatches are only reported on stdout, the early returns in
 *         the enabled tests are commented out.
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    /* Zero-initialized: dsputil_init() is handed a pointer to this object and
     * would otherwise receive indeterminate contents. */
    AVCodecContext avctx = {0};

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so they can be shown on a mismatch */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* shift the range so that negative values are covered as well */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
8146
8147
8148 static av_cold int decode_end(AVCodecContext *avctx)
8149 {
8150     H264Context *h = avctx->priv_data;
8151     MpegEncContext *s = &h->s;
8152
8153     av_freep(&h->rbsp_buffer[0]);
8154     av_freep(&h->rbsp_buffer[1]);
8155     free_tables(h); //FIXME cleanup init stuff perhaps
8156     MPV_common_end(s);
8157
8158 //    memset(h, 0, sizeof(H264Context));
8159
8160     return 0;
8161 }
8162
8163
8164 AVCodec h264_decoder = { /* table entry describing the H.264 decoder; the leading fields are positional, so their order must match the AVCodec declaration */
8165     "h264", /* short codec name (presumably the lookup name -- confirm against AVCodec) */
8166     CODEC_TYPE_VIDEO,
8167     CODEC_ID_H264,
8168     sizeof(H264Context), /* size of the private context; decode_end() reads avctx->priv_data as an H264Context */
8169     decode_init, /* NOTE(review): presumably the init callback -- confirm field order in avcodec.h */
8170     NULL, /* no function supplied for this slot (presumably the encode callback -- confirm) */
8171     decode_end, /* teardown function defined in this file */
8172     decode_frame, /* per-packet decoding function defined in this file */
8173     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY, /* capability flags; DRAW_HORIZ_BAND is deliberately left commented out */
8174     .flush= flush_dpb, /* designated initializer: flush callback */
8175     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"), /* descriptive name, passed through NULL_IF_CONFIG_SMALL */
8176 };
8177
8178 #include "svq3.c"