git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  /* File-scope VLC tables. Their initialisation and use are not part of this
   * section of the file; judging by the names they serve coefficient-token,
   * total-zeros and run decoding -- TODO confirm against the code that fills
   * and reads them. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  /* Forward declarations of static functions; their definitions are not part
   * of this section of the file. */
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  /**
   * Packs the low 16 bits of two values into one 32 bit word.
   *
   * The halves are arranged so that, once the word is stored to memory, a
   * occupies the first two bytes and b the following two; this is why the
   * roles of a and b are swapped when WORDS_BIGENDIAN is defined.
   *
   * The shift is carried out on an unsigned 32 bit value: callers pass values
   * that may be negative (e.g. reference indices), and left-shifting a
   * negative int, or shifting into the sign bit, is undefined behaviour in C.
   * The resulting bit pattern is the same as the two's complement result of
   * the signed expression.
   *
   * @param a value for the first 16 bit half (only the low 16 bits are used)
   * @param b value for the second 16 bit half (only the low 16 bits are used)
   * @return the packed 32 bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  72
  /** Lookup table: ff_rem6[i] == i % 6 for i = 0..51 (six entries per row). */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  76
  /** Lookup table: ff_div6[i] == i / 6 for i = 0..51 (six entries per row). */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  80
  81
  /**
   * Loads the neighbour-dependent caches for the macroblock at h->mb_xy.
   *
   * Computes the indices of the top, top-left, top-right and the two left
   * neighbour macroblocks (with extra adjustments under FRAME_MBAFF), stores
   * them in h->top_mb_xy and h->left_mb_xy[], and then fills, as applicable:
   * the intra sample-availability masks, intra4x4_pred_mode_cache,
   * non_zero_count_cache, top_cbp/left_cbp (CABAC only), mv_cache/ref_cache,
   * mvd_cache and direct_cache (CABAC only) and neighbor_transform_size.
   *
   * @param h           decoder context; read and updated in place
   * @param mb_type     type flags of the current macroblock
   * @param for_deblock non-zero when called on behalf of the deblocking
   *                    filter: the top-left/top-right neighbours are ignored,
   *                    the other neighbours count as present whenever their
   *                    slice_table entry is < 255 (instead of having to match
   *                    h->slice_num), and outside FRAME_MBAFF the function
   *                    returns immediately if h->slice_num == 1 or the top
   *                    neighbour has the same slice_table entry as this MB.
   */
  82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  83     MpegEncContext * const s = &h->s;
  84     const int mb_xy= h->mb_xy;
  85     int topleft_xy, top_xy, topright_xy, left_xy[2];
  86     int topleft_type, top_type, topright_type, left_type[2];
  87     int left_block[8];
  88     int topleft_partition= -1;
  89     int i;
  90
         // top neighbour: (mb_stride << FIELD_PICTURE) entries before the current MB
  91     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  92
  93     //FIXME deblocking could skip the intra and nnz parts.
  94     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  95         return;
  96
  97     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  98      * stuff, I can't imagine that these complex rules are worth it. */
  99
         // default neighbour positions; the FRAME_MBAFF branch below may override them
 100     topleft_xy = top_xy - 1;
 101     topright_xy= top_xy + 1;
 102     left_xy[1] = left_xy[0] = mb_xy-1;
         // left_block[0..3]: positions in the left neighbours used for the
         // pred-mode, nnz and mv lookups; left_block[4..7]: nnz indices that
         // feed column 0 of non_zero_count_cache
 103     left_block[0]= 0;
 104     left_block[1]= 1;
 105     left_block[2]= 2;
 106     left_block[3]= 3;
 107     left_block[4]= 7;
 108     left_block[5]= 10;
 109     left_block[6]= 8;
 110     left_block[7]= 11;
 111     if(FRAME_MBAFF){
             // MBAFF: neighbours are addressed via the macroblock pair starting
             // at an even row; each *_mb_frame_flag is 1 when the respective
             // pair (or the current MB) is not interlaced
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             // interlaced current MB: the top neighbour lies one more row up
             // (for the top MB of a pair only if the pair above is interlaced too)
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
             // left pair coded differently (frame vs. field) from the current MB:
             // start from the top MB of the left pair and remap which of its
             // positions are read
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block[0]= 2;
 150                     left_block[1]= 2;
 151                     left_block[2]= 3;
 152                     left_block[3]= 3;
 153                     left_block[4]= 8;
 154                     left_block[5]= 11;
 155                     left_block[6]= 8;
 156                     left_block[7]= 11;
 157                 } else {
 158                     left_block[0]= 0;
 159                     left_block[1]= 0;
 160                     left_block[2]= 1;
 161                     left_block[3]= 1;
 162                     left_block[4]= 7;
 163                     left_block[5]= 10;
 164                     left_block[6]= 7;
 165                     left_block[7]= 10;
 166                 }
 167             } else {
                     // interlaced current MB next to a non-interlaced pair: the second
                     // left neighbour is the bottom MB of that pair
 168                 left_xy[1] += s->mb_stride;
 169                 //left_block[0]= 0;
 170                 left_block[1]= 2;
 171                 left_block[2]= 0;
 172                 left_block[3]= 2;
 173                 //left_block[4]= 7;
 174                 left_block[5]= 10;
 175                 left_block[6]= 7;
 176                 left_block[7]= 10;
 177             }
 178         }
 179     }
 180
         // store the neighbour indices in the context
 181     h->top_mb_xy = top_xy;
 182     h->left_mb_xy[0] = left_xy[0];
 183     h->left_mb_xy[1] = left_xy[1];
 184     if(for_deblock){
             // deblocking: top-left/top-right are treated as absent; the other
             // neighbours count whenever their slice_table entry is < 255
 185         topleft_type = 0;
 186         topright_type = 0;
 187         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 188         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 189         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 190
             // MBAFF non-intra MB: reload the cache entries of the current MB
             // itself: one nnz flag per block from the 16 bit mask stored at
             // non_zero_count[mb_xy][14], plus its own motion vectors and
             // reference indices
 191         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 192             int list;
 193             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 194             for(i=0; i<16; i++)
 195                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 196             for(list=0; list<h->list_count; list++){
 197                 if(USES_LIST(mb_type,list)){
 198                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 199                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 200                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 201                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 202                         dst[0] = src[0];
 203                         dst[1] = src[1];
 204                         dst[2] = src[2];
 205                         dst[3] = src[3];
 206                     }
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 209                     ref += h->b8_stride;
 210                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 211                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 212                 }else{
 213                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 214                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 215                 }
 216             }
 217         }
 218     }else{
             // not deblocking: a neighbour is only used when its slice_table
             // entry equals the current slice number
 219         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 220         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 221         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 222         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 223         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 224     }
 225
         // intra MB: start from fixed bit masks and clear bits for every
         // neighbour that is missing, or that is non-intra while
         // constrained_intra_pred is set
 226     if(IS_INTRA(mb_type)){
 227         h->topleft_samples_available=
 228         h->top_samples_available=
 229         h->left_samples_available= 0xFFFF;
 230         h->topright_samples_available= 0xEEEA;
 231
 232         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 233             h->topleft_samples_available= 0xB3FF;
 234             h->top_samples_available= 0x33FF;
 235             h->topright_samples_available= 0x26EA;
 236         }
 237         for(i=0; i<2; i++){
 238             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 239                 h->topleft_samples_available&= 0xDF5F;
 240                 h->left_samples_available&= 0x5F5F;
 241             }
 242         }
 243
 244         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 245             h->topleft_samples_available&= 0x7FFF;
 246
 247         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 248             h->topright_samples_available&= 0xFBFF;
 249
             // intra4x4: seed the top row and left column of the pred-mode cache
             // from the neighbours; neighbours that are not intra4x4 contribute
             // -1 (missing, or inter with constrained_intra_pred) or 2
 250         if(IS_INTRA4x4(mb_type)){
 251             if(IS_INTRA4x4(top_type)){
 252                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 253                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 254                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 255                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 256             }else{
 257                 int pred;
 258                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 259                     pred= -1;
 260                 else{
 261                     pred= 2;
 262                 }
 263                 h->intra4x4_pred_mode_cache[4+8*0]=
 264                 h->intra4x4_pred_mode_cache[5+8*0]=
 265                 h->intra4x4_pred_mode_cache[6+8*0]=
 266                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 267             }
 268             for(i=0; i<2; i++){
 269                 if(IS_INTRA4x4(left_type[i])){
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 272                 }else{
 273                     int pred;
 274                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 275                         pred= -1;
 276                     else{
 277                         pred= 2;
 278                     }
 279                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 280                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 281                 }
 282             }
 283         }
 284     }
 285
 286
 287 /*
 288 0 . T T. T T T T
 289 1 L . .L . . . .
 290 2 L . .L . . . .
 291 3 . T TL . . . .
 292 4 L . .L . . . .
 293 5 L . .. . . . .
 294 */
 295 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         // non-zero counts of the top neighbour go into the T positions of the
         // diagram above; a missing neighbour yields 0 for CABAC non-intra MBs
         // and 64 otherwise
 296     if(top_type){
 297         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 298         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 299         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 300         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 301
 302         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 303         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 304
 305         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 306         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 307
 308     }else{
 309         h->non_zero_count_cache[4+8*0]=
 310         h->non_zero_count_cache[5+8*0]=
 311         h->non_zero_count_cache[6+8*0]=
 312         h->non_zero_count_cache[7+8*0]=
 313
 314         h->non_zero_count_cache[1+8*0]=
 315         h->non_zero_count_cache[2+8*0]=
 316
 317         h->non_zero_count_cache[1+8*3]=
 318         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 319
 320     }
 321
         // same for the two left neighbours (L positions of the diagram)
 322     for (i=0; i<2; i++) {
 323         if(left_type[i]){
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 326             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 328         }else{
 329             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 330             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 331             h->non_zero_count_cache[0+8*1 +   8*i]=
 332             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 333         }
 334     }
 335
         // CABAC: cbp of the top/left neighbours; a missing neighbour yields
         // 0x1C0 for intra MBs and 0 otherwise
 336     if( h->pps.cabac ) {
 337         // top_cbp
 338         if(top_type) {
 339             h->top_cbp = h->cbp_table[top_xy];
 340         } else if(IS_INTRA(mb_type)) {
 341             h->top_cbp = 0x1C0;
 342         } else {
 343             h->top_cbp = 0;
 344         }
 345         // left_cbp
 346         if (left_type[0]) {
 347             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 348         } else if(IS_INTRA(mb_type)) {
 349             h->left_cbp = 0x1C0;
 350         } else {
 351             h->left_cbp = 0;
 352         }
             // bits 1 and 3 of left_cbp are taken from the cbp bit selected by
             // left_block[0] / left_block[2] of the respective left neighbour
 353         if (left_type[0]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 355         }
 356         if (left_type[1]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 358         }
 359     }
 360
 361 #if 1
         // inter/direct MB: load the motion vectors and reference indices of
         // the neighbours for every reference list
 362     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 363         int list;
 364         for(list=0; list<h->list_count; list++){
                 // list not used by this MB, MB not direct and deblocking filter
                 // off: nothing is loaded for this list
 365             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 366                 /*if(!h->mv_cache_clean[list]){
 367                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 368                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 369                     h->mv_cache_clean[list]= 1;
 370                 }*/
 371                 continue;
 372             }
 373             h->mv_cache_clean[list]= 0;
 374
                 // top neighbour: its last row of motion vectors and the two
                 // matching reference indices go into the cache row above the MB
 375             if(USES_LIST(top_type, list)){
 376                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 377                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 382                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 383                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 384                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 385                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 386             }else{
                     // zero mvs; ref is LIST_NOT_USED when the neighbour exists but
                     // does not use this list, PART_NOT_AVAILABLE when it is missing
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 391                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 392             }
 393
                 // left neighbours: each fills two entries of the cache column
                 // left of the MB
 394             for(i=0; i<2; i++){
 395                 int cache_idx = scan8[0] - 1 + i*2*8;
 396                 if(USES_LIST(left_type[i], list)){
 397                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 398                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 399                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 400                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 401                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 402                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 403                 }else{
 404                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 405                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 406                     h->ref_cache[list][cache_idx  ]=
 407                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 408                 }
 409             }
 410
                 // outside MBAFF, the rest of this iteration is skipped for
                 // deblocking and for direct MBs without spatial mv prediction
 411             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 412                 continue;
 413
                 // top-left neighbour (topleft_partition is -1 by default, so the
                 // masked offsets below apply unless it was set to 0 above)
 414             if(USES_LIST(topleft_type, list)){
 415                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 416                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 419             }else{
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 422             }
 423
                 // top-right neighbour
 424             if(USES_LIST(topright_type, list)){
 425                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 426                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 427                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 429             }else{
 430                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 432             }
 433
                 // outside MBAFF, skip/direct MBs need nothing of what follows
 434             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 435                 continue;
 436
                 // preset five cache positions to "not available" with zero mv
 437             h->ref_cache[list][scan8[5 ]+1] =
 438             h->ref_cache[list][scan8[7 ]+1] =
 439             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 440             h->ref_cache[list][scan8[4 ]] =
 441             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 442             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 443             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 445             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 446             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 447
 448             if( h->pps.cabac ) {
 449                 /* XXX beurk, Load mvd */
                     // mvd of the top and left neighbours, laid out like the mv cache;
                     // zero where the neighbour does not use this list
 450                 if(USES_LIST(top_type, list)){
 451                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 456                 }else{
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 461                 }
 462                 if(USES_LIST(left_type[0], list)){
 463                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 466                 }else{
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 469                 }
 470                 if(USES_LIST(left_type[1], list)){
 471                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 472                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 474                 }else{
 475                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 477                 }
 478                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 481                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 483
                     // B slices: clear the direct flags of the MB itself, then set
                     // those of the top/left neighbours (1 for direct MBs, the
                     // direct_table entries for 8x8 MBs, 0 otherwise)
 484                 if(h->slice_type == FF_B_TYPE){
 485                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 486
 487                     if(IS_DIRECT(top_type)){
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 489                     }else if(IS_8X8(top_type)){
 490                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 491                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 492                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 493                     }else{
 494                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 495                     }
 496
 497                     if(IS_DIRECT(left_type[0]))
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 499                     else if(IS_8X8(left_type[0]))
 500                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 501                     else
 502                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 503
 504                     if(IS_DIRECT(left_type[1]))
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 506                     else if(IS_8X8(left_type[1]))
 507                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 508                     else
 509                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 510                 }
 511             }
 512
 513             if(FRAME_MBAFF){
                     // MBAFF: rescale the cached neighbour entries (MAP_MVS lists
                     // them) to the coding of the current MB. With MB_FIELD set,
                     // non-interlaced neighbours get their ref index doubled and the
                     // vertical mv/mvd halved; otherwise interlaced neighbours get
                     // the ref index halved and the vertical mv/mvd doubled.
                     // Entries with a negative ref index are left untouched.
 514 #define MAP_MVS\
 515                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 516                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 521                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 522                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 524                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 525                 if(MB_FIELD){
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] <<= 1;\
 529                         h->mv_cache[list][idx][1] /= 2;\
 530                         h->mvd_cache[list][idx][1] /= 2;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }else{
 535 #define MAP_F2F(idx, mb_type)\
 536                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 537                         h->ref_cache[list][idx] >>= 1;\
 538                         h->mv_cache[list][idx][1] <<= 1;\
 539                         h->mvd_cache[list][idx][1] <<= 1;\
 540                     }
 541                     MAP_MVS
 542 #undef MAP_F2F
 543                 }
 544             }
 545         }
 546     }
 547 #endif
 548
         // how many of the top and first left neighbours are flagged IS_8x8DCT (0..2)
 549     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 550 }
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
/**
 * Fetches the motion vector and reference index of the diagonal neighbour
 * used for MV prediction.
 * Outside of the MBAFF special cases this is the cache entry one row up and
 * part_width columns to the right; if that entry is PART_NOT_AVAILABLE the
 * entry one row up and one column to the left is used instead.
 * In MBAFF frames some neighbours are read directly from the current
 * picture's motion_val/ref_index arrays; the vertical MV component and the
 * reference index are then rescaled and the MV is placed in the cache slot
 * scan8[0]-2, which *C is made to point to.
 * @param C receives a pointer to the selected motion vector
 * @param i scan8 based cache index of the block
 * @param part_width width of the partition in cache columns
 * @return the reference index belonging to *C; the MBAFF paths return
 *         LIST_NOT_USED when the neighbouring macroblock does not use list
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* scratch slot for a rescaled MV, cleared before use */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        /* frame MB in the bottom row of a pair, block in the first cache row,
         * top right neighbour available */
        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* Reads the MV at 4x4 block position (X4,Y4) of the current picture, applies
 * MV_OP to its vertical component and REF_OP to its reference index, and
 * returns from the enclosing function. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top right not available, block in cache column 4, left neighbour
         * available: take the MV from the left macroblock if its
         * frame/field type differs from the current one */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: top right neighbour, falling back to top left */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
/**
 * Derives the motion vectors and reference indices of a direct predicted
 * macroblock (or of its direct predicted 8x8 sub blocks) and writes them into
 * h->mv_cache and h->ref_cache.
 *
 * The colocated data (mb_type, motion_val, ref_index) is read from the first
 * list 1 reference, h->ref_list[1][0]. Depending on
 * h->direct_spatial_mv_pred either the spatial or the temporal derivation is
 * used.
 *
 * @param mb_type in: the macroblock type as parsed; if it is an 8x8 type only
 *                the sub blocks whose h->sub_mb_type[] is direct are
 *                processed. out: the partitioning / list usage flags chosen
 *                for the direct prediction. h->sub_mb_type[] of the processed
 *                sub blocks is updated as well.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   h->mb_xy;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    /* colocated MVs and reference indices of both lists, based at the
     * position of the current macroblock */
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

    /* choose the partitioning used for the direct prediction from the type
     * of the colocated macroblock */
#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            /* top right not usable: fall back to the top left neighbour */
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            /* smallest non negative reference index of the three neighbours */
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        /* no usable neighbour in either list: reference 0 with zero MVs for
         * both lists */
        if(ref[0] < 0 && ref[1] < 0){
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the list without a usable reference from the type flags */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;
        }

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
            /* the current and the colocated macroblock differ in frame/field
             * type: the colocated pointers and strides are adjusted below */
            int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
            int mb_types_col[2];
            int b8_stride = h->b8_stride;
            int b4_stride = h->b_stride;

            *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;

            if(IS_INTERLACED(*mb_type)){
                /* current MB is a field MB: both colocated MBs of the pair
                 * are used, pointers are rebased to the top MB of the pair */
                mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                if(s->mb_y&1){
                    l1ref0 -= 2*b8_stride;
                    l1ref1 -= 2*b8_stride;
                    l1mv0 -= 4*b4_stride;
                    l1mv1 -= 4*b4_stride;
                }
                b8_stride *= 3;
                b4_stride *= 6;
            }else{
                /* current MB is a frame MB: one colocated MB of the pair is
                 * selected by comparing the field POC distances to the
                 * current POC */
                int cur_poc = s->current_picture_ptr->poc;
                int *col_poc = h->ref_list[1]->field_poc;
                int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
                int dy = 2*col_parity - (s->mb_y&1);
                mb_types_col[0] =
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
                l1ref0 += dy*b8_stride;
                l1ref1 += dy*b8_stride;
                l1mv0 += 2*dy*b4_stride;
                l1mv1 += 2*dy*b4_stride;
                b8_stride = 0;
            }

            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* colocated block has reference 0 and an MV within +-1:
                 * the predicted MV is only kept for lists whose reference
                 * index is above 0, otherwise it stays zero */
                if(!IS_INTRA(mb_types_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                }
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
            }
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* same zero MV test as above for the whole macroblock; the list 1
             * alternative is not used when x264_build is in the range 1..33
             * (presumably to match the output of those encoder builds -
             * TODO confirm) */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                /* start with the predicted MVs, zeroed below where needed */
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        /* one test per 8x8 block, using the colocated MV at
                         * the outer corner of the 8x8 block */
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    /* one test per 4x4 block */
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            /* field macroblocks use the field variants of the tables */
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    if(s->mb_y&1){
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    /* list 1 always uses reference 0 */
                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    /* intra colocated MB: reference 0 and zero MVs */
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    /* list 0 reference: mapped colocated list 0 reference,
                     * or the mapped list 1 one if list 0 is unused there */
                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        /* the vertical component is halved (y_shift 0) or
                         * doubled (y_shift 2) before scaling; the list 1 MV
                         * is the list 0 MV minus the colocated MV */
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        int my_col = (mv_col[1]<<y_shift)/2;
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                /* list 0 MV = scaled colocated MV, list 1 MV = list 0 MV
                 * minus the colocated MV */
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                /* intra colocated MB: reference 0 and zero MVs */
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    /* one MV per 8x8 block, taken from the outer corner */
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                /* one MV per 4x4 block */
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1325
1326 static inline void write_back_motion(H264Context *h, int mb_type){
1327     MpegEncContext * const s = &h->s;
1328     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1329     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1330     int list;
1331
1332     if(!USES_LIST(mb_type, 0))
1333         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1334
1335     for(list=0; list<h->list_count; list++){
1336         int y;
1337         if(!USES_LIST(mb_type, list))
1338             continue;
1339
1340         for(y=0; y<4; y++){
1341             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1342             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1343         }
1344         if( h->pps.cabac ) {
1345             if(IS_SKIP(mb_type))
1346                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1347             else
1348             for(y=0; y<4; y++){
1349                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1350                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1351             }
1352         }
1353
1354         {
1355             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1356             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1357             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1358             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1359             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1360         }
1361     }
1362
1363     if(h->slice_type == FF_B_TYPE && h->pps.cabac){
1364         if(IS_8X8(mb_type)){
1365             uint8_t *direct_table = &h->direct_table[b8_xy];
1366             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1367             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1368             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1369         }
1370     }
1371 }
1372
1373 /**
1374  * Decodes a network abstraction layer unit.
1375  * @param consumed is the number of bytes used as input
1376  * @param length is the length of the array
1377  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1378  * @returns decoded bytes, might be src+1 if no escapes
1379  */
1380 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1381     int i, si, di;
1382     uint8_t *dst;
1383     int bufidx;
1384
1385 //    src[0]&0x80;                //forbidden bit
1386     h->nal_ref_idc= src[0]>>5;
1387     h->nal_unit_type= src[0]&0x1F;
1388
1389     src++; length--;
1390 #if 0
1391     for(i=0; i<length; i++)
1392         printf("%2X ", src[i]);
1393 #endif
1394     for(i=0; i+1<length; i+=2){
1395         if(src[i]) continue;
1396         if(i>0 && src[i-1]==0) i--;
1397         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1398             if(src[i+2]!=3){
1399                 /* startcode, so we must be past the end */
1400                 length=i;
1401             }
1402             break;
1403         }
1404     }
1405
1406     if(i>=length-1){ //no escaped 0
1407         *dst_length= length;
1408         *consumed= length+1; //+1 for the header
1409         return src;
1410     }
1411
1412     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1413     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1414     dst= h->rbsp_buffer[bufidx];
1415
1416     if (dst == NULL){
1417         return NULL;
1418     }
1419
1420 //printf("decoding esc\n");
1421     si=di=0;
1422     while(si<length){
1423         //remove escapes (very rare 1:2^22)
1424         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1425             if(src[si+2]==3){ //escape
1426                 dst[di++]= 0;
1427                 dst[di++]= 0;
1428                 si+=3;
1429                 continue;
1430             }else //next start code
1431                 break;
1432         }
1433
1434         dst[di++]= src[si++];
1435     }
1436
1437     *dst_length= di;
1438     *consumed= si + 1;//+1 for the header
1439 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1440     return dst;
1441 }
1442
1443 /**
1444  * identifies the exact end of the bitstream
1445  * @return the length of the trailing, or 0 if damaged
1446  */
1447 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1448     int v= *src;
1449     int r;
1450
1451     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1452
1453     for(r=1; r<9; r++){
1454         if(v&1) return r;
1455         v>>=1;
1456     }
1457     return 0;
1458 }
1459
1460 /**
1461  * idct tranforms the 16 dc values and dequantize them.
1462  * @param qp quantization parameter
1463  */
1464 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1465 #define stride 16
1466     int i;
1467     int temp[16]; //FIXME check if this is a good idea
1468     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1469     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1470
1471 //memset(block, 64, 2*256);
1472 //return;
1473     for(i=0; i<4; i++){
1474         const int offset= y_offset[i];
1475         const int z0= block[offset+stride*0] + block[offset+stride*4];
1476         const int z1= block[offset+stride*0] - block[offset+stride*4];
1477         const int z2= block[offset+stride*1] - block[offset+stride*5];
1478         const int z3= block[offset+stride*1] + block[offset+stride*5];
1479
1480         temp[4*i+0]= z0+z3;
1481         temp[4*i+1]= z1+z2;
1482         temp[4*i+2]= z1-z2;
1483         temp[4*i+3]= z0-z3;
1484     }
1485
1486     for(i=0; i<4; i++){
1487         const int offset= x_offset[i];
1488         const int z0= temp[4*0+i] + temp[4*2+i];
1489         const int z1= temp[4*0+i] - temp[4*2+i];
1490         const int z2= temp[4*1+i] - temp[4*3+i];
1491         const int z3= temp[4*1+i] + temp[4*3+i];
1492
1493         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1494         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1495         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1496         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1497     }
1498 }
1499
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; relies on the stride macro defined by the function above.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04  + sum15;
        out[1]= diff04 + diff15;
        out[2]= diff04 - diff15;
        out[3]= sum04  - sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1539
1540 #undef xStride
1541 #undef stride
1542
1543 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1544     const int stride= 16*2;
1545     const int xStride= 16;
1546     int a,b,c,d,e;
1547
1548     a= block[stride*0 + xStride*0];
1549     b= block[stride*0 + xStride*1];
1550     c= block[stride*1 + xStride*0];
1551     d= block[stride*1 + xStride*1];
1552
1553     e= a-b;
1554     a= a+b;
1555     b= c-d;
1556     c= c+d;
1557
1558     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1559     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1560     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1561     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1562 }
1563
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values, without
 * any scaling. Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    const int top_sum = block[stride*0 + xStride*0] + block[stride*0 + xStride*1];
    const int top_diff= block[stride*0 + xStride*0] - block[stride*0 + xStride*1];
    const int bot_sum = block[stride*1 + xStride*0] + block[stride*1 + xStride*1];
    const int bot_diff= block[stride*1 + xStride*0] - block[stride*1 + xStride*1];

    block[stride*0 + xStride*0]= (top_sum  + bot_sum );
    block[stride*0 + xStride*1]= (top_diff + bot_diff);
    block[stride*1 + xStride*0]= (top_sum  - bot_sum );
    block[stride*1 + xStride*1]= (top_diff - bot_diff);
}
#endif
1586
1587 /**
1588  * gets the chroma qp.
1589  */
1590 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1591     return h->pps.chroma_qp_table[t][qscale & 0xff];
1592 }
1593
1594 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1595 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1596 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1597     int i;
1598     const int * const quant_table= quant_coeff[qscale];
1599     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1600     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1601     const unsigned int threshold2= (threshold1<<1);
1602     int last_non_zero;
1603
1604     if(separate_dc){
1605         if(qscale<=18){
1606             //avoid overflows
1607             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1608             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1609             const unsigned int dc_threshold2= (dc_threshold1<<1);
1610
1611             int level= block[0]*quant_coeff[qscale+18][0];
1612             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1613                 if(level>0){
1614                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1615                     block[0]= level;
1616                 }else{
1617                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1618                     block[0]= -level;
1619                 }
1620 //                last_non_zero = i;
1621             }else{
1622                 block[0]=0;
1623             }
1624         }else{
1625             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1626             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1627             const unsigned int dc_threshold2= (dc_threshold1<<1);
1628
1629             int level= block[0]*quant_table[0];
1630             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1631                 if(level>0){
1632                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1633                     block[0]= level;
1634                 }else{
1635                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1636                     block[0]= -level;
1637                 }
1638 //                last_non_zero = i;
1639             }else{
1640                 block[0]=0;
1641             }
1642         }
1643         last_non_zero= 0;
1644         i=1;
1645     }else{
1646         last_non_zero= -1;
1647         i=0;
1648     }
1649
1650     for(; i<16; i++){
1651         const int j= scantable[i];
1652         int level= block[j]*quant_table[j];
1653
1654 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1655 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1656         if(((unsigned)(level+threshold1))>threshold2){
1657             if(level>0){
1658                 level= (bias + level)>>QUANT_SHIFT;
1659                 block[j]= level;
1660             }else{
1661                 level= (bias - level)>>QUANT_SHIFT;
1662                 block[j]= -level;
1663             }
1664             last_non_zero = i;
1665         }else{
1666             block[j]=0;
1667         }
1668     }
1669
1670     return last_non_zero;
1671 }
1672
1673 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1674                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1675                            int src_x_offset, int src_y_offset,
1676                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1677     MpegEncContext * const s = &h->s;
1678     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1679     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1680     const int luma_xy= (mx&3) + ((my&3)<<2);
1681     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1682     uint8_t * src_cb, * src_cr;
1683     int extra_width= h->emu_edge_width;
1684     int extra_height= h->emu_edge_height;
1685     int emu=0;
1686     const int full_mx= mx>>2;
1687     const int full_my= my>>2;
1688     const int pic_width  = 16*s->mb_width;
1689     const int pic_height = 16*s->mb_height >> MB_FIELD;
1690
1691     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1692         return;
1693
1694     if(mx&7) extra_width -= 3;
1695     if(my&7) extra_height -= 3;
1696
1697     if(   full_mx < 0-extra_width
1698        || full_my < 0-extra_height
1699        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1700        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1701         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1702             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1703         emu=1;
1704     }
1705
1706     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1707     if(!square){
1708         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1709     }
1710
1711     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1712
1713     if(MB_FIELD){
1714         // chroma offset when predicting from a field of opposite parity
1715         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1716         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1717     }
1718     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1719     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1720
1721     if(emu){
1722         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1723             src_cb= s->edge_emu_buffer;
1724     }
1725     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1726
1727     if(emu){
1728         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1729             src_cr= s->edge_emu_buffer;
1730     }
1731     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1732 }
1733
1734 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1735                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1736                            int x_offset, int y_offset,
1737                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1738                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1739                            int list0, int list1){
1740     MpegEncContext * const s = &h->s;
1741     qpel_mc_func *qpix_op=  qpix_put;
1742     h264_chroma_mc_func chroma_op= chroma_put;
1743
1744     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1745     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1746     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1747     x_offset += 8*s->mb_x;
1748     y_offset += 8*(s->mb_y >> MB_FIELD);
1749
1750     if(list0){
1751         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1752         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1753                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1754                            qpix_op, chroma_op);
1755
1756         qpix_op=  qpix_avg;
1757         chroma_op= chroma_avg;
1758     }
1759
1760     if(list1){
1761         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1762         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1763                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1764                            qpix_op, chroma_op);
1765     }
1766 }
1767
1768 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1769                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1770                            int x_offset, int y_offset,
1771                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1772                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1773                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1774                            int list0, int list1){
1775     MpegEncContext * const s = &h->s;
1776
1777     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1778     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1779     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1780     x_offset += 8*s->mb_x;
1781     y_offset += 8*(s->mb_y >> MB_FIELD);
1782
1783     if(list0 && list1){
1784         /* don't optimize for luma-only case, since B-frames usually
1785          * use implicit weights => chroma too. */
1786         uint8_t *tmp_cb = s->obmc_scratchpad;
1787         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1788         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1789         int refn0 = h->ref_cache[0][ scan8[n] ];
1790         int refn1 = h->ref_cache[1][ scan8[n] ];
1791
1792         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1793                     dest_y, dest_cb, dest_cr,
1794                     x_offset, y_offset, qpix_put, chroma_put);
1795         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1796                     tmp_y, tmp_cb, tmp_cr,
1797                     x_offset, y_offset, qpix_put, chroma_put);
1798
1799         if(h->use_weight == 2){
1800             int weight0 = h->implicit_weight[refn0][refn1];
1801             int weight1 = 64 - weight0;
1802             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1803             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1804             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1805         }else{
1806             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1807                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1808                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1809             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1810                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1811                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1812             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1813                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1814                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1815         }
1816     }else{
1817         int list = list1 ? 1 : 0;
1818         int refn = h->ref_cache[list][ scan8[n] ];
1819         Picture *ref= &h->ref_list[list][refn];
1820         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1821                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1822                     qpix_put, chroma_put);
1823
1824         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1825                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1826         if(h->use_weight_chroma){
1827             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1828                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1829             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1830                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1831         }
1832     }
1833 }
1834
1835 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1836                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1837                            int x_offset, int y_offset,
1838                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1839                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1840                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1841                            int list0, int list1){
1842     if((h->use_weight==2 && list0 && list1
1843         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1844        || h->use_weight==1)
1845         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                          x_offset, y_offset, qpix_put, chroma_put,
1847                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1848     else
1849         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1850                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1851 }
1852
1853 static inline void prefetch_motion(H264Context *h, int list){
1854     /* fetch pixels for estimated mv 4 macroblocks ahead
1855      * optimized for 64byte cache lines */
1856     MpegEncContext * const s = &h->s;
1857     const int refn = h->ref_cache[list][scan8[0]];
1858     if(refn >= 0){
1859         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1860         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1861         uint8_t **src= h->ref_list[list][refn].data;
1862         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1863         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1864         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1865         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1866     }
1867 }
1868
1869 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1870                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1871                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1872                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1873     MpegEncContext * const s = &h->s;
1874     const int mb_xy= h->mb_xy;
1875     const int mb_type= s->current_picture.mb_type[mb_xy];
1876
1877     assert(IS_INTER(mb_type));
1878
1879     prefetch_motion(h, 0);
1880
1881     if(IS_16X16(mb_type)){
1882         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1883                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1884                 &weight_op[0], &weight_avg[0],
1885                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1886     }else if(IS_16X8(mb_type)){
1887         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1888                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889                 &weight_op[1], &weight_avg[1],
1890                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1891         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1892                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1893                 &weight_op[1], &weight_avg[1],
1894                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1895     }else if(IS_8X16(mb_type)){
1896         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1897                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898                 &weight_op[2], &weight_avg[2],
1899                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1900         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1901                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1902                 &weight_op[2], &weight_avg[2],
1903                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1904     }else{
1905         int i;
1906
1907         assert(IS_8X8(mb_type));
1908
1909         for(i=0; i<4; i++){
1910             const int sub_mb_type= h->sub_mb_type[i];
1911             const int n= 4*i;
1912             int x_offset= (i&1)<<2;
1913             int y_offset= (i&2)<<1;
1914
1915             if(IS_SUB_8X8(sub_mb_type)){
1916                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1917                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1918                     &weight_op[3], &weight_avg[3],
1919                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1920             }else if(IS_SUB_8X4(sub_mb_type)){
1921                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1922                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923                     &weight_op[4], &weight_avg[4],
1924                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1926                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1927                     &weight_op[4], &weight_avg[4],
1928                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1929             }else if(IS_SUB_4X8(sub_mb_type)){
1930                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1931                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932                     &weight_op[5], &weight_avg[5],
1933                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1935                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1936                     &weight_op[5], &weight_avg[5],
1937                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1938             }else{
1939                 int j;
1940                 assert(IS_SUB_4X4(sub_mb_type));
1941                 for(j=0; j<4; j++){
1942                     int sub_x_offset= x_offset + 2*(j&1);
1943                     int sub_y_offset= y_offset +   (j&2);
1944                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1945                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1946                         &weight_op[6], &weight_avg[6],
1947                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1948                 }
1949             }
1950         }
1951     }
1952
1953     prefetch_motion(h, 1);
1954 }
1955
1956 static av_cold void decode_init_vlc(void){
1957     static int done = 0;
1958
1959     if (!done) {
1960         int i;
1961         done = 1;
1962
1963         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1964                  &chroma_dc_coeff_token_len [0], 1, 1,
1965                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1966
1967         for(i=0; i<4; i++){
1968             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1969                      &coeff_token_len [i][0], 1, 1,
1970                      &coeff_token_bits[i][0], 1, 1, 1);
1971         }
1972
1973         for(i=0; i<3; i++){
1974             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1975                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1976                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1977         }
1978         for(i=0; i<15; i++){
1979             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1980                      &total_zeros_len [i][0], 1, 1,
1981                      &total_zeros_bits[i][0], 1, 1, 1);
1982         }
1983
1984         for(i=0; i<6; i++){
1985             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1986                      &run_len [i][0], 1, 1,
1987                      &run_bits[i][0], 1, 1, 1);
1988         }
1989         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1990                  &run_len [6][0], 1, 1,
1991                  &run_bits[6][0], 1, 1, 1);
1992     }
1993 }
1994
1995 static void free_tables(H264Context *h){
1996     int i;
1997     H264Context *hx;
1998     av_freep(&h->intra4x4_pred_mode);
1999     av_freep(&h->chroma_pred_mode_table);
2000     av_freep(&h->cbp_table);
2001     av_freep(&h->mvd_table[0]);
2002     av_freep(&h->mvd_table[1]);
2003     av_freep(&h->direct_table);
2004     av_freep(&h->non_zero_count);
2005     av_freep(&h->slice_table_base);
2006     h->slice_table= NULL;
2007
2008     av_freep(&h->mb2b_xy);
2009     av_freep(&h->mb2b8_xy);
2010
2011     for(i = 0; i < MAX_SPS_COUNT; i++)
2012         av_freep(h->sps_buffers + i);
2013
2014     for(i = 0; i < MAX_PPS_COUNT; i++)
2015         av_freep(h->pps_buffers + i);
2016
2017     for(i = 0; i < h->s.avctx->thread_count; i++) {
2018         hx = h->thread_context[i];
2019         if(!hx) continue;
2020         av_freep(&hx->top_borders[1]);
2021         av_freep(&hx->top_borders[0]);
2022         av_freep(&hx->s.obmc_scratchpad);
2023     }
2024 }
2025
2026 static void init_dequant8_coeff_table(H264Context *h){
2027     int i,q,x;
2028     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2029     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2030     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2031
2032     for(i=0; i<2; i++ ){
2033         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2034             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2035             break;
2036         }
2037
2038         for(q=0; q<52; q++){
2039             int shift = ff_div6[q];
2040             int idx = ff_rem6[q];
2041             for(x=0; x<64; x++)
2042                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2043                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2044                     h->pps.scaling_matrix8[i][x]) << shift;
2045         }
2046     }
2047 }
2048
2049 static void init_dequant4_coeff_table(H264Context *h){
2050     int i,j,q,x;
2051     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2052     for(i=0; i<6; i++ ){
2053         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2054         for(j=0; j<i; j++){
2055             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2056                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2057                 break;
2058             }
2059         }
2060         if(j<i)
2061             continue;
2062
2063         for(q=0; q<52; q++){
2064             int shift = ff_div6[q] + 2;
2065             int idx = ff_rem6[q];
2066             for(x=0; x<16; x++)
2067                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2068                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2069                     h->pps.scaling_matrix4[i][x]) << shift;
2070         }
2071     }
2072 }
2073
2074 static void init_dequant_tables(H264Context *h){
2075     int i,x;
2076     init_dequant4_coeff_table(h);
2077     if(h->pps.transform_8x8_mode)
2078         init_dequant8_coeff_table(h);
2079     if(h->sps.transform_bypass){
2080         for(i=0; i<6; i++)
2081             for(x=0; x<16; x++)
2082                 h->dequant4_coeff[i][0][x] = 1<<6;
2083         if(h->pps.transform_8x8_mode)
2084             for(i=0; i<2; i++)
2085                 for(x=0; x<64; x++)
2086                     h->dequant8_coeff[i][0][x] = 1<<6;
2087     }
2088 }
2089
2090
2091 /**
2092  * allocates tables.
2093  * needs width/height
2094  */
2095 static int alloc_tables(H264Context *h){
2096     MpegEncContext * const s = &h->s;
2097     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2098     int x,y;
2099
2100     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2101
2102     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2105
2106     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2107     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2108     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2109     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2110
2111     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2112     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2113
2114     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2115     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2116     for(y=0; y<s->mb_height; y++){
2117         for(x=0; x<s->mb_width; x++){
2118             const int mb_xy= x + y*s->mb_stride;
2119             const int b_xy = 4*x + 4*y*h->b_stride;
2120             const int b8_xy= 2*x + 2*y*h->b8_stride;
2121
2122             h->mb2b_xy [mb_xy]= b_xy;
2123             h->mb2b8_xy[mb_xy]= b8_xy;
2124         }
2125     }
2126
2127     s->obmc_scratchpad = NULL;
2128
2129     if(!h->dequant4_coeff[0])
2130         init_dequant_tables(h);
2131
2132     return 0;
2133 fail:
2134     free_tables(h);
2135     return -1;
2136 }
2137
2138 /**
2139  * Mimic alloc_tables(), but for every context thread.
2140  */
2141 static void clone_tables(H264Context *dst, H264Context *src){
2142     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2143     dst->non_zero_count           = src->non_zero_count;
2144     dst->slice_table              = src->slice_table;
2145     dst->cbp_table                = src->cbp_table;
2146     dst->mb2b_xy                  = src->mb2b_xy;
2147     dst->mb2b8_xy                 = src->mb2b8_xy;
2148     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2149     dst->mvd_table[0]             = src->mvd_table[0];
2150     dst->mvd_table[1]             = src->mvd_table[1];
2151     dst->direct_table             = src->direct_table;
2152
2153     dst->s.obmc_scratchpad = NULL;
2154     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2155 }
2156
2157 /**
2158  * Init context
2159  * Allocate buffers which are not shared amongst multiple threads.
2160  */
2161 static int context_init(H264Context *h){
2162     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2163     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2164
2165     return 0;
2166 fail:
2167     return -1; // free_tables will clean up for us
2168 }
2169
2170 static av_cold void common_init(H264Context *h){
2171     MpegEncContext * const s = &h->s;
2172
2173     s->width = s->avctx->width;
2174     s->height = s->avctx->height;
2175     s->codec_id= s->avctx->codec->id;
2176
2177     ff_h264_pred_init(&h->hpc, s->codec_id);
2178
2179     h->dequant_coeff_pps= -1;
2180     s->unrestricted_mv=1;
2181     s->decode=1; //FIXME
2182
2183     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2184     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2185 }
2186
2187 static av_cold int decode_init(AVCodecContext *avctx){
2188     H264Context *h= avctx->priv_data;
2189     MpegEncContext * const s = &h->s;
2190
2191     MPV_decode_defaults(s);
2192
2193     s->avctx = avctx;
2194     common_init(h);
2195
2196     s->out_format = FMT_H264;
2197     s->workaround_bugs= avctx->workaround_bugs;
2198
2199     // set defaults
2200 //    s->decode_mb= ff_h263_decode_mb;
2201     s->quarter_sample = 1;
2202     s->low_delay= 1;
2203     avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258     return 0;
2259 }
2260
2261 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2262     MpegEncContext * const s = &h->s;
2263     int i;
2264
2265     src_y  -=   linesize;
2266     src_cb -= uvlinesize;
2267     src_cr -= uvlinesize;
2268
2269     // There are two lines saved, the line above the the top macroblock of a pair,
2270     // and the line above the bottom macroblock
2271     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2272     for(i=1; i<17; i++){
2273         h->left_border[i]= src_y[15+i*  linesize];
2274     }
2275
2276     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2277     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2278
2279     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2280         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2281         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2282         for(i=1; i<9; i++){
2283             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2284             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2285         }
2286         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2287         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2288     }
2289 }
2290
2291 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2292     MpegEncContext * const s = &h->s;
2293     int temp8, i;
2294     uint64_t temp64;
2295     int deblock_left;
2296     int deblock_top;
2297     int mb_xy;
2298
2299     if(h->deblocking_filter == 2) {
2300         mb_xy = h->mb_xy;
2301         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2302         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2303     } else {
2304         deblock_left = (s->mb_x > 0);
2305         deblock_top =  (s->mb_y > 0);
2306     }
2307
2308     src_y  -=   linesize + 1;
2309     src_cb -= uvlinesize + 1;
2310     src_cr -= uvlinesize + 1;
2311
2312 #define XCHG(a,b,t,xchg)\
2313 t= a;\
2314 if(xchg)\
2315     a= b;\
2316 b= t;
2317
2318     if(deblock_left){
2319         for(i = !deblock_top; i<17; i++){
2320             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2321         }
2322     }
2323
2324     if(deblock_top){
2325         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2326         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2327         if(s->mb_x+1 < s->mb_width){
2328             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2329         }
2330     }
2331
2332     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2333         if(deblock_left){
2334             for(i = !deblock_top; i<9; i++){
2335                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2336                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2337             }
2338         }
2339         if(deblock_top){
2340             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2341             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2342         }
2343     }
2344 }
2345
2346 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2347     MpegEncContext * const s = &h->s;
2348     int i;
2349
2350     src_y  -= 2 *   linesize;
2351     src_cb -= 2 * uvlinesize;
2352     src_cr -= 2 * uvlinesize;
2353
2354     // There are two lines saved, the line above the the top macroblock of a pair,
2355     // and the line above the bottom macroblock
2356     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2357     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2358     for(i=2; i<34; i++){
2359         h->left_border[i]= src_y[15+i*  linesize];
2360     }
2361
2362     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2363     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2364     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2365     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2366
2367     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2368         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2369         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2370         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2371         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2372         for(i=2; i<18; i++){
2373             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2374             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2375         }
2376         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2377         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2378         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2379         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2380     }
2381 }
2382
2383 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2384     MpegEncContext * const s = &h->s;
2385     int temp8, i;
2386     uint64_t temp64;
2387     int deblock_left = (s->mb_x > 0);
2388     int deblock_top  = (s->mb_y > 1);
2389
2390     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2391
2392     src_y  -= 2 *   linesize + 1;
2393     src_cb -= 2 * uvlinesize + 1;
2394     src_cr -= 2 * uvlinesize + 1;
2395
2396 #define XCHG(a,b,t,xchg)\
2397 t= a;\
2398 if(xchg)\
2399     a= b;\
2400 b= t;
2401
2402     if(deblock_left){
2403         for(i = (!deblock_top)<<1; i<34; i++){
2404             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2405         }
2406     }
2407
2408     if(deblock_top){
2409         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2410         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2411         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2412         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2413         if(s->mb_x+1 < s->mb_width){
2414             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2415             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2416         }
2417     }
2418
2419     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2420         if(deblock_left){
2421             for(i = (!deblock_top) << 1; i<18; i++){
2422                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2423                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2424             }
2425         }
2426         if(deblock_top){
2427             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2428             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2429             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2430             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2431         }
2432     }
2433 }
2434
2435 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2436     MpegEncContext * const s = &h->s;
2437     const int mb_x= s->mb_x;
2438     const int mb_y= s->mb_y;
2439     const int mb_xy= h->mb_xy;
2440     const int mb_type= s->current_picture.mb_type[mb_xy];
2441     uint8_t  *dest_y, *dest_cb, *dest_cr;
2442     int linesize, uvlinesize /*dct_offset*/;
2443     int i;
2444     int *block_offset = &h->block_offset[0];
2445     const unsigned int bottom = mb_y & 1;
2446     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2447     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2448     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2449
2450     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2451     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2452     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2453
2454     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2455     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2456
2457     if (!simple && MB_FIELD) {
2458         linesize   = h->mb_linesize   = s->linesize * 2;
2459         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2460         block_offset = &h->block_offset[24];
2461         if(mb_y&1){ //FIXME move out of this func?
2462             dest_y -= s->linesize*15;
2463             dest_cb-= s->uvlinesize*7;
2464             dest_cr-= s->uvlinesize*7;
2465         }
2466         if(FRAME_MBAFF) {
2467             int list;
2468             for(list=0; list<h->list_count; list++){
2469                 if(!USES_LIST(mb_type, list))
2470                     continue;
2471                 if(IS_16X16(mb_type)){
2472                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2473                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2474                 }else{
2475                     for(i=0; i<16; i+=4){
2476                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2477                         int ref = h->ref_cache[list][scan8[i]];
2478                         if(ref >= 0)
2479                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2480                     }
2481                 }
2482             }
2483         }
2484     } else {
2485         linesize   = h->mb_linesize   = s->linesize;
2486         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2487 //        dct_offset = s->linesize * 16;
2488     }
2489
2490     if(transform_bypass){
2491         idct_dc_add =
2492         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2493     }else if(IS_8x8DCT(mb_type)){
2494         idct_dc_add = s->dsp.h264_idct8_dc_add;
2495         idct_add = s->dsp.h264_idct8_add;
2496     }else{
2497         idct_dc_add = s->dsp.h264_idct_dc_add;
2498         idct_add = s->dsp.h264_idct_add;
2499     }
2500
2501     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2502        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2503         int mbt_y = mb_y&~1;
2504         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2505         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2506         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2507         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2508     }
2509
2510     if (!simple && IS_INTRA_PCM(mb_type)) {
2511         unsigned int x, y;
2512
2513         // The pixels are stored in h->mb array in the same order as levels,
2514         // copy them in output in the correct order.
2515         for(i=0; i<16; i++) {
2516             for (y=0; y<4; y++) {
2517                 for (x=0; x<4; x++) {
2518                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2519                 }
2520             }
2521         }
2522         for(i=16; i<16+4; i++) {
2523             for (y=0; y<4; y++) {
2524                 for (x=0; x<4; x++) {
2525                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2526                 }
2527             }
2528         }
2529         for(i=20; i<20+4; i++) {
2530             for (y=0; y<4; y++) {
2531                 for (x=0; x<4; x++) {
2532                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2533                 }
2534             }
2535         }
2536     } else {
2537         if(IS_INTRA(mb_type)){
2538             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2539                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2540
2541             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2542                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2543                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2544             }
2545
2546             if(IS_INTRA4x4(mb_type)){
2547                 if(simple || !s->encoding){
2548                     if(IS_8x8DCT(mb_type)){
2549                         for(i=0; i<16; i+=4){
2550                             uint8_t * const ptr= dest_y + block_offset[i];
2551                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2552                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2553                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2554                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2555                             if(nnz){
2556                                 if(nnz == 1 && h->mb[i*16])
2557                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2558                                 else
2559                                     idct_add(ptr, h->mb + i*16, linesize);
2560                             }
2561                         }
2562                     }else
2563                     for(i=0; i<16; i++){
2564                         uint8_t * const ptr= dest_y + block_offset[i];
2565                         uint8_t *topright;
2566                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2567                         int nnz, tr;
2568
2569                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2570                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2571                             assert(mb_y || linesize <= block_offset[i]);
2572                             if(!topright_avail){
2573                                 tr= ptr[3 - linesize]*0x01010101;
2574                                 topright= (uint8_t*) &tr;
2575                             }else
2576                                 topright= ptr + 4 - linesize;
2577                         }else
2578                             topright= NULL;
2579
2580                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2581                         nnz = h->non_zero_count_cache[ scan8[i] ];
2582                         if(nnz){
2583                             if(is_h264){
2584                                 if(nnz == 1 && h->mb[i*16])
2585                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2586                                 else
2587                                     idct_add(ptr, h->mb + i*16, linesize);
2588                             }else
2589                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2590                         }
2591                     }
2592                 }
2593             }else{
2594                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2595                 if(is_h264){
2596                     if(!transform_bypass)
2597                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2598                 }else
2599                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2600             }
2601             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2602                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2603         }else if(is_h264){
2604             hl_motion(h, dest_y, dest_cb, dest_cr,
2605                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2606                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2607                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2608         }
2609
2610
2611         if(!IS_INTRA4x4(mb_type)){
2612             if(is_h264){
2613                 if(IS_INTRA16x16(mb_type)){
2614                     for(i=0; i<16; i++){
2615                         if(h->non_zero_count_cache[ scan8[i] ])
2616                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2617                         else if(h->mb[i*16])
2618                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2619                     }
2620                 }else{
2621                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2622                     for(i=0; i<16; i+=di){
2623                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2624                         if(nnz){
2625                             if(nnz==1 && h->mb[i*16])
2626                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2627                             else
2628                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2629                         }
2630                     }
2631                 }
2632             }else{
2633                 for(i=0; i<16; i++){
2634                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2635                         uint8_t * const ptr= dest_y + block_offset[i];
2636                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2637                     }
2638                 }
2639             }
2640         }
2641
2642         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2643             uint8_t *dest[2] = {dest_cb, dest_cr};
2644             if(transform_bypass){
2645                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2646             }else{
2647                 idct_add = s->dsp.h264_idct_add;
2648                 idct_dc_add = s->dsp.h264_idct_dc_add;
2649                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2650                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2651             }
2652             if(is_h264){
2653                 for(i=16; i<16+8; i++){
2654                     if(h->non_zero_count_cache[ scan8[i] ])
2655                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2656                     else if(h->mb[i*16])
2657                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2658                 }
2659             }else{
2660                 for(i=16; i<16+8; i++){
2661                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2662                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2663                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2664                     }
2665                 }
2666             }
2667         }
2668     }
2669     if(h->deblocking_filter) {
2670         if (!simple && FRAME_MBAFF) {
2671             //FIXME try deblocking one mb at a time?
2672             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2673             const int mb_y = s->mb_y - 1;
2674             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2675             const int mb_xy= mb_x + mb_y*s->mb_stride;
2676             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2677             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2678             if (!bottom) return;
2679             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2680             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2681             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2682
2683             if(IS_INTRA(mb_type_top | mb_type_bottom))
2684                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2685
2686             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2687             // deblock a pair
2688             // top
2689             s->mb_y--; h->mb_xy -= s->mb_stride;
2690             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2691             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2692             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2693             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2694             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2695             // bottom
2696             s->mb_y++; h->mb_xy += s->mb_stride;
2697             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2698             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2699             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2700             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2701             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2702         } else {
2703             tprintf(h->s.avctx, "call filter_mb\n");
2704             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2705             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2706             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2707         }
2708     }
2709 }
2710
2711 /**
2712  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2713  */
2714 static void hl_decode_mb_simple(H264Context *h){
2715     hl_decode_mb_internal(h, 1);
2716 }
2717
2718 /**
2719  * Process a macroblock; this handles edge cases, such as interlacing.
2720  */
2721 static void av_noinline hl_decode_mb_complex(H264Context *h){
2722     hl_decode_mb_internal(h, 0);
2723 }
2724
2725 static void hl_decode_mb(H264Context *h){
2726     MpegEncContext * const s = &h->s;
2727     const int mb_xy= h->mb_xy;
2728     const int mb_type= s->current_picture.mb_type[mb_xy];
2729     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2730                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2731
2732     if(ENABLE_H264_ENCODER && !s->decode)
2733         return;
2734
2735     if (is_complex)
2736         hl_decode_mb_complex(h);
2737     else hl_decode_mb_simple(h);
2738 }
2739
2740 static void pic_as_field(Picture *pic, const int parity){
2741     int i;
2742     for (i = 0; i < 4; ++i) {
2743         if (parity == PICT_BOTTOM_FIELD)
2744             pic->data[i] += pic->linesize[i];
2745         pic->reference = parity;
2746         pic->linesize[i] *= 2;
2747     }
2748 }
2749
2750 static int split_field_copy(Picture *dest, Picture *src,
2751                             int parity, int id_add){
2752     int match = !!(src->reference & parity);
2753
2754     if (match) {
2755         *dest = *src;
2756         pic_as_field(dest, parity);
2757         dest->pic_id *= 2;
2758         dest->pic_id += id_add;
2759     }
2760
2761     return match;
2762 }
2763
2764 /**
2765  * Split one reference list into field parts, interleaving by parity
2766  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2767  * set to look at the actual start of data for that field.
2768  *
2769  * @param dest output list
2770  * @param dest_len maximum number of fields to put in dest
2771  * @param src the source reference list containing fields and/or field pairs
2772  *            (aka short_ref/long_ref, or
2773  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2774  * @param src_len number of Picture's in source (pairs and unmatched fields)
2775  * @param parity the parity of the picture being decoded/needing
2776  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2777  * @return number of fields placed in dest
2778  */
2779 static int split_field_half_ref_list(Picture *dest, int dest_len,
2780                                      Picture *src,  int src_len,  int parity){
2781     int same_parity   = 1;
2782     int same_i        = 0;
2783     int opp_i         = 0;
2784     int out_i;
2785     int field_output;
2786
2787     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2788         if (same_parity && same_i < src_len) {
2789             field_output = split_field_copy(dest + out_i, src + same_i,
2790                                             parity, 1);
2791             same_parity = !field_output;
2792             same_i++;
2793
2794         } else if (opp_i < src_len) {
2795             field_output = split_field_copy(dest + out_i, src + opp_i,
2796                                             PICT_FRAME - parity, 0);
2797             same_parity = field_output;
2798             opp_i++;
2799
2800         } else {
2801             break;
2802         }
2803     }
2804
2805     return out_i;
2806 }
2807
2808 /**
2809  * Split the reference frame list into a reference field list.
2810  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2811  * The input list contains both reference field pairs and
2812  * unmatched reference fields; it is ordered as spec describes
2813  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2814  * unmatched field pairs are also present. Conceptually this is equivalent
2815  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2816  *
2817  * @param dest output reference list where ordered fields are to be placed
2818  * @param dest_len max number of fields to place at dest
2819  * @param src source reference list, as described above
2820  * @param src_len number of pictures (pairs and unmatched fields) in src
2821  * @param parity parity of field being currently decoded
2822  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2823  * @param long_i index into src array that holds first long reference picture,
2824  *        or src_len if no long refs present.
2825  */
2826 static int split_field_ref_list(Picture *dest, int dest_len,
2827                                 Picture *src,  int src_len,
2828                                 int parity,    int long_i){
2829
2830     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2831     dest += i;
2832     dest_len -= i;
2833
2834     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2835                                    src_len - long_i, parity);
2836     return i;
2837 }
2838
2839 /**
2840  * fills the default_ref_list.
2841  */
2842 static int fill_default_ref_list(H264Context *h){
2843     MpegEncContext * const s = &h->s;
2844     int i;
2845     int smallest_poc_greater_than_current = -1;
2846     int structure_sel;
2847     Picture sorted_short_ref[32];
2848     Picture field_entry_list[2][32];
2849     Picture *frame_list[2];
2850
2851     if (FIELD_PICTURE) {
2852         structure_sel = PICT_FRAME;
2853         frame_list[0] = field_entry_list[0];
2854         frame_list[1] = field_entry_list[1];
2855     } else {
2856         structure_sel = 0;
2857         frame_list[0] = h->default_ref_list[0];
2858         frame_list[1] = h->default_ref_list[1];
2859     }
2860
2861     if(h->slice_type==FF_B_TYPE){
2862         int list;
2863         int len[2];
2864         int short_len[2];
2865         int out_i;
2866         int limit= INT_MIN;
2867
2868         /* sort frame according to poc in B slice */
2869         for(out_i=0; out_i<h->short_ref_count; out_i++){
2870             int best_i=INT_MIN;
2871             int best_poc=INT_MAX;
2872
2873             for(i=0; i<h->short_ref_count; i++){
2874                 const int poc= h->short_ref[i]->poc;
2875                 if(poc > limit && poc < best_poc){
2876                     best_poc= poc;
2877                     best_i= i;
2878                 }
2879             }
2880
2881             assert(best_i != INT_MIN);
2882
2883             limit= best_poc;
2884             sorted_short_ref[out_i]= *h->short_ref[best_i];
2885             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2886             if (-1 == smallest_poc_greater_than_current) {
2887                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2888                     smallest_poc_greater_than_current = out_i;
2889                 }
2890             }
2891         }
2892
2893         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2894
2895         // find the largest poc
2896         for(list=0; list<2; list++){
2897             int index = 0;
2898             int j= -99;
2899             int step= list ? -1 : 1;
2900
2901             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2902                 int sel;
2903                 while(j<0 || j>= h->short_ref_count){
2904                     if(j != -99 && step == (list ? -1 : 1))
2905                         return -1;
2906                     step = -step;
2907                     j= smallest_poc_greater_than_current + (step>>1);
2908                 }
2909                 sel = sorted_short_ref[j].reference | structure_sel;
2910                 if(sel != PICT_FRAME) continue;
2911                 frame_list[list][index  ]= sorted_short_ref[j];
2912                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2913             }
2914             short_len[list] = index;
2915
2916             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2917                 int sel;
2918                 if(h->long_ref[i] == NULL) continue;
2919                 sel = h->long_ref[i]->reference | structure_sel;
2920                 if(sel != PICT_FRAME) continue;
2921
2922                 frame_list[ list ][index  ]= *h->long_ref[i];
2923                 frame_list[ list ][index++].pic_id= i;
2924             }
2925             len[list] = index;
2926         }
2927
2928         for(list=0; list<2; list++){
2929             if (FIELD_PICTURE)
2930                 len[list] = split_field_ref_list(h->default_ref_list[list],
2931                                                  h->ref_count[list],
2932                                                  frame_list[list],
2933                                                  len[list],
2934                                                  s->picture_structure,
2935                                                  short_len[list]);
2936
2937             // swap the two first elements of L1 when L0 and L1 are identical
2938             if(list && len[0] > 1 && len[0] == len[1])
2939                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2940                     if(i == len[0]){
2941                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2942                         break;
2943                     }
2944
2945             if(len[list] < h->ref_count[ list ])
2946                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2947         }
2948
2949
2950     }else{
2951         int index=0;
2952         int short_len;
2953         for(i=0; i<h->short_ref_count; i++){
2954             int sel;
2955             sel = h->short_ref[i]->reference | structure_sel;
2956             if(sel != PICT_FRAME) continue;
2957             frame_list[0][index  ]= *h->short_ref[i];
2958             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2959         }
2960         short_len = index;
2961         for(i = 0; i < 16; i++){
2962             int sel;
2963             if(h->long_ref[i] == NULL) continue;
2964             sel = h->long_ref[i]->reference | structure_sel;
2965             if(sel != PICT_FRAME) continue;
2966             frame_list[0][index  ]= *h->long_ref[i];
2967             frame_list[0][index++].pic_id= i;
2968         }
2969
2970         if (FIELD_PICTURE)
2971             index = split_field_ref_list(h->default_ref_list[0],
2972                                          h->ref_count[0], frame_list[0],
2973                                          index, s->picture_structure,
2974                                          short_len);
2975
2976         if(index < h->ref_count[0])
2977             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2978     }
2979 #ifdef TRACE
2980     for (i=0; i<h->ref_count[0]; i++) {
2981         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2982     }
2983     if(h->slice_type==FF_B_TYPE){
2984         for (i=0; i<h->ref_count[1]; i++) {
2985             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2986         }
2987     }
2988 #endif
2989     return 0;
2990 }
2991
2992 static void print_short_term(H264Context *h);
2993 static void print_long_term(H264Context *h);
2994
2995 /**
2996  * Extract structure information about the picture described by pic_num in
2997  * the current decoding context (frame or field). Note that pic_num is
2998  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2999  * @param pic_num picture number for which to extract structure information
3000  * @param structure one of PICT_XXX describing structure of picture
3001  *                      with pic_num
3002  * @return frame number (short term) or long term index of picture
3003  *         described by pic_num
3004  */
3005 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3006     MpegEncContext * const s = &h->s;
3007
3008     *structure = s->picture_structure;
3009     if(FIELD_PICTURE){
3010         if (!(pic_num & 1))
3011             /* opposite field */
3012             *structure ^= PICT_FRAME;
3013         pic_num >>= 1;
3014     }
3015
3016     return pic_num;
3017 }
3018
/**
 * Build h->ref_list from h->default_ref_list and apply the reference
 * picture list reordering commands read from the bitstream.
 *
 * Does nothing for I and SI slices. For every list in use the default list
 * is copied and, if the reordering flag read from the bitstream is set,
 * commands are read until reordering_of_pic_nums_idc equals 3. Entries that
 * end up without picture data are replaced by the current picture.
 * Finally the direct mode tables are set up.
 *
 * @return 0 on success, -1 when the bitstream contains more commands than
 *         ref_count entries, an abs_diff_pic_num above max_pic_num, a long
 *         term index above 31, or an unknown command
 */
static int decode_ref_pic_list_reordering(H264Context *h){
    MpegEncContext * const s = &h->s;
    int list, index, pic_structure;

    print_short_term(h);
    print_long_term(h);
    if(h->slice_type==FF_I_TYPE || h->slice_type==FF_SI_TYPE) return 0; //FIXME move before func

    for(list=0; list<h->list_count; list++){
        /* start from the default list */
        memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);

        /* one flag bit per list says whether reordering commands follow */
        if(get_bits1(&s->gb)){
            /* running picture number prediction, updated by each
             * short term command */
            int pred= h->curr_pic_num;

            for(index=0; ; index++){
                unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
                unsigned int pic_id;
                int i;
                Picture *ref = NULL;

                /* 3 terminates the command list */
                if(reordering_of_pic_nums_idc==3)
                    break;

                if(index >= h->ref_count[list]){
                    av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
                    return -1;
                }

                if(reordering_of_pic_nums_idc<3){
                    if(reordering_of_pic_nums_idc<2){
                        /* short term: 0 subtracts, 1 adds the difference
                         * to the prediction */
                        const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
                        int frame_num;

                        if(abs_diff_pic_num > h->max_pic_num){
                            av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
                            return -1;
                        }

                        if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
                        else                                pred+= abs_diff_pic_num;
                        /* wrap into 0..max_pic_num-1; the mask presumes
                         * max_pic_num is a power of two -- TODO confirm */
                        pred &= h->max_pic_num - 1;

                        frame_num = pic_num_extract(h, pred, &pic_structure);

                        /* search short_ref from its last entry backwards;
                         * i stays >= 0 only if a match was found */
                        for(i= h->short_ref_count-1; i>=0; i--){
                            ref = h->short_ref[i];
                            assert(ref->reference);
                            assert(!ref->long_ref);
                            if(ref->data[0] != NULL &&
                                   ref->frame_num == frame_num &&
                                   (ref->reference & pic_structure) &&
                                   ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
                                break;
                        }
                        if(i>=0)
                            ref->pic_id= pred;
                    }else{
                        /* long term: the picture is addressed directly */
                        int long_idx;
                        pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx

                        long_idx= pic_num_extract(h, pic_id, &pic_structure);

                        if(long_idx>31){
                            av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
                            return -1;
                        }
                        ref = h->long_ref[long_idx];
                        assert(!(ref && !ref->reference));
                        if(ref && (ref->reference & pic_structure)){
                            ref->pic_id= pic_id;
                            assert(ref->long_ref);
                            i=0;
                        }else{
                            i=-1;
                        }
                    }

                    /* from here on i < 0 means "picture not found" */
                    if (i < 0) {
                        av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
                        memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
                    } else {
                        /* locate the picture further down the list (or stop
                         * at the last entry), shift the entries in between
                         * down by one and put the picture at index */
                        for(i=index; i+1<h->ref_count[list]; i++){
                            if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
                                break;
                        }
                        for(; i > index; i--){
                            h->ref_list[list][i]= h->ref_list[list][i-1];
                        }
                        h->ref_list[list][index]= *ref;
                        if (FIELD_PICTURE){
                            /* *ref is a frame entry, make the copy a field */
                            pic_as_field(&h->ref_list[list][index], pic_structure);
                        }
                    }
                }else{
                    av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
                    return -1;
                }
            }
        }
    }
    /* entries without data (missing references) fall back to the
     * current picture */
    for(list=0; list<h->list_count; list++){
        for(index= 0; index < h->ref_count[list]; index++){
            if(!h->ref_list[list][index].data[0])
                h->ref_list[list][index]= s->current_picture;
        }
    }

    if(h->slice_type==FF_B_TYPE && !h->direct_spatial_mv_pred)
        direct_dist_scale_factor(h);
    direct_ref_list_init(h);
    return 0;
}
3131
3132 static void fill_mbaff_ref_list(H264Context *h){
3133     int list, i, j;
3134     for(list=0; list<2; list++){ //FIXME try list_count
3135         for(i=0; i<h->ref_count[list]; i++){
3136             Picture *frame = &h->ref_list[list][i];
3137             Picture *field = &h->ref_list[list][16+2*i];
3138             field[0] = *frame;
3139             for(j=0; j<3; j++)
3140                 field[0].linesize[j] <<= 1;
3141             field[0].reference = PICT_TOP_FIELD;
3142             field[1] = field[0];
3143             for(j=0; j<3; j++)
3144                 field[1].data[j] += frame->linesize[j];
3145             field[1].reference = PICT_BOTTOM_FIELD;
3146
3147             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3148             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3149             for(j=0; j<2; j++){
3150                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3151                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3152             }
3153         }
3154     }
3155     for(j=0; j<h->ref_count[1]; j++){
3156         for(i=0; i<h->ref_count[0]; i++)
3157             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3158         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3159         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3160     }
3161 }
3162
3163 static int pred_weight_table(H264Context *h){
3164     MpegEncContext * const s = &h->s;
3165     int list, i;
3166     int luma_def, chroma_def;
3167
3168     h->use_weight= 0;
3169     h->use_weight_chroma= 0;
3170     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3171     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3172     luma_def = 1<<h->luma_log2_weight_denom;
3173     chroma_def = 1<<h->chroma_log2_weight_denom;
3174
3175     for(list=0; list<2; list++){
3176         for(i=0; i<h->ref_count[list]; i++){
3177             int luma_weight_flag, chroma_weight_flag;
3178
3179             luma_weight_flag= get_bits1(&s->gb);
3180             if(luma_weight_flag){
3181                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3182                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3183                 if(   h->luma_weight[list][i] != luma_def
3184                    || h->luma_offset[list][i] != 0)
3185                     h->use_weight= 1;
3186             }else{
3187                 h->luma_weight[list][i]= luma_def;
3188                 h->luma_offset[list][i]= 0;
3189             }
3190
3191             chroma_weight_flag= get_bits1(&s->gb);
3192             if(chroma_weight_flag){
3193                 int j;
3194                 for(j=0; j<2; j++){
3195                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3196                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3197                     if(   h->chroma_weight[list][i][j] != chroma_def
3198                        || h->chroma_offset[list][i][j] != 0)
3199                         h->use_weight_chroma= 1;
3200                 }
3201             }else{
3202                 int j;
3203                 for(j=0; j<2; j++){
3204                     h->chroma_weight[list][i][j]= chroma_def;
3205                     h->chroma_offset[list][i][j]= 0;
3206                 }
3207             }
3208         }
3209         if(h->slice_type != FF_B_TYPE) break;
3210     }
3211     h->use_weight= h->use_weight || h->use_weight_chroma;
3212     return 0;
3213 }
3214
3215 static void implicit_weight_table(H264Context *h){
3216     MpegEncContext * const s = &h->s;
3217     int ref0, ref1;
3218     int cur_poc = s->current_picture_ptr->poc;
3219
3220     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3221        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3222         h->use_weight= 0;
3223         h->use_weight_chroma= 0;
3224         return;
3225     }
3226
3227     h->use_weight= 2;
3228     h->use_weight_chroma= 2;
3229     h->luma_log2_weight_denom= 5;
3230     h->chroma_log2_weight_denom= 5;
3231
3232     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3233         int poc0 = h->ref_list[0][ref0].poc;
3234         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3235             int poc1 = h->ref_list[1][ref1].poc;
3236             int td = av_clip(poc1 - poc0, -128, 127);
3237             if(td){
3238                 int tb = av_clip(cur_poc - poc0, -128, 127);
3239                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3240                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3241                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3242                     h->implicit_weight[ref0][ref1] = 32;
3243                 else
3244                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3245             }else
3246                 h->implicit_weight[ref0][ref1] = 32;
3247         }
3248     }
3249 }
3250
3251 /**
3252  * Mark a picture as no longer needed for reference. The refmask
3253  * argument allows unreferencing of individual fields or the whole frame.
3254  * If the picture becomes entirely unreferenced, but is being held for
3255  * display purposes, it is marked as such.
3256  * @param refmask mask of fields to unreference; the mask is bitwise
3257  *                anded with the reference marking of pic
3258  * @return non-zero if pic becomes entirely unreferenced (except possibly
3259  *         for display purposes) zero if one of the fields remains in
3260  *         reference
3261  */
3262 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3263     int i;
3264     if (pic->reference &= refmask) {
3265         return 0;
3266     } else {
3267         if(pic == h->delayed_output_pic)
3268             pic->reference=DELAYED_PIC_REF;
3269         else{
3270             for(i = 0; h->delayed_pic[i]; i++)
3271                 if(pic == h->delayed_pic[i]){
3272                     pic->reference=DELAYED_PIC_REF;
3273                     break;
3274                 }
3275         }
3276         return 1;
3277     }
3278 }
3279
3280 /**
3281  * instantaneous decoder refresh.
3282  */
3283 static void idr(H264Context *h){
3284     int i;
3285
3286     for(i=0; i<16; i++){
3287         if (h->long_ref[i] != NULL) {
3288             unreference_pic(h, h->long_ref[i], 0);
3289             h->long_ref[i]= NULL;
3290         }
3291     }
3292     h->long_ref_count=0;
3293
3294     for(i=0; i<h->short_ref_count; i++){
3295         unreference_pic(h, h->short_ref[i], 0);
3296         h->short_ref[i]= NULL;
3297     }
3298     h->short_ref_count=0;
3299 }
3300
3301 /* forget old pics after a seek */
3302 static void flush_dpb(AVCodecContext *avctx){
3303     H264Context *h= avctx->priv_data;
3304     int i;
3305     for(i=0; i<16; i++) {
3306         if(h->delayed_pic[i])
3307             h->delayed_pic[i]->reference= 0;
3308         h->delayed_pic[i]= NULL;
3309     }
3310     if(h->delayed_output_pic)
3311         h->delayed_output_pic->reference= 0;
3312     h->delayed_output_pic= NULL;
3313     idr(h);
3314     if(h->s.current_picture_ptr)
3315         h->s.current_picture_ptr->reference= 0;
3316     h->s.first_field= 0;
3317     ff_mpeg_flush(avctx);
3318 }
3319
3320 /**
3321  * Find a Picture in the short term reference list by frame number.
3322  * @param frame_num frame number to search for
3323  * @param idx the index into h->short_ref where returned picture is found
3324  *            undefined if no picture found.
3325  * @return pointer to the found picture, or NULL if no pic with the provided
3326  *                 frame number is found
3327  */
3328 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3329     MpegEncContext * const s = &h->s;
3330     int i;
3331
3332     for(i=0; i<h->short_ref_count; i++){
3333         Picture *pic= h->short_ref[i];
3334         if(s->avctx->debug&FF_DEBUG_MMCO)
3335             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3336         if(pic->frame_num == frame_num) {
3337             *idx = i;
3338             return pic;
3339         }
3340     }
3341     return NULL;
3342 }
3343
3344 /**
3345  * Remove a picture from the short term reference list by its index in
3346  * that list.  This does no checking on the provided index; it is assumed
3347  * to be valid. Other list entries are shifted down.
3348  * @param i index into h->short_ref of picture to remove.
3349  */
3350 static void remove_short_at_index(H264Context *h, int i){
3351     assert(i > 0 && i < h->short_ref_count);
3352     h->short_ref[i]= NULL;
3353     if (--h->short_ref_count)
3354         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3355 }
3356
3357 /**
3358  *
3359  * @return the removed picture or NULL if an error occurs
3360  */
3361 static Picture * remove_short(H264Context *h, int frame_num){
3362     MpegEncContext * const s = &h->s;
3363     Picture *pic;
3364     int i;
3365
3366     if(s->avctx->debug&FF_DEBUG_MMCO)
3367         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3368
3369     pic = find_short(h, frame_num, &i);
3370     if (pic)
3371         remove_short_at_index(h, i);
3372
3373     return pic;
3374 }
3375
3376 /**
3377  * Remove a picture from the long term reference list by its index in
3378  * that list.  This does no checking on the provided index; it is assumed
3379  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3380  * @param i index into h->long_ref of picture to remove.
3381  */
3382 static void remove_long_at_index(H264Context *h, int i){
3383     h->long_ref[i]= NULL;
3384     h->long_ref_count--;
3385 }
3386
3387 /**
3388  *
3389  * @return the removed picture or NULL if an error occurs
3390  */
3391 static Picture * remove_long(H264Context *h, int i){
3392     Picture *pic;
3393
3394     pic= h->long_ref[i];
3395     if (pic)
3396         remove_long_at_index(h, i);
3397
3398     return pic;
3399 }
3400
3401 /**
3402  * print short term list
3403  */
3404 static void print_short_term(H264Context *h) {
3405     uint32_t i;
3406     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3407         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3408         for(i=0; i<h->short_ref_count; i++){
3409             Picture *pic= h->short_ref[i];
3410             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3411         }
3412     }
3413 }
3414
3415 /**
3416  * print long term list
3417  */
3418 static void print_long_term(H264Context *h) {
3419     uint32_t i;
3420     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3421         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3422         for(i = 0; i < 16; i++){
3423             Picture *pic= h->long_ref[i];
3424             if (pic) {
3425                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3426             }
3427         }
3428     }
3429 }
3430
3431 /**
3432  * Executes the reference picture marking (memory management control operations).
3433  */
3434 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3435     MpegEncContext * const s = &h->s;
3436     int i, j;
3437     int current_ref_assigned=0;
3438     Picture *pic;
3439
3440     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3441         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3442
3443     for(i=0; i<mmco_count; i++){
3444         int structure, frame_num, unref_pic;
3445         if(s->avctx->debug&FF_DEBUG_MMCO)
3446             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3447
3448         switch(mmco[i].opcode){
3449         case MMCO_SHORT2UNUSED:
3450             if(s->avctx->debug&FF_DEBUG_MMCO)
3451                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3452             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3453             pic = find_short(h, frame_num, &j);
3454             if (pic) {
3455                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3456                     remove_short_at_index(h, j);
3457             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3458                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3459             break;
3460         case MMCO_SHORT2LONG:
3461             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3462                     h->long_ref[mmco[i].long_arg]->frame_num ==
3463                                               mmco[i].short_pic_num / 2) {
3464                 /* do nothing, we've already moved this field pair. */
3465             } else {
3466                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3467
3468                 pic= remove_long(h, mmco[i].long_arg);
3469                 if(pic) unreference_pic(h, pic, 0);
3470
3471                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3472                 if (h->long_ref[ mmco[i].long_arg ]){
3473                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3474                     h->long_ref_count++;
3475                 }
3476             }
3477             break;
3478         case MMCO_LONG2UNUSED:
3479             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3480             pic = h->long_ref[j];
3481             if (pic) {
3482                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3483                     remove_long_at_index(h, j);
3484             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3485                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3486             break;
3487         case MMCO_LONG:
3488             unref_pic = 1;
3489             if (FIELD_PICTURE && !s->first_field) {
3490                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3491                     /* Just mark second field as referenced */
3492                     unref_pic = 0;
3493                 } else if (s->current_picture_ptr->reference) {
3494                     /* First field in pair is in short term list or
3495                      * at a different long term index.
3496                      * This is not allowed; see 7.4.3, notes 2 and 3.
3497                      * Report the problem and keep the pair where it is,
3498                      * and mark this field valid.
3499                      */
3500                     av_log(h->s.avctx, AV_LOG_ERROR,
3501                         "illegal long term reference assignment for second "
3502                         "field in complementary field pair (first field is "
3503                         "short term or has non-matching long index)\n");
3504                     unref_pic = 0;
3505                 }
3506             }
3507
3508             if (unref_pic) {
3509                 pic= remove_long(h, mmco[i].long_arg);
3510                 if(pic) unreference_pic(h, pic, 0);
3511
3512                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3513                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3514                 h->long_ref_count++;
3515             }
3516
3517             s->current_picture_ptr->reference |= s->picture_structure;
3518             current_ref_assigned=1;
3519             break;
3520         case MMCO_SET_MAX_LONG:
3521             assert(mmco[i].long_arg <= 16);
3522             // just remove the long term which index is greater than new max
3523             for(j = mmco[i].long_arg; j<16; j++){
3524                 pic = remove_long(h, j);
3525                 if (pic) unreference_pic(h, pic, 0);
3526             }
3527             break;
3528         case MMCO_RESET:
3529             while(h->short_ref_count){
3530                 pic= remove_short(h, h->short_ref[0]->frame_num);
3531                 if(pic) unreference_pic(h, pic, 0);
3532             }
3533             for(j = 0; j < 16; j++) {
3534                 pic= remove_long(h, j);
3535                 if(pic) unreference_pic(h, pic, 0);
3536             }
3537             break;
3538         default: assert(0);
3539         }
3540     }
3541
3542     if (!current_ref_assigned && FIELD_PICTURE &&
3543             !s->first_field && s->current_picture_ptr->reference) {
3544
3545         /* Second field of complementary field pair; the first field of
3546          * which is already referenced. If short referenced, it
3547          * should be first entry in short_ref. If not, it must exist
3548          * in long_ref; trying to put it on the short list here is an
3549          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3550          */
3551         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3552             /* Just mark the second field valid */
3553             s->current_picture_ptr->reference = PICT_FRAME;
3554         } else if (s->current_picture_ptr->long_ref) {
3555             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3556                                              "assignment for second field "
3557                                              "in complementary field pair "
3558                                              "(first field is long term)\n");
3559         } else {
3560             /*
3561              * First field in reference, but not in any sensible place on our
3562              * reference lists. This shouldn't happen unless reference
3563              * handling somewhere else is wrong.
3564              */
3565             assert(0);
3566         }
3567         current_ref_assigned = 1;
3568     }
3569
3570     if(!current_ref_assigned){
3571         pic= remove_short(h, s->current_picture_ptr->frame_num);
3572         if(pic){
3573             unreference_pic(h, pic, 0);
3574             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3575         }
3576
3577         if(h->short_ref_count)
3578             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3579
3580         h->short_ref[0]= s->current_picture_ptr;
3581         h->short_ref[0]->long_ref=0;
3582         h->short_ref_count++;
3583         s->current_picture_ptr->reference |= s->picture_structure;
3584     }
3585
3586     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3587
3588         /* We have too many reference frames, probably due to corrupted
3589          * stream. Need to discard one frame. Prevents overrun of the
3590          * short_ref and long_ref buffers.
3591          */
3592         av_log(h->s.avctx, AV_LOG_ERROR,
3593                "number of reference frames exceeds max (probably "
3594                "corrupt input), discarding one\n");
3595
3596         if (h->long_ref_count) {
3597             for (i = 0; i < 16; ++i)
3598                 if (h->long_ref[i])
3599                     break;
3600
3601             assert(i < 16);
3602             pic = h->long_ref[i];
3603             remove_long_at_index(h, i);
3604         } else {
3605             pic = h->short_ref[h->short_ref_count - 1];
3606             remove_short_at_index(h, h->short_ref_count - 1);
3607         }
3608         unreference_pic(h, pic, 0);
3609     }
3610
3611     print_short_term(h);
3612     print_long_term(h);
3613     return 0;
3614 }
3615
3616 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3617     MpegEncContext * const s = &h->s;
3618     int i;
3619
3620     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3621         s->broken_link= get_bits1(gb) -1;
3622         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3623         if(h->mmco[0].long_arg == -1)
3624             h->mmco_index= 0;
3625         else{
3626             h->mmco[0].opcode= MMCO_LONG;
3627             h->mmco_index= 1;
3628         }
3629     }else{
3630         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3631             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3632                 MMCOOpcode opcode= get_ue_golomb(gb);
3633
3634                 h->mmco[i].opcode= opcode;
3635                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3636                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3637 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3638                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3639                         return -1;
3640                     }*/
3641                 }
3642                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3643                     unsigned int long_arg= get_ue_golomb(gb);
3644                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3645                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3646                         return -1;
3647                     }
3648                     h->mmco[i].long_arg= long_arg;
3649                 }
3650
3651                 if(opcode > (unsigned)MMCO_LONG){
3652                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3653                     return -1;
3654                 }
3655                 if(opcode == MMCO_END)
3656                     break;
3657             }
3658             h->mmco_index= i;
3659         }else{
3660             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3661
3662             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3663                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3664                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3665                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3666                 h->mmco_index= 1;
3667                 if (FIELD_PICTURE) {
3668                     h->mmco[0].short_pic_num *= 2;
3669                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3670                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3671                     h->mmco_index= 2;
3672                 }
3673             }else
3674                 h->mmco_index= 0;
3675         }
3676     }
3677
3678     return 0;
3679 }
3680
3681 static int init_poc(H264Context *h){
3682     MpegEncContext * const s = &h->s;
3683     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3684     int field_poc[2];
3685
3686     if(h->nal_unit_type == NAL_IDR_SLICE){
3687         h->frame_num_offset= 0;
3688     }else{
3689         if(h->frame_num < h->prev_frame_num)
3690             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3691         else
3692             h->frame_num_offset= h->prev_frame_num_offset;
3693     }
3694
3695     if(h->sps.poc_type==0){
3696         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3697
3698         if(h->nal_unit_type == NAL_IDR_SLICE){
3699              h->prev_poc_msb=
3700              h->prev_poc_lsb= 0;
3701         }
3702
3703         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3704             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3705         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3706             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3707         else
3708             h->poc_msb = h->prev_poc_msb;
3709 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3710         field_poc[0] =
3711         field_poc[1] = h->poc_msb + h->poc_lsb;
3712         if(s->picture_structure == PICT_FRAME)
3713             field_poc[1] += h->delta_poc_bottom;
3714     }else if(h->sps.poc_type==1){
3715         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3716         int i;
3717
3718         if(h->sps.poc_cycle_length != 0)
3719             abs_frame_num = h->frame_num_offset + h->frame_num;
3720         else
3721             abs_frame_num = 0;
3722
3723         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3724             abs_frame_num--;
3725
3726         expected_delta_per_poc_cycle = 0;
3727         for(i=0; i < h->sps.poc_cycle_length; i++)
3728             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3729
3730         if(abs_frame_num > 0){
3731             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3732             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3733
3734             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3735             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3736                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3737         } else
3738             expectedpoc = 0;
3739
3740         if(h->nal_ref_idc == 0)
3741             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3742
3743         field_poc[0] = expectedpoc + h->delta_poc[0];
3744         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3745
3746         if(s->picture_structure == PICT_FRAME)
3747             field_poc[1] += h->delta_poc[1];
3748     }else{
3749         int poc;
3750         if(h->nal_unit_type == NAL_IDR_SLICE){
3751             poc= 0;
3752         }else{
3753             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3754             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3755         }
3756         field_poc[0]= poc;
3757         field_poc[1]= poc;
3758     }
3759
3760     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3761         s->current_picture_ptr->field_poc[0]= field_poc[0];
3762         s->current_picture_ptr->poc = field_poc[0];
3763     }
3764     if(s->picture_structure != PICT_TOP_FIELD) {
3765         s->current_picture_ptr->field_poc[1]= field_poc[1];
3766         s->current_picture_ptr->poc = field_poc[1];
3767     }
3768     if(!FIELD_PICTURE || !s->first_field) {
3769         Picture *cur = s->current_picture_ptr;
3770         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3771     }
3772
3773     return 0;
3774 }
3775
3776
3777 /**
3778  * initialize scan tables
3779  */
3780 static void init_scan_tables(H264Context *h){
3781     MpegEncContext * const s = &h->s;
3782     int i;
3783     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3784         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3785         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3786     }else{
3787         for(i=0; i<16; i++){
3788 #define T(x) (x>>2) | ((x<<2) & 0xF)
3789             h->zigzag_scan[i] = T(zigzag_scan[i]);
3790             h-> field_scan[i] = T( field_scan[i]);
3791 #undef T
3792         }
3793     }
3794     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3795         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3796         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3797         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3798         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3799     }else{
3800         for(i=0; i<64; i++){
3801 #define T(x) (x>>3) | ((x&7)<<3)
3802             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3803             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3804             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3805             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3806 #undef T
3807         }
3808     }
3809     if(h->sps.transform_bypass){ //FIXME same ugly
3810         h->zigzag_scan_q0          = zigzag_scan;
3811         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3812         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3813         h->field_scan_q0           = field_scan;
3814         h->field_scan8x8_q0        = field_scan8x8;
3815         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3816     }else{
3817         h->zigzag_scan_q0          = h->zigzag_scan;
3818         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3819         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3820         h->field_scan_q0           = h->field_scan;
3821         h->field_scan8x8_q0        = h->field_scan8x8;
3822         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3823     }
3824 }
3825
3826 /**
3827  * Replicates H264 "master" context to thread contexts.
3828  */
3829 static void clone_slice(H264Context *dst, H264Context *src)
3830 {
3831     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3832     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3833     dst->s.current_picture      = src->s.current_picture;
3834     dst->s.linesize             = src->s.linesize;
3835     dst->s.uvlinesize           = src->s.uvlinesize;
3836     dst->s.first_field          = src->s.first_field;
3837
3838     dst->prev_poc_msb           = src->prev_poc_msb;
3839     dst->prev_poc_lsb           = src->prev_poc_lsb;
3840     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3841     dst->prev_frame_num         = src->prev_frame_num;
3842     dst->short_ref_count        = src->short_ref_count;
3843
3844     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3845     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3846     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3847     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3848
3849     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3850     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3851 }
3852
3853 /**
3854  * decodes a slice header.
3855  * This will also call MPV_common_init() and frame_start() as needed.
3856  *
3857  * @param h h264context
3858  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3859  *
3860  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3861  */
3862 static int decode_slice_header(H264Context *h, H264Context *h0){
3863     MpegEncContext * const s = &h->s;
3864     MpegEncContext * const s0 = &h0->s;
3865     unsigned int first_mb_in_slice;
3866     unsigned int pps_id;
3867     int num_ref_idx_active_override_flag;
3868     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3869     unsigned int slice_type, tmp, i;
3870     int default_ref_list_done = 0;
3871     int last_pic_structure;
3872
3873     s->dropable= h->nal_ref_idc == 0;
3874
3875     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3876         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3877         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3878     }else{
3879         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3880         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3881     }
3882
3883     first_mb_in_slice= get_ue_golomb(&s->gb);
3884
3885     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3886         h0->current_slice = 0;
3887         if (!s0->first_field)
3888             s->current_picture_ptr= NULL;
3889     }
3890
3891     slice_type= get_ue_golomb(&s->gb);
3892     if(slice_type > 9){
3893         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3894         return -1;
3895     }
3896     if(slice_type > 4){
3897         slice_type -= 5;
3898         h->slice_type_fixed=1;
3899     }else
3900         h->slice_type_fixed=0;
3901
3902     slice_type= slice_type_map[ slice_type ];
3903     if (slice_type == FF_I_TYPE
3904         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3905         default_ref_list_done = 1;
3906     }
3907     h->slice_type= slice_type;
3908
3909     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3910     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3911         av_log(h->s.avctx, AV_LOG_ERROR,
3912                "B picture before any references, skipping\n");
3913         return -1;
3914     }
3915
3916     pps_id= get_ue_golomb(&s->gb);
3917     if(pps_id>=MAX_PPS_COUNT){
3918         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3919         return -1;
3920     }
3921     if(!h0->pps_buffers[pps_id]) {
3922         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3923         return -1;
3924     }
3925     h->pps= *h0->pps_buffers[pps_id];
3926
3927     if(!h0->sps_buffers[h->pps.sps_id]) {
3928         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3929         return -1;
3930     }
3931     h->sps = *h0->sps_buffers[h->pps.sps_id];
3932
3933     if(h == h0 && h->dequant_coeff_pps != pps_id){
3934         h->dequant_coeff_pps = pps_id;
3935         init_dequant_tables(h);
3936     }
3937
3938     s->mb_width= h->sps.mb_width;
3939     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3940
3941     h->b_stride=  s->mb_width*4;
3942     h->b8_stride= s->mb_width*2;
3943
3944     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3945     if(h->sps.frame_mbs_only_flag)
3946         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3947     else
3948         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3949
3950     if (s->context_initialized
3951         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3952         if(h != h0)
3953             return -1;   // width / height changed during parallelized decoding
3954         free_tables(h);
3955         MPV_common_end(s);
3956     }
3957     if (!s->context_initialized) {
3958         if(h != h0)
3959             return -1;  // we cant (re-)initialize context during parallel decoding
3960         if (MPV_common_init(s) < 0)
3961             return -1;
3962         s->first_field = 0;
3963
3964         init_scan_tables(h);
3965         alloc_tables(h);
3966
3967         for(i = 1; i < s->avctx->thread_count; i++) {
3968             H264Context *c;
3969             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3970             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3971             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3972             c->sps = h->sps;
3973             c->pps = h->pps;
3974             init_scan_tables(c);
3975             clone_tables(c, h);
3976         }
3977
3978         for(i = 0; i < s->avctx->thread_count; i++)
3979             if(context_init(h->thread_context[i]) < 0)
3980                 return -1;
3981
3982         s->avctx->width = s->width;
3983         s->avctx->height = s->height;
3984         s->avctx->sample_aspect_ratio= h->sps.sar;
3985         if(!s->avctx->sample_aspect_ratio.den)
3986             s->avctx->sample_aspect_ratio.den = 1;
3987
3988         if(h->sps.timing_info_present_flag){
3989             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3990             if(h->x264_build > 0 && h->x264_build < 44)
3991                 s->avctx->time_base.den *= 2;
3992             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3993                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3994         }
3995     }
3996
3997     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3998
3999     h->mb_mbaff = 0;
4000     h->mb_aff_frame = 0;
4001     last_pic_structure = s0->picture_structure;
4002     if(h->sps.frame_mbs_only_flag){
4003         s->picture_structure= PICT_FRAME;
4004     }else{
4005         if(get_bits1(&s->gb)) { //field_pic_flag
4006             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4007         } else {
4008             s->picture_structure= PICT_FRAME;
4009             h->mb_aff_frame = h->sps.mb_aff;
4010         }
4011     }
4012
4013     if(h0->current_slice == 0){
4014         /* See if we have a decoded first field looking for a pair... */
4015         if (s0->first_field) {
4016             assert(s0->current_picture_ptr);
4017             assert(s0->current_picture_ptr->data[0]);
4018             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4019
4020             /* figure out if we have a complementary field pair */
4021             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4022                 /*
4023                  * Previous field is unmatched. Don't display it, but let it
4024                  * remain for reference if marked as such.
4025                  */
4026                 s0->current_picture_ptr = NULL;
4027                 s0->first_field = FIELD_PICTURE;
4028
4029             } else {
4030                 if (h->nal_ref_idc &&
4031                         s0->current_picture_ptr->reference &&
4032                         s0->current_picture_ptr->frame_num != h->frame_num) {
4033                     /*
4034                      * This and previous field were reference, but had
4035                      * different frame_nums. Consider this field first in
4036                      * pair. Throw away previous field except for reference
4037                      * purposes.
4038                      */
4039                     s0->first_field = 1;
4040                     s0->current_picture_ptr = NULL;
4041
4042                 } else {
4043                     /* Second field in complementary pair */
4044                     s0->first_field = 0;
4045                 }
4046             }
4047
4048         } else {
4049             /* Frame or first field in a potentially complementary pair */
4050             assert(!s0->current_picture_ptr);
4051             s0->first_field = FIELD_PICTURE;
4052         }
4053
4054         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4055             s0->first_field = 0;
4056             return -1;
4057         }
4058     }
4059     if(h != h0)
4060         clone_slice(h, h0);
4061
4062     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4063
4064     assert(s->mb_num == s->mb_width * s->mb_height);
4065     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4066        first_mb_in_slice                    >= s->mb_num){
4067         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4068         return -1;
4069     }
4070     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4071     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4072     if (s->picture_structure == PICT_BOTTOM_FIELD)
4073         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4074     assert(s->mb_y < s->mb_height);
4075
4076     if(s->picture_structure==PICT_FRAME){
4077         h->curr_pic_num=   h->frame_num;
4078         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4079     }else{
4080         h->curr_pic_num= 2*h->frame_num + 1;
4081         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4082     }
4083
4084     if(h->nal_unit_type == NAL_IDR_SLICE){
4085         get_ue_golomb(&s->gb); /* idr_pic_id */
4086     }
4087
4088     if(h->sps.poc_type==0){
4089         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4090
4091         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4092             h->delta_poc_bottom= get_se_golomb(&s->gb);
4093         }
4094     }
4095
4096     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4097         h->delta_poc[0]= get_se_golomb(&s->gb);
4098
4099         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4100             h->delta_poc[1]= get_se_golomb(&s->gb);
4101     }
4102
4103     init_poc(h);
4104
4105     if(h->pps.redundant_pic_cnt_present){
4106         h->redundant_pic_count= get_ue_golomb(&s->gb);
4107     }
4108
4109     //set defaults, might be overriden a few line later
4110     h->ref_count[0]= h->pps.ref_count[0];
4111     h->ref_count[1]= h->pps.ref_count[1];
4112
4113     if(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE || h->slice_type == FF_B_TYPE){
4114         if(h->slice_type == FF_B_TYPE){
4115             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4116             if(FIELD_PICTURE && h->direct_spatial_mv_pred)
4117                 av_log(h->s.avctx, AV_LOG_ERROR, "PAFF + spatial direct mode is not implemented\n");
4118         }
4119         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4120
4121         if(num_ref_idx_active_override_flag){
4122             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4123             if(h->slice_type==FF_B_TYPE)
4124                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4125
4126             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4127                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4128                 h->ref_count[0]= h->ref_count[1]= 1;
4129                 return -1;
4130             }
4131         }
4132         if(h->slice_type == FF_B_TYPE)
4133             h->list_count= 2;
4134         else
4135             h->list_count= 1;
4136     }else
4137         h->list_count= 0;
4138
4139     if(!default_ref_list_done){
4140         fill_default_ref_list(h);
4141     }
4142
4143     if(decode_ref_pic_list_reordering(h) < 0)
4144         return -1;
4145
4146     if(   (h->pps.weighted_pred          && (h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE ))
4147        || (h->pps.weighted_bipred_idc==1 && h->slice_type==FF_B_TYPE ) )
4148         pred_weight_table(h);
4149     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==FF_B_TYPE)
4150         implicit_weight_table(h);
4151     else
4152         h->use_weight = 0;
4153
4154     if(h->nal_ref_idc)
4155         decode_ref_pic_marking(h0, &s->gb);
4156
4157     if(FRAME_MBAFF)
4158         fill_mbaff_ref_list(h);
4159
4160     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE && h->pps.cabac ){
4161         tmp = get_ue_golomb(&s->gb);
4162         if(tmp > 2){
4163             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4164             return -1;
4165         }
4166         h->cabac_init_idc= tmp;
4167     }
4168
4169     h->last_qscale_diff = 0;
4170     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4171     if(tmp>51){
4172         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4173         return -1;
4174     }
4175     s->qscale= tmp;
4176     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4177     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4178     //FIXME qscale / qp ... stuff
4179     if(h->slice_type == FF_SP_TYPE){
4180         get_bits1(&s->gb); /* sp_for_switch_flag */
4181     }
4182     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4183         get_se_golomb(&s->gb); /* slice_qs_delta */
4184     }
4185
4186     h->deblocking_filter = 1;
4187     h->slice_alpha_c0_offset = 0;
4188     h->slice_beta_offset = 0;
4189     if( h->pps.deblocking_filter_parameters_present ) {
4190         tmp= get_ue_golomb(&s->gb);
4191         if(tmp > 2){
4192             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4193             return -1;
4194         }
4195         h->deblocking_filter= tmp;
4196         if(h->deblocking_filter < 2)
4197             h->deblocking_filter^= 1; // 1<->0
4198
4199         if( h->deblocking_filter ) {
4200             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4201             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4202         }
4203     }
4204
4205     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4206        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != FF_I_TYPE)
4207        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == FF_B_TYPE)
4208        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4209         h->deblocking_filter= 0;
4210
4211     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4212         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4213             /* Cheat slightly for speed:
4214                Do not bother to deblock across slices. */
4215             h->deblocking_filter = 2;
4216         } else {
4217             h0->max_contexts = 1;
4218             if(!h0->single_decode_warning) {
4219                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4220                 h0->single_decode_warning = 1;
4221             }
4222             if(h != h0)
4223                 return 1; // deblocking switched inside frame
4224         }
4225     }
4226
4227 #if 0 //FMO
4228     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4229         slice_group_change_cycle= get_bits(&s->gb, ?);
4230 #endif
4231
4232     h0->last_slice_type = slice_type;
4233     h->slice_num = ++h0->current_slice;
4234
4235     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4236     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4237
4238     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4239         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4240                h->slice_num,
4241                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4242                first_mb_in_slice,
4243                av_get_pict_type_char(h->slice_type),
4244                pps_id, h->frame_num,
4245                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4246                h->ref_count[0], h->ref_count[1],
4247                s->qscale,
4248                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4249                h->use_weight,
4250                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4251                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4252                );
4253     }
4254
4255     return 0;
4256 }
4257
4258 /**
4259  *
4260  */
4261 static inline int get_level_prefix(GetBitContext *gb){
4262     unsigned int buf;
4263     int log;
4264
4265     OPEN_READER(re, gb);
4266     UPDATE_CACHE(re, gb);
4267     buf=GET_CACHE(re, gb);
4268
4269     log= 32 - av_log2(buf);
4270 #ifdef TRACE
4271     print_bin(buf>>(32-log), log);
4272     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4273 #endif
4274
4275     LAST_SKIP_BITS(re, gb, log);
4276     CLOSE_READER(re, gb);
4277
4278     return log-1;
4279 }
4280
4281 static inline int get_dct8x8_allowed(H264Context *h){
4282     int i;
4283     for(i=0; i<4; i++){
4284         if(!IS_SUB_8X8(h->sub_mb_type[i])
4285            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4286             return 0;
4287     }
4288     return 1;
4289 }
4290
4291 /**
4292  * decodes a residual block.
4293  * @param n block index
4294  * @param scantable scantable
4295  * @param max_coeff number of coefficients in the block
4296  * @return <0 if an error occurred
4297  */
4298 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4299     MpegEncContext * const s = &h->s;
4300     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4301     int level[16];
4302     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4303
4304     //FIXME put trailing_onex into the context
4305
4306     if(n == CHROMA_DC_BLOCK_INDEX){
4307         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4308         total_coeff= coeff_token>>2;
4309     }else{
4310         if(n == LUMA_DC_BLOCK_INDEX){
4311             total_coeff= pred_non_zero_count(h, 0);
4312             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4313             total_coeff= coeff_token>>2;
4314         }else{
4315             total_coeff= pred_non_zero_count(h, n);
4316             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4317             total_coeff= coeff_token>>2;
4318             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4319         }
4320     }
4321
4322     //FIXME set last_non_zero?
4323
4324     if(total_coeff==0)
4325         return 0;
4326     if(total_coeff > (unsigned)max_coeff) {
4327         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4328         return -1;
4329     }
4330
4331     trailing_ones= coeff_token&3;
4332     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4333     assert(total_coeff<=16);
4334
4335     for(i=0; i<trailing_ones; i++){
4336         level[i]= 1 - 2*get_bits1(gb);
4337     }
4338
4339     if(i<total_coeff) {
4340         int level_code, mask;
4341         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4342         int prefix= get_level_prefix(gb);
4343
4344         //first coefficient has suffix_length equal to 0 or 1
4345         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4346             if(suffix_length)
4347                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4348             else
4349                 level_code= (prefix<<suffix_length); //part
4350         }else if(prefix==14){
4351             if(suffix_length)
4352                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4353             else
4354                 level_code= prefix + get_bits(gb, 4); //part
4355         }else{
4356             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4357             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4358             if(prefix>=16)
4359                 level_code += (1<<(prefix-3))-4096;
4360         }
4361
4362         if(trailing_ones < 3) level_code += 2;
4363
4364         suffix_length = 1;
4365         if(level_code > 5)
4366             suffix_length++;
4367         mask= -(level_code&1);
4368         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4369         i++;
4370
4371         //remaining coefficients have suffix_length > 0
4372         for(;i<total_coeff;i++) {
4373             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4374             prefix = get_level_prefix(gb);
4375             if(prefix<15){
4376                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4377             }else{
4378                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4379                 if(prefix>=16)
4380                     level_code += (1<<(prefix-3))-4096;
4381             }
4382             mask= -(level_code&1);
4383             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4384             if(level_code > suffix_limit[suffix_length])
4385                 suffix_length++;
4386         }
4387     }
4388
4389     if(total_coeff == max_coeff)
4390         zeros_left=0;
4391     else{
4392         if(n == CHROMA_DC_BLOCK_INDEX)
4393             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4394         else
4395             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4396     }
4397
4398     coeff_num = zeros_left + total_coeff - 1;
4399     j = scantable[coeff_num];
4400     if(n > 24){
4401         block[j] = level[0];
4402         for(i=1;i<total_coeff;i++) {
4403             if(zeros_left <= 0)
4404                 run_before = 0;
4405             else if(zeros_left < 7){
4406                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4407             }else{
4408                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4409             }
4410             zeros_left -= run_before;
4411             coeff_num -= 1 + run_before;
4412             j= scantable[ coeff_num ];
4413
4414             block[j]= level[i];
4415         }
4416     }else{
4417         block[j] = (level[0] * qmul[j] + 32)>>6;
4418         for(i=1;i<total_coeff;i++) {
4419             if(zeros_left <= 0)
4420                 run_before = 0;
4421             else if(zeros_left < 7){
4422                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4423             }else{
4424                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4425             }
4426             zeros_left -= run_before;
4427             coeff_num -= 1 + run_before;
4428             j= scantable[ coeff_num ];
4429
4430             block[j]= (level[i] * qmul[j] + 32)>>6;
4431         }
4432     }
4433
4434     if(zeros_left<0){
4435         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4436         return -1;
4437     }
4438
4439     return 0;
4440 }
4441
4442 static void predict_field_decoding_flag(H264Context *h){
4443     MpegEncContext * const s = &h->s;
4444     const int mb_xy= h->mb_xy;
4445     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4446                 ? s->current_picture.mb_type[mb_xy-1]
4447                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4448                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4449                 : 0;
4450     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4451 }
4452
4453 /**
4454  * decodes a P_SKIP or B_SKIP macroblock
4455  */
4456 static void decode_mb_skip(H264Context *h){
4457     MpegEncContext * const s = &h->s;
4458     const int mb_xy= h->mb_xy;
4459     int mb_type=0;
4460
4461     memset(h->non_zero_count[mb_xy], 0, 16);
4462     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4463
4464     if(MB_FIELD)
4465         mb_type|= MB_TYPE_INTERLACED;
4466
4467     if( h->slice_type == FF_B_TYPE )
4468     {
4469         // just for fill_caches. pred_direct_motion will set the real mb_type
4470         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4471
4472         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4473         pred_direct_motion(h, &mb_type);
4474         mb_type|= MB_TYPE_SKIP;
4475     }
4476     else
4477     {
4478         int mx, my;
4479         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4480
4481         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4482         pred_pskip_motion(h, &mx, &my);
4483         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4484         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4485     }
4486
4487     write_back_motion(h, mb_type);
4488     s->current_picture.mb_type[mb_xy]= mb_type;
4489     s->current_picture.qscale_table[mb_xy]= s->qscale;
4490     h->slice_table[ mb_xy ]= h->slice_num;
4491     h->prev_mb_skipped= 1;
4492 }
4493
/**
 * decodes a macroblock
 *
 * Parses, in this order: the skip run (non intra slices), the MBAFF field
 * decoding flag, mb_type, the prediction data (intra prediction modes, or
 * reference indices and motion vector differences), the coded block pattern,
 * the 8x8 transform flag, dquant and the residual blocks. Results are stored
 * in the context caches and in the tables of the current picture.
 *
 * @returns 0 if ok, -1 if an error is noticed
 */
static int decode_mb_cavlc(H264Context *h){
    MpegEncContext * const s = &h->s;
    int mb_xy;
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

    s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)

    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                down the code */
    // skip runs are only coded in slices that are neither I nor SI
    if(h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE){
        // -1 means that no skip run is pending, so a new one is read
        if(s->mb_skip_run==-1)
            s->mb_skip_run= get_ue_golomb(&s->gb);

        if (s->mb_skip_run--) {
            // top macroblock of an MBAFF pair: the field flag is read when
            // this is the last skipped macroblock of the run, else predicted
            if(FRAME_MBAFF && (s->mb_y&1) == 0){
                if(s->mb_skip_run==0)
                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
                else
                    predict_field_decoding_flag(h);
            }
            decode_mb_skip(h);
            return 0;
        }
    }
    if(FRAME_MBAFF){
        // the flag is read for even mb_y only; odd rows keep the stored value
        if( (s->mb_y&1) == 0 )
            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
    }else
        h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);

    h->prev_mb_skipped= 0;

    // map the coded mb_type to the internal type; in B and P slices values
    // beyond the inter types (23 resp. 5) denote intra types
    mb_type= get_ue_golomb(&s->gb);
    if(h->slice_type == FF_B_TYPE){
        if(mb_type < 23){
            partition_count= b_mb_type_info[mb_type].partition_count;
            mb_type=         b_mb_type_info[mb_type].type;
        }else{
            mb_type -= 23;
            goto decode_intra_mb;
        }
    }else if(h->slice_type == FF_P_TYPE /*|| h->slice_type == FF_SP_TYPE */){
        if(mb_type < 5){
            partition_count= p_mb_type_info[mb_type].partition_count;
            mb_type=         p_mb_type_info[mb_type].type;
        }else{
            mb_type -= 5;
            goto decode_intra_mb;
        }
    }else{
       assert(h->slice_type == FF_I_TYPE);
decode_intra_mb:
        // mb_type is unsigned, so this is the only range check needed
        // before indexing i_mb_type_info
        if(mb_type > 25){
            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
            return -1;
        }
        partition_count=0;
        cbp= i_mb_type_info[mb_type].cbp;
        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
        mb_type= i_mb_type_info[mb_type].type;
    }

    if(MB_FIELD)
        mb_type |= MB_TYPE_INTERLACED;

    h->slice_table[ mb_xy ]= h->slice_num;

    // PCM macroblock: raw 8 bit samples follow, nothing else is parsed
    if(IS_INTRA_PCM(mb_type)){
        unsigned int x, y;

        // We assume these blocks are very rare so we do not optimize it.
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
        // 16x16 luma samples
        for(y=0; y<16; y++){
            const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
            for(x=0; x<16; x++){
                tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
            }
        }
        // first 8x8 chroma plane, stored from offset 256
        for(y=0; y<8; y++){
            const int index= 256 + 4*(y&3) + 32*(y>>2);
            for(x=0; x<8; x++){
                tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
            }
        }
        // second 8x8 chroma plane, stored from offset 256 + 64
        for(y=0; y<8; y++){
            const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
            for(x=0; x<8; x++){
                tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
            }
        }

        // In deblocking, the quantizer is 0
        s->current_picture.qscale_table[mb_xy]= 0;
        h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
        h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
        // All coeffs are present
        memset(h->non_zero_count[mb_xy], 16, 16);

        s->current_picture.mb_type[mb_xy]= mb_type;
        return 0;
    }

    // The reference counts are doubled while an MBAFF macroblock is parsed
    // and halved again at the end of this function.
    // NOTE(review): the "return -1" paths below leave them doubled - confirm
    // that the caller resets ref_count after an error.
    if(MB_MBAFF){
        h->ref_count[0] <<= 1;
        h->ref_count[1] <<= 1;
    }

    fill_caches(h, mb_type, 0);

    //mb_pred
    if(IS_INTRA(mb_type)){
            int pred_mode;
//            init_top_left_availability(h);
            if(IS_INTRA4x4(mb_type)){
                int i;
                // step between coded prediction modes: 1 per 4x4 block,
                // 4 when the 8x8 transform is used
                int di = 1;
                if(dct8x8_allowed && get_bits1(&s->gb)){
                    mb_type |= MB_TYPE_8x8DCT;
                    di = 4;
                }

//                fill_intra4x4_pred_table(h);
                for(i=0; i<16; i+=di){
                    int mode= pred_intra_mode(h, i);

                    // a 0 bit means the predicted mode is not used: a 3 bit
                    // remainder is coded which skips the predicted mode
                    if(!get_bits1(&s->gb)){
                        const int rem_mode= get_bits(&s->gb, 3);
                        mode = rem_mode + (rem_mode >= mode);
                    }

                    if(di==4)
                        fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
                    else
                        h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
                }
                write_back_intra_pred_mode(h);
                if( check_intra4x4_pred_mode(h) < 0)
                    return -1;
            }else{
                h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
                if(h->intra16x16_pred_mode < 0)
                    return -1;
            }

            // chroma prediction mode, coded for every intra macroblock
            pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
            if(pred_mode < 0)
                return -1;
            h->chroma_pred_mode= pred_mode;
    }else if(partition_count==4){
        // four 8x8 partitions, each with its own sub_mb_type
        int i, j, sub_partition_count[4], list, ref[2][4];

        if(h->slice_type == FF_B_TYPE){
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=13){
                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
                pred_direct_motion(h, &mb_type);
                h->ref_cache[0][scan8[4]] =
                h->ref_cache[1][scan8[4]] =
                h->ref_cache[0][scan8[12]] =
                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
            }
        }else{
            assert(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE); //FIXME SP correct ?
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=4){
                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
        }

        // reference indices of all partitions are coded before any motion vector
        for(list=0; list<h->list_count; list++){
            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
                    if(tmp>=ref_count){
                        av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
                        return -1;
                    }
                    ref[list][i]= tmp;
                }else{
                 //FIXME
                    ref[list][i] = -1;
                }
            }
        }

        if(dct8x8_allowed)
            dct8x8_allowed = get_dct8x8_allowed(h);

        // motion vectors: prediction plus the coded difference per sub partition
        for(list=0; list<h->list_count; list++){
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) {
                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
                    continue;
                }
                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];

                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    const int sub_mb_type= h->sub_mb_type[i];
                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                    for(j=0; j<sub_partition_count[i]; j++){
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        // replicate the vector over the cache entries
                        // belonging to this sub partition
                        if(IS_SUB_8X8(sub_mb_type)){
                            mv_cache[ 1 ][0]=
                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
                            mv_cache[ 1 ][1]=
                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
                        }else if(IS_SUB_8X4(sub_mb_type)){
                            mv_cache[ 1 ][0]= mx;
                            mv_cache[ 1 ][1]= my;
                        }else if(IS_SUB_4X8(sub_mb_type)){
                            mv_cache[ 8 ][0]= mx;
                            mv_cache[ 8 ][1]= my;
                        }
                        mv_cache[ 0 ][0]= mx;
                        mv_cache[ 0 ][1]= my;
                    }
                }else{
                    // list not used by this partition: zero its four vectors
                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
                    p[0] = p[1]=
                    p[8] = p[9]= 0;
                }
            }
        }
    }else if(IS_DIRECT(mb_type)){
        pred_direct_motion(h, &mb_type);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    }else{
        // 16x16, 16x8 or 8x16: for each shape all reference indices are
        // read first, then all motion vector differences
        int list, mx, my, i;
         //FIXME we should set ref_idx_l? to 0 if we use that later ...
        if(IS_16X16(mb_type)){
            for(list=0; list<h->list_count; list++){
                    unsigned int val;
                    if(IS_DIR(mb_type, 0, list)){
                        val= get_te0_golomb(&s->gb, h->ref_count[list]);
                        if(val >= h->ref_count[list]){
                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                            return -1;
                        }
                    }else
                        val= LIST_NOT_USED&0xFF;
                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
            }
            for(list=0; list<h->list_count; list++){
                unsigned int val;
                if(IS_DIR(mb_type, 0, list)){
                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
                    mx += get_se_golomb(&s->gb);
                    my += get_se_golomb(&s->gb);
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    val= pack16to32(mx,my);
                }else
                    val=0;
                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
            }
        }
        else if(IS_16X8(mb_type)){
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
                }
            }
        }else{
            assert(IS_8X16(mb_type));
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
                }
            }
        }
    }

    if(IS_INTER(mb_type))
        write_back_motion(h, mb_type);

    // for intra 16x16 the cbp was already taken from i_mb_type_info above
    if(!IS_INTRA16x16(mb_type)){
        cbp= get_ue_golomb(&s->gb);
        if(cbp > 47){
            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
            return -1;
        }

        if(IS_INTRA4x4(mb_type))
            cbp= golomb_to_intra4x4_cbp[cbp];
        else
            cbp= golomb_to_inter_cbp[cbp];
    }
    h->cbp = cbp;

    // the 8x8 transform flag of non intra macroblocks is only coded when
    // one of the low 4 cbp bits is set
    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
        if(get_bits1(&s->gb))
            mb_type |= MB_TYPE_8x8DCT;
    }
    s->current_picture.mb_type[mb_xy]= mb_type;

    if(cbp || IS_INTRA16x16(mb_type)){
        int i8x8, i4x4, chroma_idx;
        int dquant;
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;

//        fill_non_zero_count_cache(h);

        // scan tables depend on field/frame coding; the *_q0 variants are
        // used when qscale is 0
        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);

        if( dquant > 25 || dquant < -26 ){
            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
            return -1;
        }

        // apply the delta and wrap the result back into 0..51
        s->qscale += dquant;
        if(((unsigned)s->qscale) > 51){
            if(s->qscale<0) s->qscale+= 52;
            else            s->qscale-= 52;
        }

        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            // luma DC block first, then (if coded) 15 AC coefficients per 4x4 block
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

            assert((cbp&15) == 0 || (cbp&15) == 15);

            if(cbp&15){
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
                }
            }else{
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
            }
        }else{
            // one cbp bit per 8x8 luma block
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
                        // the 8x8 block is coded as four 16 coefficient parts,
                        // all written into the same 64 entry buffer
                        DCTELEM *buf = &h->mb[64*i8x8];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                return -1;
                        }
                        // sum the four counts into the first entry
                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                    }else{
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
                    }
                }else{
                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
                }
            }
        }

        // chroma DC blocks are coded when either of cbp bits 4 and 5 is set
        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }

        // chroma AC blocks are coded only when cbp bit 5 is set
        if(cbp&0x20){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
            }
        }else{
            uint8_t * const nnz= &h->non_zero_count_cache[0];
            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
        }
    }else{
        // no residual at all: clear every luma and chroma non zero count
        uint8_t * const nnz= &h->non_zero_count_cache[0];
        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
    }
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    write_back_non_zero_count(h);

    // undo the doubling of the reference counts done above
    if(MB_MBAFF){
        h->ref_count[0] >>= 1;
        h->ref_count[1] >>= 1;
    }

    return 0;
}
4993
4994 static int decode_cabac_field_decoding_flag(H264Context *h) {
4995     MpegEncContext * const s = &h->s;
4996     const int mb_x = s->mb_x;
4997     const int mb_y = s->mb_y & ~1;
4998     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4999     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5000
5001     unsigned int ctx = 0;
5002
5003     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5004         ctx += 1;
5005     }
5006     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5007         ctx += 1;
5008     }
5009
5010     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5011 }
5012
5013 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5014     uint8_t *state= &h->cabac_state[ctx_base];
5015     int mb_type;
5016
5017     if(intra_slice){
5018         MpegEncContext * const s = &h->s;
5019         const int mba_xy = h->left_mb_xy[0];
5020         const int mbb_xy = h->top_mb_xy;
5021         int ctx=0;
5022         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5023             ctx++;
5024         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5025             ctx++;
5026         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5027             return 0;   /* I4x4 */
5028         state += 2;
5029     }else{
5030         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5031             return 0;   /* I4x4 */
5032     }
5033
5034     if( get_cabac_terminate( &h->cabac ) )
5035         return 25;  /* PCM */
5036
5037     mb_type = 1; /* I16x16 */
5038     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5039     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5040         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5041     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5042     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5043     return mb_type;
5044 }
5045
5046 static int decode_cabac_mb_type( H264Context *h ) {
5047     MpegEncContext * const s = &h->s;
5048
5049     if( h->slice_type == FF_I_TYPE ) {
5050         return decode_cabac_intra_mb_type(h, 3, 1);
5051     } else if( h->slice_type == FF_P_TYPE ) {
5052         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5053             /* P-type */
5054             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5055                 /* P_L0_D16x16, P_8x8 */
5056                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5057             } else {
5058                 /* P_L0_D8x16, P_L0_D16x8 */
5059                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5060             }
5061         } else {
5062             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5063         }
5064     } else if( h->slice_type == FF_B_TYPE ) {
5065         const int mba_xy = h->left_mb_xy[0];
5066         const int mbb_xy = h->top_mb_xy;
5067         int ctx = 0;
5068         int bits;
5069
5070         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5071             ctx++;
5072         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5073             ctx++;
5074
5075         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5076             return 0; /* B_Direct_16x16 */
5077
5078         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5079             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5080         }
5081
5082         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5083         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5084         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5085         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5086         if( bits < 8 )
5087             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5088         else if( bits == 13 ) {
5089             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5090         } else if( bits == 14 )
5091             return 11; /* B_L1_L0_8x16 */
5092         else if( bits == 15 )
5093             return 22; /* B_8x8 */
5094
5095         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5096         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5097     } else {
5098         /* TODO SI/SP frames? */
5099         return -1;
5100     }
5101 }
5102
5103 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5104     MpegEncContext * const s = &h->s;
5105     int mba_xy, mbb_xy;
5106     int ctx = 0;
5107
5108     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5109         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5110         mba_xy = mb_xy - 1;
5111         if( (mb_y&1)
5112             && h->slice_table[mba_xy] == h->slice_num
5113             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5114             mba_xy += s->mb_stride;
5115         if( MB_FIELD ){
5116             mbb_xy = mb_xy - s->mb_stride;
5117             if( !(mb_y&1)
5118                 && h->slice_table[mbb_xy] == h->slice_num
5119                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5120                 mbb_xy -= s->mb_stride;
5121         }else
5122             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5123     }else{
5124         int mb_xy = h->mb_xy;
5125         mba_xy = mb_xy - 1;
5126         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5127     }
5128
5129     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5130         ctx++;
5131     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5132         ctx++;
5133
5134     if( h->slice_type == FF_B_TYPE )
5135         ctx += 13;
5136     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5137 }
5138
5139 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5140     int mode = 0;
5141
5142     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5143         return pred_mode;
5144
5145     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5146     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5147     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5148
5149     if( mode >= pred_mode )
5150         return mode + 1;
5151     else
5152         return mode;
5153 }
5154
5155 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5156     const int mba_xy = h->left_mb_xy[0];
5157     const int mbb_xy = h->top_mb_xy;
5158
5159     int ctx = 0;
5160
5161     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5162     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5163         ctx++;
5164
5165     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5166         ctx++;
5167
5168     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5169         return 0;
5170
5171     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5172         return 1;
5173     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5174         return 2;
5175     else
5176         return 3;
5177 }
5178
5179 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5180     int cbp_b, cbp_a, ctx, cbp = 0;
5181
5182     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5183     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5184
5185     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5186     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5187     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5188     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5189     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5190     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5191     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5192     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5193     return cbp;
5194 }
5195 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5196     int ctx;
5197     int cbp_a, cbp_b;
5198
5199     cbp_a = (h->left_cbp>>4)&0x03;
5200     cbp_b = (h-> top_cbp>>4)&0x03;
5201
5202     ctx = 0;
5203     if( cbp_a > 0 ) ctx++;
5204     if( cbp_b > 0 ) ctx += 2;
5205     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5206         return 0;
5207
5208     ctx = 4;
5209     if( cbp_a == 2 ) ctx++;
5210     if( cbp_b == 2 ) ctx += 2;
5211     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5212 }
5213 static int decode_cabac_mb_dqp( H264Context *h) {
5214     int   ctx = 0;
5215     int   val = 0;
5216
5217     if( h->last_qscale_diff != 0 )
5218         ctx++;
5219
5220     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5221         if( ctx < 2 )
5222             ctx = 2;
5223         else
5224             ctx = 3;
5225         val++;
5226         if(val > 102) //prevent infinite loop
5227             return INT_MIN;
5228     }
5229
5230     if( val&0x01 )
5231         return (val + 1)/2;
5232     else
5233         return -(val + 1)/2;
5234 }
5235 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5236     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5237         return 0;   /* 8x8 */
5238     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5239         return 1;   /* 8x4 */
5240     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5241         return 2;   /* 4x8 */
5242     return 3;       /* 4x4 */
5243 }
5244 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5245     int type;
5246     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5247         return 0;   /* B_Direct_8x8 */
5248     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5249         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5250     type = 3;
5251     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5252         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5253             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5254         type += 4;
5255     }
5256     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5257     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5258     return type;
5259 }
5260
5261 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5262     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5263 }
5264
5265 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5266     int refa = h->ref_cache[list][scan8[n] - 1];
5267     int refb = h->ref_cache[list][scan8[n] - 8];
5268     int ref  = 0;
5269     int ctx  = 0;
5270
5271     if( h->slice_type == FF_B_TYPE) {
5272         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5273             ctx++;
5274         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5275             ctx += 2;
5276     } else {
5277         if( refa > 0 )
5278             ctx++;
5279         if( refb > 0 )
5280             ctx += 2;
5281     }
5282
5283     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5284         ref++;
5285         if( ctx < 4 )
5286             ctx = 4;
5287         else
5288             ctx = 5;
5289         if(ref >= 32 /*h->ref_list[list]*/){
5290             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5291             return 0; //FIXME we should return -1 and check the return everywhere
5292         }
5293     }
5294     return ref;
5295 }
5296
5297 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5298     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5299                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5300     int ctxbase = (l == 0) ? 40 : 47;
5301     int ctx, mvd;
5302
5303     if( amvd < 3 )
5304         ctx = 0;
5305     else if( amvd > 32 )
5306         ctx = 2;
5307     else
5308         ctx = 1;
5309
5310     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5311         return 0;
5312
5313     mvd= 1;
5314     ctx= 3;
5315     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5316         mvd++;
5317         if( ctx < 6 )
5318             ctx++;
5319     }
5320
5321     if( mvd >= 9 ) {
5322         int k = 3;
5323         while( get_cabac_bypass( &h->cabac ) ) {
5324             mvd += 1 << k;
5325             k++;
5326             if(k>24){
5327                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5328                 return INT_MIN;
5329             }
5330         }
5331         while( k-- ) {
5332             if( get_cabac_bypass( &h->cabac ) )
5333                 mvd += 1 << k;
5334         }
5335     }
5336     return get_cabac_bypass_sign( &h->cabac, -mvd );
5337 }
5338
5339 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5340     int nza, nzb;
5341     int ctx = 0;
5342
5343     if( is_dc ) {
5344         if( cat == 0 ) {
5345             nza = h->left_cbp&0x100;
5346             nzb = h-> top_cbp&0x100;
5347         } else {
5348             nza = (h->left_cbp>>(6+idx))&0x01;
5349             nzb = (h-> top_cbp>>(6+idx))&0x01;
5350         }
5351     } else {
5352         if( cat == 4 ) {
5353             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5354             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5355         } else {
5356             assert(cat == 1 || cat == 2);
5357             nza = h->non_zero_count_cache[scan8[idx] - 1];
5358             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5359         }
5360     }
5361
5362     if( nza > 0 )
5363         ctx++;
5364
5365     if( nzb > 0 )
5366         ctx += 2;
5367
5368     return ctx + 4 * cat;
5369 }
5370
5371 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5372     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5373     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5374     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5375     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5376 };
5377
5378 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5379     static const int significant_coeff_flag_offset[2][6] = {
5380       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5381       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5382     };
5383     static const int last_coeff_flag_offset[2][6] = {
5384       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5385       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5386     };
5387     static const int coeff_abs_level_m1_offset[6] = {
5388         227+0, 227+10, 227+20, 227+30, 227+39, 426
5389     };
5390     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5391       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5392         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5393         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5394        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5395       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5396         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5397         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5398         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5399     };
5400     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5401      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5402      * map node ctx => cabac ctx for level=1 */
5403     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5404     /* map node ctx => cabac ctx for level>1 */
5405     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5406     static const uint8_t coeff_abs_level_transition[2][8] = {
5407     /* update node ctx after decoding a level=1 */
5408         { 1, 2, 3, 3, 4, 5, 6, 7 },
5409     /* update node ctx after decoding a level>1 */
5410         { 4, 4, 4, 4, 5, 6, 7, 7 }
5411     };
5412
5413     int index[64];
5414
5415     int av_unused last;
5416     int coeff_count = 0;
5417     int node_ctx = 0;
5418
5419     uint8_t *significant_coeff_ctx_base;
5420     uint8_t *last_coeff_ctx_base;
5421     uint8_t *abs_level_m1_ctx_base;
5422
5423 #ifndef ARCH_X86
5424 #define CABAC_ON_STACK
5425 #endif
5426 #ifdef CABAC_ON_STACK
5427 #define CC &cc
5428     CABACContext cc;
5429     cc.range     = h->cabac.range;
5430     cc.low       = h->cabac.low;
5431     cc.bytestream= h->cabac.bytestream;
5432 #else
5433 #define CC &h->cabac
5434 #endif
5435
5436
5437     /* cat: 0-> DC 16x16  n = 0
5438      *      1-> AC 16x16  n = luma4x4idx
5439      *      2-> Luma4x4   n = luma4x4idx
5440      *      3-> DC Chroma n = iCbCr
5441      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5442      *      5-> Luma8x8   n = 4 * luma8x8idx
5443      */
5444
5445     /* read coded block flag */
5446     if( is_dc || cat != 5 ) {
5447         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5448             if( !is_dc ) {
5449                 if( cat == 4 )
5450                     h->non_zero_count_cache[scan8[16+n]] = 0;
5451                 else
5452                     h->non_zero_count_cache[scan8[n]] = 0;
5453             }
5454
5455 #ifdef CABAC_ON_STACK
5456             h->cabac.range     = cc.range     ;
5457             h->cabac.low       = cc.low       ;
5458             h->cabac.bytestream= cc.bytestream;
5459 #endif
5460             return;
5461         }
5462     }
5463
5464     significant_coeff_ctx_base = h->cabac_state
5465         + significant_coeff_flag_offset[MB_FIELD][cat];
5466     last_coeff_ctx_base = h->cabac_state
5467         + last_coeff_flag_offset[MB_FIELD][cat];
5468     abs_level_m1_ctx_base = h->cabac_state
5469         + coeff_abs_level_m1_offset[cat];
5470
5471     if( !is_dc && cat == 5 ) {
5472 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5473         for(last= 0; last < coefs; last++) { \
5474             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5475             if( get_cabac( CC, sig_ctx )) { \
5476                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5477                 index[coeff_count++] = last; \
5478                 if( get_cabac( CC, last_ctx ) ) { \
5479                     last= max_coeff; \
5480                     break; \
5481                 } \
5482             } \
5483         }\
5484         if( last == max_coeff -1 ) {\
5485             index[coeff_count++] = last;\
5486         }
5487         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5488 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5489         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5490     } else {
5491         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5492 #else
5493         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5494     } else {
5495         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5496 #endif
5497     }
5498     assert(coeff_count > 0);
5499
5500     if( is_dc ) {
5501         if( cat == 0 )
5502             h->cbp_table[h->mb_xy] |= 0x100;
5503         else
5504             h->cbp_table[h->mb_xy] |= 0x40 << n;
5505     } else {
5506         if( cat == 5 )
5507             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5508         else if( cat == 4 )
5509             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5510         else {
5511             assert( cat == 1 || cat == 2 );
5512             h->non_zero_count_cache[scan8[n]] = coeff_count;
5513         }
5514     }
5515
5516     while( coeff_count-- ) {
5517         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5518
5519         int j= scantable[index[coeff_count]];
5520
5521         if( get_cabac( CC, ctx ) == 0 ) {
5522             node_ctx = coeff_abs_level_transition[0][node_ctx];
5523             if( is_dc ) {
5524                 block[j] = get_cabac_bypass_sign( CC, -1);
5525             }else{
5526                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5527             }
5528         } else {
5529             int coeff_abs = 2;
5530             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5531             node_ctx = coeff_abs_level_transition[1][node_ctx];
5532
5533             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5534                 coeff_abs++;
5535             }
5536
5537             if( coeff_abs >= 15 ) {
5538                 int j = 0;
5539                 while( get_cabac_bypass( CC ) ) {
5540                     j++;
5541                 }
5542
5543                 coeff_abs=1;
5544                 while( j-- ) {
5545                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5546                 }
5547                 coeff_abs+= 14;
5548             }
5549
5550             if( is_dc ) {
5551                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5552             }else{
5553                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5554             }
5555         }
5556     }
5557 #ifdef CABAC_ON_STACK
5558             h->cabac.range     = cc.range     ;
5559             h->cabac.low       = cc.low       ;
5560             h->cabac.bytestream= cc.bytestream;
5561 #endif
5562
5563 }
5564
5565 #ifndef CONFIG_SMALL
5566 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5567     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5568 }
5569
5570 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5571     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5572 }
5573 #endif
5574
5575 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5576 #ifdef CONFIG_SMALL
5577     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5578 #else
5579     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5580     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5581 #endif
5582 }
5583
5584 static inline void compute_mb_neighbors(H264Context *h)
5585 {
5586     MpegEncContext * const s = &h->s;
5587     const int mb_xy  = h->mb_xy;
5588     h->top_mb_xy     = mb_xy - s->mb_stride;
5589     h->left_mb_xy[0] = mb_xy - 1;
5590     if(FRAME_MBAFF){
5591         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5592         const int top_pair_xy      = pair_xy     - s->mb_stride;
5593         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5594         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5595         const int curr_mb_frame_flag = !MB_FIELD;
5596         const int bottom = (s->mb_y & 1);
5597         if (bottom
5598                 ? !curr_mb_frame_flag // bottom macroblock
5599                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5600                 ) {
5601             h->top_mb_xy -= s->mb_stride;
5602         }
5603         if (left_mb_frame_flag != curr_mb_frame_flag) {
5604             h->left_mb_xy[0] = pair_xy - 1;
5605         }
5606     } else if (FIELD_PICTURE) {
5607         h->top_mb_xy -= s->mb_stride;
5608     }
5609     return;
5610 }
5611
5612 /**
5613  * decodes a macroblock
5614  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5615  */
5616 static int decode_mb_cabac(H264Context *h) {
5617     MpegEncContext * const s = &h->s;
5618     int mb_xy;
5619     int mb_type, partition_count, cbp = 0;
5620     int dct8x8_allowed= h->pps.transform_8x8_mode;
5621
5622     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5623
5624     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5625
5626     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5627     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE ) {
5628         int skip;
5629         /* a skipped mb needs the aff flag from the following mb */
5630         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5631             predict_field_decoding_flag(h);
5632         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5633             skip = h->next_mb_skipped;
5634         else
5635             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5636         /* read skip flags */
5637         if( skip ) {
5638             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5639                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5640                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5641                 if(h->next_mb_skipped)
5642                     predict_field_decoding_flag(h);
5643                 else
5644                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5645             }
5646
5647             decode_mb_skip(h);
5648
5649             h->cbp_table[mb_xy] = 0;
5650             h->chroma_pred_mode_table[mb_xy] = 0;
5651             h->last_qscale_diff = 0;
5652
5653             return 0;
5654
5655         }
5656     }
5657     if(FRAME_MBAFF){
5658         if( (s->mb_y&1) == 0 )
5659             h->mb_mbaff =
5660             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5661     }else
5662         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5663
5664     h->prev_mb_skipped = 0;
5665
5666     compute_mb_neighbors(h);
5667     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5668         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5669         return -1;
5670     }
5671
5672     if( h->slice_type == FF_B_TYPE ) {
5673         if( mb_type < 23 ){
5674             partition_count= b_mb_type_info[mb_type].partition_count;
5675             mb_type=         b_mb_type_info[mb_type].type;
5676         }else{
5677             mb_type -= 23;
5678             goto decode_intra_mb;
5679         }
5680     } else if( h->slice_type == FF_P_TYPE ) {
5681         if( mb_type < 5) {
5682             partition_count= p_mb_type_info[mb_type].partition_count;
5683             mb_type=         p_mb_type_info[mb_type].type;
5684         } else {
5685             mb_type -= 5;
5686             goto decode_intra_mb;
5687         }
5688     } else {
5689        assert(h->slice_type == FF_I_TYPE);
5690 decode_intra_mb:
5691         partition_count = 0;
5692         cbp= i_mb_type_info[mb_type].cbp;
5693         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5694         mb_type= i_mb_type_info[mb_type].type;
5695     }
5696     if(MB_FIELD)
5697         mb_type |= MB_TYPE_INTERLACED;
5698
5699     h->slice_table[ mb_xy ]= h->slice_num;
5700
5701     if(IS_INTRA_PCM(mb_type)) {
5702         const uint8_t *ptr;
5703         unsigned int x, y;
5704
5705         // We assume these blocks are very rare so we do not optimize it.
5706         // FIXME The two following lines get the bitstream position in the cabac
5707         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5708         ptr= h->cabac.bytestream;
5709         if(h->cabac.low&0x1) ptr--;
5710         if(CABAC_BITS==16){
5711             if(h->cabac.low&0x1FF) ptr--;
5712         }
5713
5714         // The pixels are stored in the same order as levels in h->mb array.
5715         for(y=0; y<16; y++){
5716             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5717             for(x=0; x<16; x++){
5718                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5719                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5720             }
5721         }
5722         for(y=0; y<8; y++){
5723             const int index= 256 + 4*(y&3) + 32*(y>>2);
5724             for(x=0; x<8; x++){
5725                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5726                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5727             }
5728         }
5729         for(y=0; y<8; y++){
5730             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5731             for(x=0; x<8; x++){
5732                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5733                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5734             }
5735         }
5736
5737         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5738
5739         // All blocks are present
5740         h->cbp_table[mb_xy] = 0x1ef;
5741         h->chroma_pred_mode_table[mb_xy] = 0;
5742         // In deblocking, the quantizer is 0
5743         s->current_picture.qscale_table[mb_xy]= 0;
5744         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5745         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5746         // All coeffs are present
5747         memset(h->non_zero_count[mb_xy], 16, 16);
5748         s->current_picture.mb_type[mb_xy]= mb_type;
5749         h->last_qscale_diff = 0;
5750         return 0;
5751     }
5752
5753     if(MB_MBAFF){
5754         h->ref_count[0] <<= 1;
5755         h->ref_count[1] <<= 1;
5756     }
5757
5758     fill_caches(h, mb_type, 0);
5759
5760     if( IS_INTRA( mb_type ) ) {
5761         int i, pred_mode;
5762         if( IS_INTRA4x4( mb_type ) ) {
5763             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5764                 mb_type |= MB_TYPE_8x8DCT;
5765                 for( i = 0; i < 16; i+=4 ) {
5766                     int pred = pred_intra_mode( h, i );
5767                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5768                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5769                 }
5770             } else {
5771                 for( i = 0; i < 16; i++ ) {
5772                     int pred = pred_intra_mode( h, i );
5773                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5774
5775                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5776                 }
5777             }
5778             write_back_intra_pred_mode(h);
5779             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5780         } else {
5781             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5782             if( h->intra16x16_pred_mode < 0 ) return -1;
5783         }
5784         h->chroma_pred_mode_table[mb_xy] =
5785         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5786
5787         pred_mode= check_intra_pred_mode( h, pred_mode );
5788         if( pred_mode < 0 ) return -1;
5789         h->chroma_pred_mode= pred_mode;
5790     } else if( partition_count == 4 ) {
5791         int i, j, sub_partition_count[4], list, ref[2][4];
5792
5793         if( h->slice_type == FF_B_TYPE ) {
5794             for( i = 0; i < 4; i++ ) {
5795                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5796                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5797                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5798             }
5799             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5800                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5801                 pred_direct_motion(h, &mb_type);
5802                 h->ref_cache[0][scan8[4]] =
5803                 h->ref_cache[1][scan8[4]] =
5804                 h->ref_cache[0][scan8[12]] =
5805                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5806                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5807                     for( i = 0; i < 4; i++ )
5808                         if( IS_DIRECT(h->sub_mb_type[i]) )
5809                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5810                 }
5811             }
5812         } else {
5813             for( i = 0; i < 4; i++ ) {
5814                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5815                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5816                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5817             }
5818         }
5819
5820         for( list = 0; list < h->list_count; list++ ) {
5821                 for( i = 0; i < 4; i++ ) {
5822                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5823                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5824                         if( h->ref_count[list] > 1 )
5825                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5826                         else
5827                             ref[list][i] = 0;
5828                     } else {
5829                         ref[list][i] = -1;
5830                     }
5831                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5832                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5833                 }
5834         }
5835
5836         if(dct8x8_allowed)
5837             dct8x8_allowed = get_dct8x8_allowed(h);
5838
5839         for(list=0; list<h->list_count; list++){
5840             for(i=0; i<4; i++){
5841                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5842                 if(IS_DIRECT(h->sub_mb_type[i])){
5843                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5844                     continue;
5845                 }
5846
5847                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5848                     const int sub_mb_type= h->sub_mb_type[i];
5849                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5850                     for(j=0; j<sub_partition_count[i]; j++){
5851                         int mpx, mpy;
5852                         int mx, my;
5853                         const int index= 4*i + block_width*j;
5854                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5855                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5856                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5857
5858                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5859                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5860                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5861
5862                         if(IS_SUB_8X8(sub_mb_type)){
5863                             mv_cache[ 1 ][0]=
5864                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5865                             mv_cache[ 1 ][1]=
5866                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5867
5868                             mvd_cache[ 1 ][0]=
5869                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5870                             mvd_cache[ 1 ][1]=
5871                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5872                         }else if(IS_SUB_8X4(sub_mb_type)){
5873                             mv_cache[ 1 ][0]= mx;
5874                             mv_cache[ 1 ][1]= my;
5875
5876                             mvd_cache[ 1 ][0]= mx - mpx;
5877                             mvd_cache[ 1 ][1]= my - mpy;
5878                         }else if(IS_SUB_4X8(sub_mb_type)){
5879                             mv_cache[ 8 ][0]= mx;
5880                             mv_cache[ 8 ][1]= my;
5881
5882                             mvd_cache[ 8 ][0]= mx - mpx;
5883                             mvd_cache[ 8 ][1]= my - mpy;
5884                         }
5885                         mv_cache[ 0 ][0]= mx;
5886                         mv_cache[ 0 ][1]= my;
5887
5888                         mvd_cache[ 0 ][0]= mx - mpx;
5889                         mvd_cache[ 0 ][1]= my - mpy;
5890                     }
5891                 }else{
5892                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5893                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5894                     p[0] = p[1] = p[8] = p[9] = 0;
5895                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5896                 }
5897             }
5898         }
5899     } else if( IS_DIRECT(mb_type) ) {
5900         pred_direct_motion(h, &mb_type);
5901         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5902         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5903         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5904     } else {
5905         int list, mx, my, i, mpx, mpy;
5906         if(IS_16X16(mb_type)){
5907             for(list=0; list<h->list_count; list++){
5908                 if(IS_DIR(mb_type, 0, list)){
5909                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5910                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5911                 }else
5912                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5913             }
5914             for(list=0; list<h->list_count; list++){
5915                 if(IS_DIR(mb_type, 0, list)){
5916                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5917
5918                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5919                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5920                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5921
5922                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5923                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5924                 }else
5925                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5926             }
5927         }
5928         else if(IS_16X8(mb_type)){
5929             for(list=0; list<h->list_count; list++){
5930                     for(i=0; i<2; i++){
5931                         if(IS_DIR(mb_type, i, list)){
5932                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5933                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5934                         }else
5935                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5936                     }
5937             }
5938             for(list=0; list<h->list_count; list++){
5939                 for(i=0; i<2; i++){
5940                     if(IS_DIR(mb_type, i, list)){
5941                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5942                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5943                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5944                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5945
5946                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5947                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5948                     }else{
5949                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5950                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5951                     }
5952                 }
5953             }
5954         }else{
5955             assert(IS_8X16(mb_type));
5956             for(list=0; list<h->list_count; list++){
5957                     for(i=0; i<2; i++){
5958                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5959                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5960                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5961                         }else
5962                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5963                     }
5964             }
5965             for(list=0; list<h->list_count; list++){
5966                 for(i=0; i<2; i++){
5967                     if(IS_DIR(mb_type, i, list)){
5968                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5969                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5970                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5971
5972                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5973                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5974                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5975                     }else{
5976                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5977                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5978                     }
5979                 }
5980             }
5981         }
5982     }
5983
5984    if( IS_INTER( mb_type ) ) {
5985         h->chroma_pred_mode_table[mb_xy] = 0;
5986         write_back_motion( h, mb_type );
5987    }
5988
5989     if( !IS_INTRA16x16( mb_type ) ) {
5990         cbp  = decode_cabac_mb_cbp_luma( h );
5991         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5992     }
5993
5994     h->cbp_table[mb_xy] = h->cbp = cbp;
5995
5996     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5997         if( decode_cabac_mb_transform_size( h ) )
5998             mb_type |= MB_TYPE_8x8DCT;
5999     }
6000     s->current_picture.mb_type[mb_xy]= mb_type;
6001
6002     if( cbp || IS_INTRA16x16( mb_type ) ) {
6003         const uint8_t *scan, *scan8x8, *dc_scan;
6004         const uint32_t *qmul;
6005         int dqp;
6006
6007         if(IS_INTERLACED(mb_type)){
6008             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6009             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6010             dc_scan= luma_dc_field_scan;
6011         }else{
6012             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6013             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6014             dc_scan= luma_dc_zigzag_scan;
6015         }
6016
6017         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6018         if( dqp == INT_MIN ){
6019             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6020             return -1;
6021         }
6022         s->qscale += dqp;
6023         if(((unsigned)s->qscale) > 51){
6024             if(s->qscale<0) s->qscale+= 52;
6025             else            s->qscale-= 52;
6026         }
6027         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6028         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6029
6030         if( IS_INTRA16x16( mb_type ) ) {
6031             int i;
6032             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6033             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6034
6035             if( cbp&15 ) {
6036                 qmul = h->dequant4_coeff[0][s->qscale];
6037                 for( i = 0; i < 16; i++ ) {
6038                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6039                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6040                 }
6041             } else {
6042                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6043             }
6044         } else {
6045             int i8x8, i4x4;
6046             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6047                 if( cbp & (1<<i8x8) ) {
6048                     if( IS_8x8DCT(mb_type) ) {
6049                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6050                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6051                     } else {
6052                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6053                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6054                             const int index = 4*i8x8 + i4x4;
6055                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6056 //START_TIMER
6057                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6058 //STOP_TIMER("decode_residual")
6059                         }
6060                     }
6061                 } else {
6062                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6063                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6064                 }
6065             }
6066         }
6067
6068         if( cbp&0x30 ){
6069             int c;
6070             for( c = 0; c < 2; c++ ) {
6071                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6072                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6073             }
6074         }
6075
6076         if( cbp&0x20 ) {
6077             int c, i;
6078             for( c = 0; c < 2; c++ ) {
6079                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6080                 for( i = 0; i < 4; i++ ) {
6081                     const int index = 16 + 4 * c + i;
6082                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6083                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6084                 }
6085             }
6086         } else {
6087             uint8_t * const nnz= &h->non_zero_count_cache[0];
6088             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6089             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6090         }
6091     } else {
6092         uint8_t * const nnz= &h->non_zero_count_cache[0];
6093         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6094         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6095         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6096         h->last_qscale_diff = 0;
6097     }
6098
6099     s->current_picture.qscale_table[mb_xy]= s->qscale;
6100     write_back_non_zero_count(h);
6101
6102     if(MB_MBAFF){
6103         h->ref_count[0] >>= 1;
6104         h->ref_count[1] >>= 1;
6105     }
6106
6107     return 0;
6108 }
6109
6110
6111 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6112     int i, d;
6113     const int index_a = qp + h->slice_alpha_c0_offset;
6114     const int alpha = (alpha_table+52)[index_a];
6115     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6116
6117     if( bS[0] < 4 ) {
6118         int8_t tc[4];
6119         for(i=0; i<4; i++)
6120             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6121         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6122     } else {
6123         /* 16px edge length, because bS=4 is triggered by being at
6124          * the edge of an intra MB, so all 4 bS are the same */
6125             for( d = 0; d < 16; d++ ) {
6126                 const int p0 = pix[-1];
6127                 const int p1 = pix[-2];
6128                 const int p2 = pix[-3];
6129
6130                 const int q0 = pix[0];
6131                 const int q1 = pix[1];
6132                 const int q2 = pix[2];
6133
6134                 if( FFABS( p0 - q0 ) < alpha &&
6135                     FFABS( p1 - p0 ) < beta &&
6136                     FFABS( q1 - q0 ) < beta ) {
6137
6138                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6139                         if( FFABS( p2 - p0 ) < beta)
6140                         {
6141                             const int p3 = pix[-4];
6142                             /* p0', p1', p2' */
6143                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6144                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6145                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6146                         } else {
6147                             /* p0' */
6148                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6149                         }
6150                         if( FFABS( q2 - q0 ) < beta)
6151                         {
6152                             const int q3 = pix[3];
6153                             /* q0', q1', q2' */
6154                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6155                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6156                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6157                         } else {
6158                             /* q0' */
6159                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6160                         }
6161                     }else{
6162                         /* p0', q0' */
6163                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6164                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6165                     }
6166                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6167                 }
6168                 pix += stride;
6169             }
6170     }
6171 }
6172 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6173     int i;
6174     const int index_a = qp + h->slice_alpha_c0_offset;
6175     const int alpha = (alpha_table+52)[index_a];
6176     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6177
6178     if( bS[0] < 4 ) {
6179         int8_t tc[4];
6180         for(i=0; i<4; i++)
6181             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6182         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6183     } else {
6184         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6185     }
6186 }
6187
6188 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6189     int i;
6190     for( i = 0; i < 16; i++, pix += stride) {
6191         int index_a;
6192         int alpha;
6193         int beta;
6194
6195         int qp_index;
6196         int bS_index = (i >> 1);
6197         if (!MB_FIELD) {
6198             bS_index &= ~1;
6199             bS_index |= (i & 1);
6200         }
6201
6202         if( bS[bS_index] == 0 ) {
6203             continue;
6204         }
6205
6206         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6207         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6208         alpha = (alpha_table+52)[index_a];
6209         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6210
6211         if( bS[bS_index] < 4 ) {
6212             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6213             const int p0 = pix[-1];
6214             const int p1 = pix[-2];
6215             const int p2 = pix[-3];
6216             const int q0 = pix[0];
6217             const int q1 = pix[1];
6218             const int q2 = pix[2];
6219
6220             if( FFABS( p0 - q0 ) < alpha &&
6221                 FFABS( p1 - p0 ) < beta &&
6222                 FFABS( q1 - q0 ) < beta ) {
6223                 int tc = tc0;
6224                 int i_delta;
6225
6226                 if( FFABS( p2 - p0 ) < beta ) {
6227                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6228                     tc++;
6229                 }
6230                 if( FFABS( q2 - q0 ) < beta ) {
6231                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6232                     tc++;
6233                 }
6234
6235                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6236                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6237                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6238                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6239             }
6240         }else{
6241             const int p0 = pix[-1];
6242             const int p1 = pix[-2];
6243             const int p2 = pix[-3];
6244
6245             const int q0 = pix[0];
6246             const int q1 = pix[1];
6247             const int q2 = pix[2];
6248
6249             if( FFABS( p0 - q0 ) < alpha &&
6250                 FFABS( p1 - p0 ) < beta &&
6251                 FFABS( q1 - q0 ) < beta ) {
6252
6253                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6254                     if( FFABS( p2 - p0 ) < beta)
6255                     {
6256                         const int p3 = pix[-4];
6257                         /* p0', p1', p2' */
6258                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6259                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6260                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6261                     } else {
6262                         /* p0' */
6263                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6264                     }
6265                     if( FFABS( q2 - q0 ) < beta)
6266                     {
6267                         const int q3 = pix[3];
6268                         /* q0', q1', q2' */
6269                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6270                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6271                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6272                     } else {
6273                         /* q0' */
6274                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6275                     }
6276                 }else{
6277                     /* p0', q0' */
6278                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6279                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6280                 }
6281                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6282             }
6283         }
6284     }
6285 }
6286 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6287     int i;
6288     for( i = 0; i < 8; i++, pix += stride) {
6289         int index_a;
6290         int alpha;
6291         int beta;
6292
6293         int qp_index;
6294         int bS_index = i;
6295
6296         if( bS[bS_index] == 0 ) {
6297             continue;
6298         }
6299
6300         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6301         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6302         alpha = (alpha_table+52)[index_a];
6303         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6304
6305         if( bS[bS_index] < 4 ) {
6306             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6307             const int p0 = pix[-1];
6308             const int p1 = pix[-2];
6309             const int q0 = pix[0];
6310             const int q1 = pix[1];
6311
6312             if( FFABS( p0 - q0 ) < alpha &&
6313                 FFABS( p1 - p0 ) < beta &&
6314                 FFABS( q1 - q0 ) < beta ) {
6315                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6316
6317                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6318                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6319                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6320             }
6321         }else{
6322             const int p0 = pix[-1];
6323             const int p1 = pix[-2];
6324             const int q0 = pix[0];
6325             const int q1 = pix[1];
6326
6327             if( FFABS( p0 - q0 ) < alpha &&
6328                 FFABS( p1 - p0 ) < beta &&
6329                 FFABS( q1 - q0 ) < beta ) {
6330
6331                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6332                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6333                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6334             }
6335         }
6336     }
6337 }
6338
6339 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6340     int i, d;
6341     const int index_a = qp + h->slice_alpha_c0_offset;
6342     const int alpha = (alpha_table+52)[index_a];
6343     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6344     const int pix_next  = stride;
6345
6346     if( bS[0] < 4 ) {
6347         int8_t tc[4];
6348         for(i=0; i<4; i++)
6349             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6350         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6351     } else {
6352         /* 16px edge length, see filter_mb_edgev */
6353             for( d = 0; d < 16; d++ ) {
6354                 const int p0 = pix[-1*pix_next];
6355                 const int p1 = pix[-2*pix_next];
6356                 const int p2 = pix[-3*pix_next];
6357                 const int q0 = pix[0];
6358                 const int q1 = pix[1*pix_next];
6359                 const int q2 = pix[2*pix_next];
6360
6361                 if( FFABS( p0 - q0 ) < alpha &&
6362                     FFABS( p1 - p0 ) < beta &&
6363                     FFABS( q1 - q0 ) < beta ) {
6364
6365                     const int p3 = pix[-4*pix_next];
6366                     const int q3 = pix[ 3*pix_next];
6367
6368                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6369                         if( FFABS( p2 - p0 ) < beta) {
6370                             /* p0', p1', p2' */
6371                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6372                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6373                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6374                         } else {
6375                             /* p0' */
6376                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6377                         }
6378                         if( FFABS( q2 - q0 ) < beta) {
6379                             /* q0', q1', q2' */
6380                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6381                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6382                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6383                         } else {
6384                             /* q0' */
6385                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6386                         }
6387                     }else{
6388                         /* p0', q0' */
6389                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6390                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6391                     }
6392                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6393                 }
6394                 pix++;
6395             }
6396     }
6397 }
6398
6399 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6400     int i;
6401     const int index_a = qp + h->slice_alpha_c0_offset;
6402     const int alpha = (alpha_table+52)[index_a];
6403     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6404
6405     if( bS[0] < 4 ) {
6406         int8_t tc[4];
6407         for(i=0; i<4; i++)
6408             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6409         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6410     } else {
6411         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6412     }
6413 }
6414
6415 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6416     MpegEncContext * const s = &h->s;
6417     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6418     int mb_xy, mb_type;
6419     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6420
6421     mb_xy = h->mb_xy;
6422
6423     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6424        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6425                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6426         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6427         return;
6428     }
6429     assert(!FRAME_MBAFF);
6430
6431     mb_type = s->current_picture.mb_type[mb_xy];
6432     qp = s->current_picture.qscale_table[mb_xy];
6433     qp0 = s->current_picture.qscale_table[mb_xy-1];
6434     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6435     qpc = get_chroma_qp( h, 0, qp );
6436     qpc0 = get_chroma_qp( h, 0, qp0 );
6437     qpc1 = get_chroma_qp( h, 0, qp1 );
6438     qp0 = (qp + qp0 + 1) >> 1;
6439     qp1 = (qp + qp1 + 1) >> 1;
6440     qpc0 = (qpc + qpc0 + 1) >> 1;
6441     qpc1 = (qpc + qpc1 + 1) >> 1;
6442     qp_thresh = 15 - h->slice_alpha_c0_offset;
6443     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6444        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6445         return;
6446
6447     if( IS_INTRA(mb_type) ) {
6448         int16_t bS4[4] = {4,4,4,4};
6449         int16_t bS3[4] = {3,3,3,3};
6450         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6451         if( IS_8x8DCT(mb_type) ) {
6452             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6453             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6454             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6455             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6456         } else {
6457             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6458             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6459             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6460             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6461             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6462             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6463             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6464             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6465         }
6466         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6467         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6468         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6469         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6470         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6471         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6472         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6473         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6474         return;
6475     } else {
6476         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6477         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6478         int edges;
6479         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6480             edges = 4;
6481             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6482         } else {
6483             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6484                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6485             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6486                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6487                              ? 3 : 0;
6488             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6489             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6490             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6491                                               (h->slice_type == FF_B_TYPE), edges, step, mask_edge0, mask_edge1 );
6492         }
6493         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6494             bSv[0][0] = 0x0004000400040004ULL;
6495         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6496             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6497
6498 #define FILTER(hv,dir,edge)\
6499         if(bSv[dir][edge]) {\
6500             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6501             if(!(edge&1)) {\
6502                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6503                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6504             }\
6505         }
6506         if( edges == 1 ) {
6507             FILTER(v,0,0);
6508             FILTER(h,1,0);
6509         } else if( IS_8x8DCT(mb_type) ) {
6510             FILTER(v,0,0);
6511             FILTER(v,0,2);
6512             FILTER(h,1,0);
6513             FILTER(h,1,2);
6514         } else {
6515             FILTER(v,0,0);
6516             FILTER(v,0,1);
6517             FILTER(v,0,2);
6518             FILTER(v,0,3);
6519             FILTER(h,1,0);
6520             FILTER(h,1,1);
6521             FILTER(h,1,2);
6522             FILTER(h,1,3);
6523         }
6524 #undef FILTER
6525     }
6526 }
6527
6528 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6529     MpegEncContext * const s = &h->s;
6530     const int mb_xy= mb_x + mb_y*s->mb_stride;
6531     const int mb_type = s->current_picture.mb_type[mb_xy];
6532     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6533     int first_vertical_edge_done = 0;
6534     int dir;
6535     /* FIXME: A given frame may occupy more than one position in
6536      * the reference list. So ref2frm should be populated with
6537      * frame numbers, not indexes. */
6538     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6539                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6540
6541     //for sufficiently low qp, filtering wouldn't do anything
6542     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6543     if(!FRAME_MBAFF){
6544         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6545         int qp = s->current_picture.qscale_table[mb_xy];
6546         if(qp <= qp_thresh
6547            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6548            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6549             return;
6550         }
6551     }
6552
6553     if (FRAME_MBAFF
6554             // left mb is in picture
6555             && h->slice_table[mb_xy-1] != 255
6556             // and current and left pair do not have the same interlaced type
6557             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6558             // and left mb is in the same slice if deblocking_filter == 2
6559             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6560         /* First vertical edge is different in MBAFF frames
6561          * There are 8 different bS to compute and 2 different Qp
6562          */
6563         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6564         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6565         int16_t bS[8];
6566         int qp[2];
6567         int bqp[2];
6568         int rqp[2];
6569         int mb_qp, mbn0_qp, mbn1_qp;
6570         int i;
6571         first_vertical_edge_done = 1;
6572
6573         if( IS_INTRA(mb_type) )
6574             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6575         else {
6576             for( i = 0; i < 8; i++ ) {
6577                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6578
6579                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6580                     bS[i] = 4;
6581                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6582                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6583                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6584                     bS[i] = 2;
6585                 else
6586                     bS[i] = 1;
6587             }
6588         }
6589
6590         mb_qp = s->current_picture.qscale_table[mb_xy];
6591         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6592         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6593         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6594         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6595                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6596         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6597                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6598         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6599         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6600                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6601         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6602                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6603
6604         /* Filter edge */
6605         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6606         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6607         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6608         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6609         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6610     }
6611     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6612     for( dir = 0; dir < 2; dir++ )
6613     {
6614         int edge;
6615         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6616         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6617         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6618
6619         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6620                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6621         // how often to recheck mv-based bS when iterating between edges
6622         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6623                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6624         // how often to recheck mv-based bS when iterating along each edge
6625         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6626
6627         if (first_vertical_edge_done) {
6628             start = 1;
6629             first_vertical_edge_done = 0;
6630         }
6631
6632         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6633             start = 1;
6634
6635         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6636             && !IS_INTERLACED(mb_type)
6637             && IS_INTERLACED(mbm_type)
6638             ) {
6639             // This is a special case in the norm where the filtering must
6640             // be done twice (one each of the field) even if we are in a
6641             // frame macroblock.
6642             //
6643             static const int nnz_idx[4] = {4,5,6,3};
6644             unsigned int tmp_linesize   = 2 *   linesize;
6645             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6646             int mbn_xy = mb_xy - 2 * s->mb_stride;
6647             int qp;
6648             int i, j;
6649             int16_t bS[4];
6650
6651             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6652                 if( IS_INTRA(mb_type) ||
6653                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6654                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6655                 } else {
6656                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6657                     for( i = 0; i < 4; i++ ) {
6658                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6659                             mbn_nnz[nnz_idx[i]] != 0 )
6660                             bS[i] = 2;
6661                         else
6662                             bS[i] = 1;
6663                     }
6664                 }
6665                 // Do not use s->qscale as luma quantizer because it has not the same
6666                 // value in IPCM macroblocks.
6667                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6668                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6669                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6670                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6671                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6672                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6673                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6674                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6675             }
6676
6677             start = 1;
6678         }
6679
6680         /* Calculate bS */
6681         for( edge = start; edge < edges; edge++ ) {
6682             /* mbn_xy: neighbor macroblock */
6683             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6684             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6685             int16_t bS[4];
6686             int qp;
6687
6688             if( (edge&1) && IS_8x8DCT(mb_type) )
6689                 continue;
6690
6691             if( IS_INTRA(mb_type) ||
6692                 IS_INTRA(mbn_type) ) {
6693                 int value;
6694                 if (edge == 0) {
6695                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6696                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6697                     ) {
6698                         value = 4;
6699                     } else {
6700                         value = 3;
6701                     }
6702                 } else {
6703                     value = 3;
6704                 }
6705                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6706             } else {
6707                 int i, l;
6708                 int mv_done;
6709
6710                 if( edge & mask_edge ) {
6711                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6712                     mv_done = 1;
6713                 }
6714                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6715                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6716                     mv_done = 1;
6717                 }
6718                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6719                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6720                     int bn_idx= b_idx - (dir ? 8:1);
6721                     int v = 0;
6722                     for( l = 0; !v && l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6723                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6724                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6725                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6726                     }
6727                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6728                     mv_done = 1;
6729                 }
6730                 else
6731                     mv_done = 0;
6732
6733                 for( i = 0; i < 4; i++ ) {
6734                     int x = dir == 0 ? edge : i;
6735                     int y = dir == 0 ? i    : edge;
6736                     int b_idx= 8 + 4 + x + 8*y;
6737                     int bn_idx= b_idx - (dir ? 8:1);
6738
6739                     if( h->non_zero_count_cache[b_idx] != 0 ||
6740                         h->non_zero_count_cache[bn_idx] != 0 ) {
6741                         bS[i] = 2;
6742                     }
6743                     else if(!mv_done)
6744                     {
6745                         bS[i] = 0;
6746                         for( l = 0; l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6747                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6748                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6749                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6750                                 bS[i] = 1;
6751                                 break;
6752                             }
6753                         }
6754                     }
6755                 }
6756
6757                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6758                     continue;
6759             }
6760
6761             /* Filter edge */
6762             // Do not use s->qscale as luma quantizer because it has not the same
6763             // value in IPCM macroblocks.
6764             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6765             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6766             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6767             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6768             if( dir == 0 ) {
6769                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6770                 if( (edge&1) == 0 ) {
6771                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6772                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6773                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6774                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6775                 }
6776             } else {
6777                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6778                 if( (edge&1) == 0 ) {
6779                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6780                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6781                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6782                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6783                 }
6784             }
6785         }
6786     }
6787 }
6788
6789 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6790     MpegEncContext * const s = &h->s;
6791     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6792
6793     s->mb_skip_run= -1;
6794
6795     if( h->pps.cabac ) {
6796         int i;
6797
6798         /* realign */
6799         align_get_bits( &s->gb );
6800
6801         /* init cabac */
6802         ff_init_cabac_states( &h->cabac);
6803         ff_init_cabac_decoder( &h->cabac,
6804                                s->gb.buffer + get_bits_count(&s->gb)/8,
6805                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6806         /* calculate pre-state */
6807         for( i= 0; i < 460; i++ ) {
6808             int pre;
6809             if( h->slice_type == FF_I_TYPE )
6810                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6811             else
6812                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6813
6814             if( pre <= 63 )
6815                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6816             else
6817                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6818         }
6819
6820         for(;;){
6821 //START_TIMER
6822             int ret = decode_mb_cabac(h);
6823             int eos;
6824 //STOP_TIMER("decode_mb_cabac")
6825
6826             if(ret>=0) hl_decode_mb(h);
6827
6828             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6829                 s->mb_y++;
6830
6831                 if(ret>=0) ret = decode_mb_cabac(h);
6832
6833                 if(ret>=0) hl_decode_mb(h);
6834                 s->mb_y--;
6835             }
6836             eos = get_cabac_terminate( &h->cabac );
6837
6838             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6839                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6840                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6841                 return -1;
6842             }
6843
6844             if( ++s->mb_x >= s->mb_width ) {
6845                 s->mb_x = 0;
6846                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6847                 ++s->mb_y;
6848                 if(FIELD_OR_MBAFF_PICTURE) {
6849                     ++s->mb_y;
6850                 }
6851             }
6852
6853             if( eos || s->mb_y >= s->mb_height ) {
6854                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6855                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6856                 return 0;
6857             }
6858         }
6859
6860     } else {
6861         for(;;){
6862             int ret = decode_mb_cavlc(h);
6863
6864             if(ret>=0) hl_decode_mb(h);
6865
6866             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6867                 s->mb_y++;
6868                 ret = decode_mb_cavlc(h);
6869
6870                 if(ret>=0) hl_decode_mb(h);
6871                 s->mb_y--;
6872             }
6873
6874             if(ret<0){
6875                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6876                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6877
6878                 return -1;
6879             }
6880
6881             if(++s->mb_x >= s->mb_width){
6882                 s->mb_x=0;
6883                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6884                 ++s->mb_y;
6885                 if(FIELD_OR_MBAFF_PICTURE) {
6886                     ++s->mb_y;
6887                 }
6888                 if(s->mb_y >= s->mb_height){
6889                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6890
6891                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6892                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6893
6894                         return 0;
6895                     }else{
6896                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6897
6898                         return -1;
6899                     }
6900                 }
6901             }
6902
6903             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6904                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6905                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6906                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6907
6908                     return 0;
6909                 }else{
6910                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6911
6912                     return -1;
6913                 }
6914             }
6915         }
6916     }
6917
6918 #if 0
6919     for(;s->mb_y < s->mb_height; s->mb_y++){
6920         for(;s->mb_x < s->mb_width; s->mb_x++){
6921             int ret= decode_mb(h);
6922
6923             hl_decode_mb(h);
6924
6925             if(ret<0){
6926                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6927                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6928
6929                 return -1;
6930             }
6931
6932             if(++s->mb_x >= s->mb_width){
6933                 s->mb_x=0;
6934                 if(++s->mb_y >= s->mb_height){
6935                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6936                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6937
6938                         return 0;
6939                     }else{
6940                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6941
6942                         return -1;
6943                     }
6944                 }
6945             }
6946
6947             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6948                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6949                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6950
6951                     return 0;
6952                 }else{
6953                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6954
6955                     return -1;
6956                 }
6957             }
6958         }
6959         s->mb_x=0;
6960         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6961     }
6962 #endif
6963     return -1; //not reached
6964 }
6965
6966 static int decode_unregistered_user_data(H264Context *h, int size){
6967     MpegEncContext * const s = &h->s;
6968     uint8_t user_data[16+256];
6969     int e, build, i;
6970
6971     if(size<16)
6972         return -1;
6973
6974     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6975         user_data[i]= get_bits(&s->gb, 8);
6976     }
6977
6978     user_data[i]= 0;
6979     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6980     if(e==1 && build>=0)
6981         h->x264_build= build;
6982
6983     if(s->avctx->debug & FF_DEBUG_BUGS)
6984         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6985
6986     for(; i<size; i++)
6987         skip_bits(&s->gb, 8);
6988
6989     return 0;
6990 }
6991
6992 static int decode_sei(H264Context *h){
6993     MpegEncContext * const s = &h->s;
6994
6995     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6996         int size, type;
6997
6998         type=0;
6999         do{
7000             type+= show_bits(&s->gb, 8);
7001         }while(get_bits(&s->gb, 8) == 255);
7002
7003         size=0;
7004         do{
7005             size+= show_bits(&s->gb, 8);
7006         }while(get_bits(&s->gb, 8) == 255);
7007
7008         switch(type){
7009         case 5:
7010             if(decode_unregistered_user_data(h, size) < 0)
7011                 return -1;
7012             break;
7013         default:
7014             skip_bits(&s->gb, 8*size);
7015         }
7016
7017         //FIXME check bits here
7018         align_get_bits(&s->gb);
7019     }
7020
7021     return 0;
7022 }
7023
7024 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7025     MpegEncContext * const s = &h->s;
7026     int cpb_count, i;
7027     cpb_count = get_ue_golomb(&s->gb) + 1;
7028     get_bits(&s->gb, 4); /* bit_rate_scale */
7029     get_bits(&s->gb, 4); /* cpb_size_scale */
7030     for(i=0; i<cpb_count; i++){
7031         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7032         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7033         get_bits1(&s->gb);     /* cbr_flag */
7034     }
7035     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7036     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7037     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7038     get_bits(&s->gb, 5); /* time_offset_length */
7039 }
7040
7041 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7042     MpegEncContext * const s = &h->s;
7043     int aspect_ratio_info_present_flag;
7044     unsigned int aspect_ratio_idc;
7045     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7046
7047     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7048
7049     if( aspect_ratio_info_present_flag ) {
7050         aspect_ratio_idc= get_bits(&s->gb, 8);
7051         if( aspect_ratio_idc == EXTENDED_SAR ) {
7052             sps->sar.num= get_bits(&s->gb, 16);
7053             sps->sar.den= get_bits(&s->gb, 16);
7054         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7055             sps->sar=  pixel_aspect[aspect_ratio_idc];
7056         }else{
7057             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7058             return -1;
7059         }
7060     }else{
7061         sps->sar.num=
7062         sps->sar.den= 0;
7063     }
7064 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7065
7066     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7067         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7068     }
7069
7070     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7071         get_bits(&s->gb, 3);    /* video_format */
7072         get_bits1(&s->gb);      /* video_full_range_flag */
7073         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7074             get_bits(&s->gb, 8); /* colour_primaries */
7075             get_bits(&s->gb, 8); /* transfer_characteristics */
7076             get_bits(&s->gb, 8); /* matrix_coefficients */
7077         }
7078     }
7079
7080     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7081         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7082         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7083     }
7084
7085     sps->timing_info_present_flag = get_bits1(&s->gb);
7086     if(sps->timing_info_present_flag){
7087         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7088         sps->time_scale = get_bits_long(&s->gb, 32);
7089         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7090     }
7091
7092     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7093     if(nal_hrd_parameters_present_flag)
7094         decode_hrd_parameters(h, sps);
7095     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7096     if(vcl_hrd_parameters_present_flag)
7097         decode_hrd_parameters(h, sps);
7098     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7099         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7100     get_bits1(&s->gb);         /* pic_struct_present_flag */
7101
7102     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7103     if(sps->bitstream_restriction_flag){
7104         unsigned int num_reorder_frames;
7105         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7106         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7107         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7108         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7109         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7110         num_reorder_frames= get_ue_golomb(&s->gb);
7111         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7112
7113         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7114             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7115             return -1;
7116         }
7117
7118         sps->num_reorder_frames= num_reorder_frames;
7119     }
7120
7121     return 0;
7122 }
7123
7124 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7125                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7126     MpegEncContext * const s = &h->s;
7127     int i, last = 8, next = 8;
7128     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7129     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7130         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7131     else
7132     for(i=0;i<size;i++){
7133         if(next)
7134             next = (last + get_se_golomb(&s->gb)) & 0xff;
7135         if(!i && !next){ /* matrix not written, we use the preset one */
7136             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7137             break;
7138         }
7139         last = factors[scan[i]] = next ? next : last;
7140     }
7141 }
7142
7143 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7144                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7145     MpegEncContext * const s = &h->s;
7146     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7147     const uint8_t *fallback[4] = {
7148         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7149         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7150         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7151         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7152     };
7153     if(get_bits1(&s->gb)){
7154         sps->scaling_matrix_present |= is_sps;
7155         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7156         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7157         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7158         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7159         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7160         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7161         if(is_sps || pps->transform_8x8_mode){
7162             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7163             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7164         }
7165     } else if(fallback_sps) {
7166         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7167         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7168     }
7169 }
7170
7171 /**
7172  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7173  */
7174 static void *
7175 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7176                     const size_t size, const char *name)
7177 {
7178     if(id>=max) {
7179         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7180         return NULL;
7181     }
7182
7183     if(!vec[id]) {
7184         vec[id] = av_mallocz(size);
7185         if(vec[id] == NULL)
7186             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7187     }
7188     return vec[id];
7189 }
7190
7191 static inline int decode_seq_parameter_set(H264Context *h){
7192     MpegEncContext * const s = &h->s;
7193     int profile_idc, level_idc;
7194     unsigned int sps_id, tmp, mb_width, mb_height;
7195     int i;
7196     SPS *sps;
7197
7198     profile_idc= get_bits(&s->gb, 8);
7199     get_bits1(&s->gb);   //constraint_set0_flag
7200     get_bits1(&s->gb);   //constraint_set1_flag
7201     get_bits1(&s->gb);   //constraint_set2_flag
7202     get_bits1(&s->gb);   //constraint_set3_flag
7203     get_bits(&s->gb, 4); // reserved
7204     level_idc= get_bits(&s->gb, 8);
7205     sps_id= get_ue_golomb(&s->gb);
7206
7207     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7208     if(sps == NULL)
7209         return -1;
7210
7211     sps->profile_idc= profile_idc;
7212     sps->level_idc= level_idc;
7213
7214     if(sps->profile_idc >= 100){ //high profile
7215         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7216             get_bits1(&s->gb);  //residual_color_transform_flag
7217         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7218         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7219         sps->transform_bypass = get_bits1(&s->gb);
7220         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7221     }else
7222         sps->scaling_matrix_present = 0;
7223
7224     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7225     sps->poc_type= get_ue_golomb(&s->gb);
7226
7227     if(sps->poc_type == 0){ //FIXME #define
7228         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7229     } else if(sps->poc_type == 1){//FIXME #define
7230         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7231         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7232         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7233         tmp= get_ue_golomb(&s->gb);
7234
7235         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7236             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7237             return -1;
7238         }
7239         sps->poc_cycle_length= tmp;
7240
7241         for(i=0; i<sps->poc_cycle_length; i++)
7242             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7243     }else if(sps->poc_type != 2){
7244         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7245         return -1;
7246     }
7247
7248     tmp= get_ue_golomb(&s->gb);
7249     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7250         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7251         return -1;
7252     }
7253     sps->ref_frame_count= tmp;
7254     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7255     mb_width= get_ue_golomb(&s->gb) + 1;
7256     mb_height= get_ue_golomb(&s->gb) + 1;
7257     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7258        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7259         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7260         return -1;
7261     }
7262     sps->mb_width = mb_width;
7263     sps->mb_height= mb_height;
7264
7265     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7266     if(!sps->frame_mbs_only_flag)
7267         sps->mb_aff= get_bits1(&s->gb);
7268     else
7269         sps->mb_aff= 0;
7270
7271     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7272
7273 #ifndef ALLOW_INTERLACE
7274     if(sps->mb_aff)
7275         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7276 #endif
7277     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7278         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7279
7280     sps->crop= get_bits1(&s->gb);
7281     if(sps->crop){
7282         sps->crop_left  = get_ue_golomb(&s->gb);
7283         sps->crop_right = get_ue_golomb(&s->gb);
7284         sps->crop_top   = get_ue_golomb(&s->gb);
7285         sps->crop_bottom= get_ue_golomb(&s->gb);
7286         if(sps->crop_left || sps->crop_top){
7287             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7288         }
7289         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7290             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7291         }
7292     }else{
7293         sps->crop_left  =
7294         sps->crop_right =
7295         sps->crop_top   =
7296         sps->crop_bottom= 0;
7297     }
7298
7299     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7300     if( sps->vui_parameters_present_flag )
7301         decode_vui_parameters(h, sps);
7302
7303     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7304         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7305                sps_id, sps->profile_idc, sps->level_idc,
7306                sps->poc_type,
7307                sps->ref_frame_count,
7308                sps->mb_width, sps->mb_height,
7309                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7310                sps->direct_8x8_inference_flag ? "8B8" : "",
7311                sps->crop_left, sps->crop_right,
7312                sps->crop_top, sps->crop_bottom,
7313                sps->vui_parameters_present_flag ? "VUI" : ""
7314                );
7315     }
7316     return 0;
7317 }
7318
7319 static void
7320 build_qp_table(PPS *pps, int t, int index)
7321 {
7322     int i;
7323     for(i = 0; i < 255; i++)
7324         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7325 }
7326
7327 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7328     MpegEncContext * const s = &h->s;
7329     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7330     PPS *pps;
7331
7332     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7333     if(pps == NULL)
7334         return -1;
7335
7336     tmp= get_ue_golomb(&s->gb);
7337     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7338         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7339         return -1;
7340     }
7341     pps->sps_id= tmp;
7342
7343     pps->cabac= get_bits1(&s->gb);
7344     pps->pic_order_present= get_bits1(&s->gb);
7345     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7346     if(pps->slice_group_count > 1 ){
7347         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7348         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7349         switch(pps->mb_slice_group_map_type){
7350         case 0:
7351 #if 0
7352 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7353 |    run_length[ i ]                                |1  |ue(v)   |
7354 #endif
7355             break;
7356         case 2:
7357 #if 0
7358 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7359 |{                                                  |   |        |
7360 |    top_left_mb[ i ]                               |1  |ue(v)   |
7361 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7362 |   }                                               |   |        |
7363 #endif
7364             break;
7365         case 3:
7366         case 4:
7367         case 5:
7368 #if 0
7369 |   slice_group_change_direction_flag               |1  |u(1)    |
7370 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7371 #endif
7372             break;
7373         case 6:
7374 #if 0
7375 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7376 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7377 |)                                                  |   |        |
7378 |    slice_group_id[ i ]                            |1  |u(v)    |
7379 #endif
7380             break;
7381         }
7382     }
7383     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7384     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7385     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7386         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7387         pps->ref_count[0]= pps->ref_count[1]= 1;
7388         return -1;
7389     }
7390
7391     pps->weighted_pred= get_bits1(&s->gb);
7392     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7393     pps->init_qp= get_se_golomb(&s->gb) + 26;
7394     pps->init_qs= get_se_golomb(&s->gb) + 26;
7395     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7396     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7397     pps->constrained_intra_pred= get_bits1(&s->gb);
7398     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7399
7400     pps->transform_8x8_mode= 0;
7401     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7402     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7403     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7404
7405     if(get_bits_count(&s->gb) < bit_length){
7406         pps->transform_8x8_mode= get_bits1(&s->gb);
7407         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7408         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7409     } else {
7410         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7411     }
7412
7413     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7414     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7415         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7416         h->pps.chroma_qp_diff= 1;
7417     } else
7418         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7419
7420     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7421         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7422                pps_id, pps->sps_id,
7423                pps->cabac ? "CABAC" : "CAVLC",
7424                pps->slice_group_count,
7425                pps->ref_count[0], pps->ref_count[1],
7426                pps->weighted_pred ? "weighted" : "",
7427                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7428                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7429                pps->constrained_intra_pred ? "CONSTR" : "",
7430                pps->redundant_pic_cnt_present ? "REDU" : "",
7431                pps->transform_8x8_mode ? "8x8DCT" : ""
7432                );
7433     }
7434
7435     return 0;
7436 }
7437
7438 /**
7439  * Call decode_slice() for each context.
7440  *
7441  * @param h h264 master context
7442  * @param context_count number of contexts to execute
7443  */
7444 static void execute_decode_slices(H264Context *h, int context_count){
7445     MpegEncContext * const s = &h->s;
7446     AVCodecContext * const avctx= s->avctx;
7447     H264Context *hx;
7448     int i;
7449
7450     if(context_count == 1) {
7451         decode_slice(avctx, h);
7452     } else {
7453         for(i = 1; i < context_count; i++) {
7454             hx = h->thread_context[i];
7455             hx->s.error_resilience = avctx->error_resilience;
7456             hx->s.error_count = 0;
7457         }
7458
7459         avctx->execute(avctx, (void *)decode_slice,
7460                        (void **)h->thread_context, NULL, context_count);
7461
7462         /* pull back stuff from slices to master context */
7463         hx = h->thread_context[context_count - 1];
7464         s->mb_x = hx->s.mb_x;
7465         s->mb_y = hx->s.mb_y;
7466         s->dropable = hx->s.dropable;
7467         s->picture_structure = hx->s.picture_structure;
7468         for(i = 1; i < context_count; i++)
7469             h->s.error_count += h->thread_context[i]->s.error_count;
7470     }
7471 }
7472
7473
7474 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7475     MpegEncContext * const s = &h->s;
7476     AVCodecContext * const avctx= s->avctx;
7477     int buf_index=0;
7478     H264Context *hx; ///< thread context
7479     int context_count = 0;
7480
7481     h->max_contexts = avctx->thread_count;
7482 #if 0
7483     int i;
7484     for(i=0; i<50; i++){
7485         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7486     }
7487 #endif
7488     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7489         h->current_slice = 0;
7490         if (!s->first_field)
7491             s->current_picture_ptr= NULL;
7492     }
7493
7494     for(;;){
7495         int consumed;
7496         int dst_length;
7497         int bit_length;
7498         const uint8_t *ptr;
7499         int i, nalsize = 0;
7500         int err;
7501
7502         if(h->is_avc) {
7503             if(buf_index >= buf_size) break;
7504             nalsize = 0;
7505             for(i = 0; i < h->nal_length_size; i++)
7506                 nalsize = (nalsize << 8) | buf[buf_index++];
7507             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7508                 if(nalsize == 1){
7509                     buf_index++;
7510                     continue;
7511                 }else{
7512                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7513                     break;
7514                 }
7515             }
7516         } else {
7517             // start code prefix search
7518             for(; buf_index + 3 < buf_size; buf_index++){
7519                 // This should always succeed in the first iteration.
7520                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7521                     break;
7522             }
7523
7524             if(buf_index+3 >= buf_size) break;
7525
7526             buf_index+=3;
7527         }
7528
7529         hx = h->thread_context[context_count];
7530
7531         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7532         if (ptr==NULL || dst_length < 0){
7533             return -1;
7534         }
7535         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7536             dst_length--;
7537         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7538
7539         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7540             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7541         }
7542
7543         if (h->is_avc && (nalsize != consumed)){
7544             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7545             consumed= nalsize;
7546         }
7547
7548         buf_index += consumed;
7549
7550         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7551            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7552             continue;
7553
7554       again:
7555         err = 0;
7556         switch(hx->nal_unit_type){
7557         case NAL_IDR_SLICE:
7558             if (h->nal_unit_type != NAL_IDR_SLICE) {
7559                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7560                 return -1;
7561             }
7562             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7563         case NAL_SLICE:
7564             init_get_bits(&hx->s.gb, ptr, bit_length);
7565             hx->intra_gb_ptr=
7566             hx->inter_gb_ptr= &hx->s.gb;
7567             hx->s.data_partitioning = 0;
7568
7569             if((err = decode_slice_header(hx, h)))
7570                break;
7571
7572             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7573             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7574                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7575                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7576                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7577                && avctx->skip_frame < AVDISCARD_ALL)
7578                 context_count++;
7579             break;
7580         case NAL_DPA:
7581             init_get_bits(&hx->s.gb, ptr, bit_length);
7582             hx->intra_gb_ptr=
7583             hx->inter_gb_ptr= NULL;
7584             hx->s.data_partitioning = 1;
7585
7586             err = decode_slice_header(hx, h);
7587             break;
7588         case NAL_DPB:
7589             init_get_bits(&hx->intra_gb, ptr, bit_length);
7590             hx->intra_gb_ptr= &hx->intra_gb;
7591             break;
7592         case NAL_DPC:
7593             init_get_bits(&hx->inter_gb, ptr, bit_length);
7594             hx->inter_gb_ptr= &hx->inter_gb;
7595
7596             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7597                && s->context_initialized
7598                && s->hurry_up < 5
7599                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7600                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7601                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7602                && avctx->skip_frame < AVDISCARD_ALL)
7603                 context_count++;
7604             break;
7605         case NAL_SEI:
7606             init_get_bits(&s->gb, ptr, bit_length);
7607             decode_sei(h);
7608             break;
7609         case NAL_SPS:
7610             init_get_bits(&s->gb, ptr, bit_length);
7611             decode_seq_parameter_set(h);
7612
7613             if(s->flags& CODEC_FLAG_LOW_DELAY)
7614                 s->low_delay=1;
7615
7616             if(avctx->has_b_frames < 2)
7617                 avctx->has_b_frames= !s->low_delay;
7618             break;
7619         case NAL_PPS:
7620             init_get_bits(&s->gb, ptr, bit_length);
7621
7622             decode_picture_parameter_set(h, bit_length);
7623
7624             break;
7625         case NAL_AUD:
7626         case NAL_END_SEQUENCE:
7627         case NAL_END_STREAM:
7628         case NAL_FILLER_DATA:
7629         case NAL_SPS_EXT:
7630         case NAL_AUXILIARY_SLICE:
7631             break;
7632         default:
7633             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7634         }
7635
7636         if(context_count == h->max_contexts) {
7637             execute_decode_slices(h, context_count);
7638             context_count = 0;
7639         }
7640
7641         if (err < 0)
7642             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7643         else if(err == 1) {
7644             /* Slice could not be decoded in parallel mode, copy down
7645              * NAL unit stuff to context 0 and restart. Note that
7646              * rbsp_buffer is not transfered, but since we no longer
7647              * run in parallel mode this should not be an issue. */
7648             h->nal_unit_type = hx->nal_unit_type;
7649             h->nal_ref_idc   = hx->nal_ref_idc;
7650             hx = h;
7651             goto again;
7652         }
7653     }
7654     if(context_count)
7655         execute_decode_slices(h, context_count);
7656     return buf_index;
7657 }
7658
7659 /**
7660  * returns the number of bytes consumed for building the current frame
7661  */
7662 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7663     if(s->flags&CODEC_FLAG_TRUNCATED){
7664         pos -= s->parse_context.last_index;
7665         if(pos<0) pos=0; // FIXME remove (unneeded?)
7666
7667         return pos;
7668     }else{
7669         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7670         if(pos+10>buf_size) pos=buf_size; // oops ;)
7671
7672         return pos;
7673     }
7674 }
7675
7676 static int decode_frame(AVCodecContext *avctx,
7677                              void *data, int *data_size,
7678                              const uint8_t *buf, int buf_size)
7679 {
7680     H264Context *h = avctx->priv_data;
7681     MpegEncContext *s = &h->s;
7682     AVFrame *pict = data;
7683     int buf_index;
7684
7685     s->flags= avctx->flags;
7686     s->flags2= avctx->flags2;
7687
7688     if(s->flags&CODEC_FLAG_TRUNCATED){
7689         const int next= ff_h264_find_frame_end(h, buf, buf_size);
7690         assert((buf_size > 0) || (next == END_NOT_FOUND));
7691
7692         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7693           return buf_size;
7694 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7695     }
7696
7697    /* no supplementary picture */
7698     if (buf_size == 0) {
7699         Picture *out;
7700         int i, out_idx;
7701
7702 //FIXME factorize this with the output code below
7703         out = h->delayed_pic[0];
7704         out_idx = 0;
7705         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7706             if(h->delayed_pic[i]->poc < out->poc){
7707                 out = h->delayed_pic[i];
7708                 out_idx = i;
7709             }
7710
7711         for(i=out_idx; h->delayed_pic[i]; i++)
7712             h->delayed_pic[i] = h->delayed_pic[i+1];
7713
7714         if(out){
7715             *data_size = sizeof(AVFrame);
7716             *pict= *(AVFrame*)out;
7717         }
7718
7719         return 0;
7720     }
7721
7722     if(h->is_avc && !h->got_avcC) {
7723         int i, cnt, nalsize;
7724         unsigned char *p = avctx->extradata;
7725         if(avctx->extradata_size < 7) {
7726             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7727             return -1;
7728         }
7729         if(*p != 1) {
7730             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7731             return -1;
7732         }
7733         /* sps and pps in the avcC always have length coded with 2 bytes,
7734            so put a fake nal_length_size = 2 while parsing them */
7735         h->nal_length_size = 2;
7736         // Decode sps from avcC
7737         cnt = *(p+5) & 0x1f; // Number of sps
7738         p += 6;
7739         for (i = 0; i < cnt; i++) {
7740             nalsize = AV_RB16(p) + 2;
7741             if(decode_nal_units(h, p, nalsize) < 0) {
7742                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7743                 return -1;
7744             }
7745             p += nalsize;
7746         }
7747         // Decode pps from avcC
7748         cnt = *(p++); // Number of pps
7749         for (i = 0; i < cnt; i++) {
7750             nalsize = AV_RB16(p) + 2;
7751             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7752                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7753                 return -1;
7754             }
7755             p += nalsize;
7756         }
7757         // Now store right nal length size, that will be use to parse all other nals
7758         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7759         // Do not reparse avcC
7760         h->got_avcC = 1;
7761     }
7762
7763     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7764         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7765             return -1;
7766     }
7767
7768     buf_index=decode_nal_units(h, buf, buf_size);
7769     if(buf_index < 0)
7770         return -1;
7771
7772     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7773         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7774         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7775         return -1;
7776     }
7777
7778     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7779         Picture *out = s->current_picture_ptr;
7780         Picture *cur = s->current_picture_ptr;
7781         Picture *prev = h->delayed_output_pic;
7782         int i, pics, cross_idr, out_of_order, out_idx;
7783
7784         s->mb_y= 0;
7785
7786         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7787         s->current_picture_ptr->pict_type= s->pict_type;
7788
7789         h->prev_frame_num_offset= h->frame_num_offset;
7790         h->prev_frame_num= h->frame_num;
7791         if(!s->dropable) {
7792             h->prev_poc_msb= h->poc_msb;
7793             h->prev_poc_lsb= h->poc_lsb;
7794             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7795         }
7796
7797         /*
7798          * FIXME: Error handling code does not seem to support interlaced
7799          * when slices span multiple rows
7800          * The ff_er_add_slice calls don't work right for bottom
7801          * fields; they cause massive erroneous error concealing
7802          * Error marking covers both fields (top and bottom).
7803          * This causes a mismatched s->error_count
7804          * and a bad error table. Further, the error count goes to
7805          * INT_MAX when called for bottom field, because mb_y is
7806          * past end by one (callers fault) and resync_mb_y != 0
7807          * causes problems for the first MB line, too.
7808          */
7809         if (!FIELD_PICTURE)
7810             ff_er_frame_end(s);
7811
7812         MPV_frame_end(s);
7813
7814         if (s->first_field) {
7815             /* Wait for second field. */
7816             *data_size = 0;
7817
7818         } else {
7819             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7820             /* Derive top_field_first from field pocs. */
7821             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7822
7823         //FIXME do something with unavailable reference frames
7824
7825 #if 0 //decode order
7826             *data_size = sizeof(AVFrame);
7827 #else
7828             /* Sort B-frames into display order */
7829
7830             if(h->sps.bitstream_restriction_flag
7831                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7832                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7833                 s->low_delay = 0;
7834             }
7835
7836             pics = 0;
7837             while(h->delayed_pic[pics]) pics++;
7838
7839             assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7840
7841             h->delayed_pic[pics++] = cur;
7842             if(cur->reference == 0)
7843                 cur->reference = DELAYED_PIC_REF;
7844
7845             cross_idr = 0;
7846             for(i=0; h->delayed_pic[i]; i++)
7847                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7848                     cross_idr = 1;
7849
7850             out = h->delayed_pic[0];
7851             out_idx = 0;
7852             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7853                 if(h->delayed_pic[i]->poc < out->poc){
7854                     out = h->delayed_pic[i];
7855                     out_idx = i;
7856                 }
7857
7858             out_of_order = !cross_idr && prev && out->poc < prev->poc;
7859             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7860                 { }
7861             else if(prev && pics <= s->avctx->has_b_frames)
7862                 out = prev;
7863             else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7864                || (s->low_delay &&
7865                 ((!cross_idr && prev && out->poc > prev->poc + 2)
7866                  || cur->pict_type == FF_B_TYPE)))
7867             {
7868                 s->low_delay = 0;
7869                 s->avctx->has_b_frames++;
7870                 out = prev;
7871             }
7872             else if(out_of_order)
7873                 out = prev;
7874
7875             if(out_of_order || pics > s->avctx->has_b_frames){
7876                 for(i=out_idx; h->delayed_pic[i]; i++)
7877                     h->delayed_pic[i] = h->delayed_pic[i+1];
7878             }
7879
7880             if(prev == out)
7881                 *data_size = 0;
7882             else
7883                 *data_size = sizeof(AVFrame);
7884             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7885                 prev->reference = 0;
7886             h->delayed_output_pic = out;
7887 #endif
7888
7889             if(out)
7890                 *pict= *(AVFrame*)out;
7891             else
7892                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7893         }
7894     }
7895
7896     assert(pict->data[0] || !*data_size);
7897     ff_print_debug_info(s, pict);
7898 //printf("out %d\n", (int)pict->data[0]);
7899 #if 0 //?
7900
7901     /* Return the Picture timestamp as the frame number */
7902     /* we subtract 1 because it is added on utils.c     */
7903     avctx->frame_number = s->picture_number - 1;
7904 #endif
7905     return get_consumed_bytes(s, buf_index, buf_size);
7906 }
7907 #if 0
7908 static inline void fill_mb_avail(H264Context *h){
7909     MpegEncContext * const s = &h->s;
7910     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7911
7912     if(s->mb_y){
7913         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7914         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7915         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7916     }else{
7917         h->mb_avail[0]=
7918         h->mb_avail[1]=
7919         h->mb_avail[2]= 0;
7920     }
7921     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7922     h->mb_avail[4]= 1; //FIXME move out
7923     h->mb_avail[5]= 0; //FIXME move out
7924 }
7925 #endif
7926
7927 #ifdef TEST
7928 #undef printf
7929 #undef random
7930 #define COUNT 8000
7931 #define SIZE (COUNT*40)
7932 int main(void){
7933     int i;
7934     uint8_t temp[SIZE];
7935     PutBitContext pb;
7936     GetBitContext gb;
7937 //    int int_temp[10000];
7938     DSPContext dsp;
7939     AVCodecContext avctx;
7940
7941     dsputil_init(&dsp, &avctx);
7942
7943     init_put_bits(&pb, temp, SIZE);
7944     printf("testing unsigned exp golomb\n");
7945     for(i=0; i<COUNT; i++){
7946         START_TIMER
7947         set_ue_golomb(&pb, i);
7948         STOP_TIMER("set_ue_golomb");
7949     }
7950     flush_put_bits(&pb);
7951
7952     init_get_bits(&gb, temp, 8*SIZE);
7953     for(i=0; i<COUNT; i++){
7954         int j, s;
7955
7956         s= show_bits(&gb, 24);
7957
7958         START_TIMER
7959         j= get_ue_golomb(&gb);
7960         if(j != i){
7961             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7962 //            return -1;
7963         }
7964         STOP_TIMER("get_ue_golomb");
7965     }
7966
7967
7968     init_put_bits(&pb, temp, SIZE);
7969     printf("testing signed exp golomb\n");
7970     for(i=0; i<COUNT; i++){
7971         START_TIMER
7972         set_se_golomb(&pb, i - COUNT/2);
7973         STOP_TIMER("set_se_golomb");
7974     }
7975     flush_put_bits(&pb);
7976
7977     init_get_bits(&gb, temp, 8*SIZE);
7978     for(i=0; i<COUNT; i++){
7979         int j, s;
7980
7981         s= show_bits(&gb, 24);
7982
7983         START_TIMER
7984         j= get_se_golomb(&gb);
7985         if(j != i - COUNT/2){
7986             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7987 //            return -1;
7988         }
7989         STOP_TIMER("get_se_golomb");
7990     }
7991
7992 #if 0
7993     printf("testing 4x4 (I)DCT\n");
7994
7995     DCTELEM block[16];
7996     uint8_t src[16], ref[16];
7997     uint64_t error= 0, max_error=0;
7998
7999     for(i=0; i<COUNT; i++){
8000         int j;
8001 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8002         for(j=0; j<16; j++){
8003             ref[j]= random()%255;
8004             src[j]= random()%255;
8005         }
8006
8007         h264_diff_dct_c(block, src, ref, 4);
8008
8009         //normalize
8010         for(j=0; j<16; j++){
8011 //            printf("%d ", block[j]);
8012             block[j]= block[j]*4;
8013             if(j&1) block[j]= (block[j]*4 + 2)/5;
8014             if(j&4) block[j]= (block[j]*4 + 2)/5;
8015         }
8016 //        printf("\n");
8017
8018         s->dsp.h264_idct_add(ref, block, 4);
8019 /*        for(j=0; j<16; j++){
8020             printf("%d ", ref[j]);
8021         }
8022         printf("\n");*/
8023
8024         for(j=0; j<16; j++){
8025             int diff= FFABS(src[j] - ref[j]);
8026
8027             error+= diff*diff;
8028             max_error= FFMAX(max_error, diff);
8029         }
8030     }
8031     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8032     printf("testing quantizer\n");
8033     for(qp=0; qp<52; qp++){
8034         for(i=0; i<16; i++)
8035             src1_block[i]= src2_block[i]= random()%255;
8036
8037     }
8038     printf("Testing NAL layer\n");
8039
8040     uint8_t bitstream[COUNT];
8041     uint8_t nal[COUNT*2];
8042     H264Context h;
8043     memset(&h, 0, sizeof(H264Context));
8044
8045     for(i=0; i<COUNT; i++){
8046         int zeros= i;
8047         int nal_length;
8048         int consumed;
8049         int out_length;
8050         uint8_t *out;
8051         int j;
8052
8053         for(j=0; j<COUNT; j++){
8054             bitstream[j]= (random() % 255) + 1;
8055         }
8056
8057         for(j=0; j<zeros; j++){
8058             int pos= random() % COUNT;
8059             while(bitstream[pos] == 0){
8060                 pos++;
8061                 pos %= COUNT;
8062             }
8063             bitstream[pos]=0;
8064         }
8065
8066         START_TIMER
8067
8068         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8069         if(nal_length<0){
8070             printf("encoding failed\n");
8071             return -1;
8072         }
8073
8074         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8075
8076         STOP_TIMER("NAL")
8077
8078         if(out_length != COUNT){
8079             printf("incorrect length %d %d\n", out_length, COUNT);
8080             return -1;
8081         }
8082
8083         if(consumed != nal_length){
8084             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8085             return -1;
8086         }
8087
8088         if(memcmp(bitstream, out, COUNT)){
8089             printf("mismatch\n");
8090             return -1;
8091         }
8092     }
8093 #endif
8094
8095     printf("Testing RBSP\n");
8096
8097
8098     return 0;
8099 }
8100 #endif /* TEST */
8101
8102
8103 static av_cold int decode_end(AVCodecContext *avctx)
8104 {
8105     H264Context *h = avctx->priv_data;
8106     MpegEncContext *s = &h->s;
8107
8108     av_freep(&h->rbsp_buffer[0]);
8109     av_freep(&h->rbsp_buffer[1]);
8110     free_tables(h); //FIXME cleanup init stuff perhaps
8111     MPV_common_end(s);
8112
8113 //    memset(h, 0, sizeof(H264Context));
8114
8115     return 0;
8116 }
8117
8118
/**
 * Codec table entry for the H.264 decoder.
 * The leading members are initialized positionally, so their meaning depends
 * on the member order of AVCodec, which is declared outside this file; the
 * per-line notes below are therefore assumptions to be confirmed.
 */
8119 AVCodec h264_decoder = {
8120     "h264",                /* presumably the short codec name */
8121     CODEC_TYPE_VIDEO,      /* presumably the media type */
8122     CODEC_ID_H264,         /* presumably the codec id */
8123     sizeof(H264Context),   /* size of the private context; decode_end() reads it back as an H264Context via priv_data */
8124     decode_init,           /* presumably the init callback */
8125     NULL,                  /* NOTE(review): looks like the encode callback slot, unused for a decoder — confirm */
8126     decode_end,            /* presumably the close callback */
8127     decode_frame,          /* presumably the decode callback */
             /* capability flags; CODEC_CAP_DRAW_HORIZ_BAND is deliberately left commented out */
8128     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8129     .flush= flush_dpb,
8130     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8131 };
8132
8133 #include "svq3.c"